diff --git a/scripts/human_evaluation/README.md b/scripts/human_evaluation/README.md new file mode 100644 index 0000000..fa135e3 --- /dev/null +++ b/scripts/human_evaluation/README.md @@ -0,0 +1,198 @@ +# 🧪 Query Evaluation UI + +Lightweight Flask UI for **human evaluation of LLM-generated multi-hop queries**. + +--- + +# 🚀 Setup & Run + +### Input JSON (`task_data.json`) +Get the input file from https://ibm.box.com/s/2b9g7nq1r2h1ll866sebw5cuaf1az2xj + +From this folder: + +```bash +cd vakra/scripts/human_evaluation +python3 -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +python task3_eval_ui.py --data task_data.json --out task_annotations.json  # for Task 4, run task4_eval_ui.py instead +``` + +Open (Task 3 defaults to port 5000, Task 4 to port 5001; override with `--port`): + +``` +http://127.0.0.1:5000 +``` + +### Annotation Flow + +* Press **1–4** → score metric +* Press **0** → mark `retrieval_sufficiency` or `cross_hop_entity_consistency` as not applicable in Task 4 +* It auto-advances to next metric +* Press **N** → next example +* Press **P** → previous +* Press **T** → jump to notes + +### Output + +Saved to the path given by `--out`: + +``` +task_annotations.json +``` +The output file is rewritten every time a sample is saved. + +Contains scores + notes + timestamps. 
+ +### Task 3 Metrics + +```python +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations introduced", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would realistically understand?", + "scale": { + 1: "Completely unnatural or broken", + 2: "Awkward and clearly stitched", + 3: "Mostly natural but slightly awkward", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": "Is the merged query logically consistent, with no contradictions between its components, and aligned with the reasoning chain implied by intermediate queries?", + "scale": { + 1: "Completely inconsistent and contradictory leading it to be unanswerable", + 2: "Major inconsistency to the level of being misleading", + 3: "Minor inconsistency not affecting answerability", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? (i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? 
(i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable", + 2: "Major missing context", + 3: "Mostly sufficient", + 4: "Fully sufficient" + } + }, + "reasoning_hops": { + "title": "Reasoning Steps / Hops", + "desc": "If you had to solve this query, how many reasoning steps would be required according to you?", + "scale": { + 1: "Single hop", + 2: "Two hops", + 3: "Moderate multi-hop", + 4: "Complex multi-hop", + 5: "Very complex reasoning" + } + } +} +``` + +### Task 4 Metrics + +```python +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would understand the ask? (i.e., check grammaticality, fluency, and overall natural phrasing of the merged query)", + "scale": { + 1: "Cannot understand what is being asked at all", + 2: "Highly awkward phrasing; clearly stitched but can understand the ask from the query", + 3: "Slightly awkward phrasing but can understand the ask from the query", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": ( + "Does the merged query avoid logically incompatible conditions or contradictions? " + "The different parts of the query should be simultaneously satisfiable and should " + "form a valid reasoning chain. 
For example, a question like " + "'Name a city on Earth which lies above the equator and is in Australia?' " + "contains logically incompatible constraints." + ), + "scale": { + 1: "Completely inconsistent: logically incompatible component queries or conditions that cannot be satisfied together were merged", + 2: "Major inconsistency: the query contains impossible or directly contradictory conditions that make the question invalid or unanswerable", + 3: "Minor inconsistency: the query is mostly logically valid but contains small ambiguities, weak conflicts, or mildly confusing constraints", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? (i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? (i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable: entities missing in the merged query or insufficient context to answer the question", + 2: "Major missing context: all entities present but insufficient context significantly hindering answerability", + 3: "Mostly sufficient: minor missing context that does not significantly hinder answerability", + 4: "Fully sufficient" + } + }, + "retrieval_sufficiency": { + "title": "Retrieval Sufficiency Score", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query? 
(Mark '0' if no RAG component in query.)", + "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", + 1: "GT document have no relevant information", + 2: "GT document have some missing information", + 3: "GT document have some missing information which is common sense knowledge", + 4: "No information missing" + }, + }, + "cross_hop_entity_consistency": { + "title": "Cross-Hop Entity Consistency Score", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? (Mark '0' if no RAG component in query.)", + "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", + 1: "Not answerable", + 2: "Majorly missing context / entities cannot be answered without these entities", + 3: "Mostly sufficient have some missing information which is common sense knowledge", + 4: "Fully sufficient", + }, + } +} +``` diff --git a/scripts/human_evaluation/requirements.txt b/scripts/human_evaluation/requirements.txt new file mode 100644 index 0000000..90ce11d --- /dev/null +++ b/scripts/human_evaluation/requirements.txt @@ -0,0 +1 @@ +Flask>=3.0,<4 diff --git a/scripts/human_evaluation/task3_eval_ui.py b/scripts/human_evaluation/task3_eval_ui.py new file mode 100644 index 0000000..afe66b5 --- /dev/null +++ b/scripts/human_evaluation/task3_eval_ui.py @@ -0,0 +1,451 @@ +import json +import argparse +from pathlib import Path +from datetime import datetime + +from flask import Flask, request, redirect, render_template_string, url_for + +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations introduced", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues 
but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would realistically understand?", + "scale": { + 1: "Completely unnatural or broken", + 2: "Awkward and clearly stitched", + 3: "Mostly natural but slightly awkward", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": "Is the merged query logically consistent, with no contradictions between its components, and aligned with the reasoning chain implied by intermediate queries?", + "scale": { + 1: "Completely inconsistent and contradictory leading it to be unanswerable", + 2: "Major inconsistency to the level of being misleading", + 3: "Minor inconsistency not affecting answerability", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? (i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? (i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable", + 2: "Major missing context", + 3: "Mostly sufficient", + 4: "Fully sufficient" + } + } +} + +HTML = """ + + + + Query Evaluation + + + + + +
+ +
+

Input Data

+ +
+ UUID: {{ item.uuid }}
+ Domain: {{ item.domain }} +
+ +
+

Component Queries

+
    + {% for q in item.component_queries %} +
  • {{ q }}
  • + {% endfor %} +
+
+ +
+

Merged Query

+
{{ item.query }}
+
+ +
+

Ground Truth Tools

+
{{ item.gt_tools | tojson(indent=2) }}
+
+ +
+

Tool Responses

+
{{ item.gt_responses | tojson(indent=2) }}
+
+ +
+

Final Answer

+
{{ item.gt_answer }}
+
+
+ +
+

Evaluation

+ +
+ Item {{ idx + 1 }} / {{ total }} | Completed: {{ completed }} +
+ +
+ Hover over numbers for score explanations. Keyboard: 1-4 = score active metric, N = Save & Next, P = Save & Previous. +
+ +
+ + {% for key, metric in metrics.items() %} +
+

{{ loop.index }}. {{ metric.title }}

+
{{ metric.desc }}
+ + {% for score, explanation in metric.scale.items() %} + + {% endfor %} +
+ {% endfor %} + +
+

Notes

+ +
+ + + + + +
+
+ +
+ + + + + +""" + +def load_json(path): + """Read the UTF-8 JSON file at path and return the parsed object.""" + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def save_json(path, data): + """Write data as indented JSON to a sibling .tmp file, then move that file over path (parent directories are created first).""" + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".tmp") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + tmp.replace(path) + + +def make_app(data_path, output_path): + """Build the Flask app: load the items from data_path, load previously saved annotations from output_path if it exists, register the home, annotate and save views, and return the app.""" + app = Flask(__name__) + + data = load_json(data_path) + + if output_path.exists(): + results = load_json(output_path) + else: + results = {} + + def completed_count(): + """Count the items whose uuid already has an entry in results.""" + return sum(1 for item in data if item["uuid"] in results) + + @app.route("/") + def home(): + """Redirect to the first item.""" + return redirect(url_for("annotate", idx=0)) + + @app.route("/item/") + def annotate(idx): + """Render item idx (clamped into the valid index range) together with any annotation already stored for it.""" + idx = max(0, min(idx, len(data) - 1)) + item = data[idx] + existing = results.get(item["uuid"], {}) + + return render_template_string( + HTML, + item=item, + idx=idx, + total=len(data), + metrics=METRICS, + existing=existing, + completed=completed_count(), + ) + + @app.route("/save/", methods=["POST"]) + def save(idx): + """Store the submitted scores and notes for item idx, rewrite the output file, then redirect to the previous, next or same item according to the form's action field. NOTE(review): idx is used to index data directly here, without the clamping that annotate() applies.""" + item = data[idx] + uuid = item["uuid"] + + annotation = { + "uuid": uuid, + "domain": item.get("domain"), + "query": item.get("query"), + + "faithfulness": int(request.form["faithfulness"]), + "naturalness": int(request.form["naturalness"]), + "logical_consistency": int(request.form["logical_consistency"]), + "answer_leakage": int(request.form["answer_leakage"]), + "context_sufficiency": int(request.form["context_sufficiency"]), + + "notes": request.form.get("notes", ""), + "updated_at": datetime.utcnow().isoformat() + "Z", + } + + quality_keys = [ + "faithfulness", + "naturalness", + "logical_consistency", + "answer_leakage", + "context_sufficiency", + ] + + annotation["mean_quality_score_1_to_4"] = sum( + annotation[k] for k in quality_keys + ) / len(quality_keys) + + results[uuid] = annotation + save_json(output_path, results) + + action = request.form.get("action") + if action == "prev": + return 
redirect(url_for("annotate", idx=max(0, idx - 1))) + if action == "next": + return redirect(url_for("annotate", idx=min(len(data) - 1, idx + 1))) + return redirect(url_for("annotate", idx=idx)) + + return app + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data", required=True, help="Input JSON file") + parser.add_argument("--out", default="annotations.json", help="Output JSON file") + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", default=5000, type=int) + args = parser.parse_args() + + app = make_app(Path(args.data), Path(args.out)) + app.run(host=args.host, port=args.port, debug=True) diff --git a/scripts/human_evaluation/task4_eval_ui.py b/scripts/human_evaluation/task4_eval_ui.py new file mode 100644 index 0000000..690f1c8 --- /dev/null +++ b/scripts/human_evaluation/task4_eval_ui.py @@ -0,0 +1,526 @@ +import argparse +import json +from datetime import datetime +from pathlib import Path + +from flask import Flask, redirect, render_template_string, request, url_for + + +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would understand the ask? 
(i.e., check grammaticality, fluency, and overall natural phrasing of the merged query)", + "scale": { + 1: "Cannot understand what is being asked at all", + 2: "Highly awkward phrasing; clearly stitched but can understand the ask from the query", + 3: "Slightly awkward phrasing but can understand the ask from the query", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": ( + "Does the merged query avoid logically incompatible conditions or contradictions? " + "The different parts of the query should be simultaneously satisfiable and should " + "form a valid reasoning chain. For example, a question like " + "'Name a city on Earth which lies above the equator and is in Australia?' " + "contains logically incompatible constraints." + ), + "scale": { + 1: "Completely inconsistent: logically incompatible component queries or conditions that cannot be satisfied together were merged", + 2: "Major inconsistency: the query contains impossible or directly contradictory conditions that make the question invalid or unanswerable", + 3: "Minor inconsistency: the query is mostly logically valid but contains small ambiguities, weak conflicts, or mildly confusing constraints", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? (i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? 
(i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable: entities missing in the merged query or insufficient context to answer the question", + 2: "Major missing context: all entities present but insufficient context significantly hindering answerability", + 3: "Mostly sufficient: minor missing context that does not significantly hinder answerability", + 4: "Fully sufficient" + } + }, + # "reasoning_hops": { + # "title": "Reasoning Steps / Hops", + # "desc": "If you had to solve this query, how many reasoning steps would be required according to you?", + # "scale": { + # 1: "Single step", + # 2: "Two steps", + # 3: "Moderate multi-hop", + # 4: "Complex multi-hop", + # 5: "Very complex reasoning", + # }, + # }, + "retrieval_sufficiency": { + "title": "Retrieval Sufficiency Score", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query? (Mark '0' if no RAG component in query.)", + "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", + 1: "GT document have no relevant information", + 2: "GT document have some missing information", + 3: "GT document have some missing information which is common sense knowledge", + 4: "No information missing" + }, + }, + "cross_hop_entity_consistency": { + "title": "Cross-Hop Entity Consistency Score", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? 
(Mark '0' if no RAG component in query.)", + "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", + 1: "Not answerable", + 2: "Majorly missing context / entities cannot be answered without these entities", + 3: "Mostly sufficient have some missing information which is common sense knowledge", + 4: "Fully sufficient", + }, + }, +} + +QUALITY_KEYS = [ + "faithfulness", + "naturalness", + "logical_consistency", + "answer_leakage", + "context_sufficiency", + "retrieval_sufficiency", + "cross_hop_entity_consistency", +] + +HTML = """ + + + + Task 4 Query Evaluation + + + + + +
+ +
+

Input Data

+ +
+ UUID: {{ item.uuid }}
+ Domain: {{ item.domain }}
+ Type: {{ item.type }}
+ Question Type: {{ item.question_type }} +
+ +
+

Dialogue Context

+ {% if item.dialogue %} + {% for turn in item.dialogue %} +
+ Turn {{ turn.turn_id }} +
{{ turn.query }}
+ {% if turn.answer is defined %} + Answer +
{{ turn.answer | tojson(indent=2) }}
+ {% endif %} +
+ {% endfor %} + {% else %} +
No prior dialogue turns.
+ {% endif %} +
+ +
+

Component Queries

+ {% if item.component_queries %} +
    + {% for q in item.component_queries %} +
  • {{ q }}
  • + {% endfor %} +
+ {% else %} +
No component queries for this sample.
+ {% endif %} +
+ +
+

Final Query

+
{{ item.query }}
+
+ +
+

Ground Truth Tools

+
{{ item.gt_tools | tojson(indent=2) }}
+
+ +
+

Tool Responses / Documents

+
{{ item.gt_responses | tojson(indent=2) }}
+
+ +
+

Final Answer

+
{{ item.gt_answer }}
+
+
+ +
+

Evaluation

+ +
+ Item {{ idx + 1 }} / {{ total }} | Completed: {{ completed }} +
+ +
+ Hover over numbers for score explanations. Keyboard: 0-5 = score active metric when available, N = Save & Next, P = Save & Previous. +
+ +
+ + {% for key, metric in metrics.items() %} +
+

{{ loop.index }}. {{ metric.title }}

+
{{ metric.desc }}
+ + {% for score, explanation in metric.scale.items() %} + + {% endfor %} +
+ {% endfor %} + +
+

Notes

+ +
+ + + + + +
+
+ +
def load_json(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(path, data):
    """Write ``data`` to ``path`` as indented UTF-8 JSON.

    The payload is written to a sibling ``.tmp`` file first and then moved
    over ``path``, so ``path`` itself is never opened for writing.

    Args:
        path: ``pathlib.Path`` of the output file; missing parent
            directories are created.
        data: JSON-serialisable object to store.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    tmp.replace(path)


def mean_applicable_score(annotation, keys, not_applicable=0):
    """Average the scores stored under ``keys``, ignoring "not applicable".

    A score equal to ``not_applicable`` is a marker, not a rating, so it is
    left out of both the sum and the count. Otherwise a sample with no
    retrieval component would be pulled below the 1-4 range.

    Args:
        annotation: Mapping from metric key to integer score.
        keys: Metric keys to average over.
        not_applicable: Score value meaning "metric does not apply".

    Returns:
        The mean of the applicable scores as a float, or ``None`` when
        every listed metric is marked not applicable.
    """
    applicable = [annotation[k] for k in keys if annotation[k] != not_applicable]
    if not applicable:
        return None
    return sum(applicable) / len(applicable)


def make_app(data_path, output_path):
    """Build the Flask annotation app.

    Args:
        data_path: Path of the input JSON file; it must hold a non-empty
            list of items, each with a ``"uuid"`` key.
        output_path: ``pathlib.Path`` of the annotations JSON file. It is
            loaded if it already exists and rewritten on every save.

    Returns:
        The configured ``Flask`` application.

    Raises:
        ValueError: If the input file contains no items.
    """
    # Imported here because the module's top-level imports do not provide
    # these two names.
    from datetime import timezone

    from flask import abort

    app = Flask(__name__)

    data = load_json(data_path)
    if not data:
        # Fail at start-up rather than with an IndexError on the first request.
        raise ValueError(f"No items to annotate in {data_path}")

    results = load_json(output_path) if output_path.exists() else {}

    def completed_count():
        """Count the items whose uuid already has a saved annotation."""
        return sum(1 for item in data if item["uuid"] in results)

    @app.route("/")
    def home():
        """Redirect to the first item."""
        return redirect(url_for("annotate", idx=0))

    # The views take ``idx`` and do integer arithmetic on it, so the URL
    # rules need the ``<int:idx>`` converter.
    @app.route("/item/<int:idx>")
    def annotate(idx):
        """Render item ``idx``, clamped into range, with any saved scores."""
        idx = max(0, min(idx, len(data) - 1))
        item = data[idx]
        existing = results.get(item["uuid"], {})

        return render_template_string(
            HTML,
            item=item,
            idx=idx,
            total=len(data),
            metrics=METRICS,
            existing=existing,
            completed=completed_count(),
        )

    @app.route("/save/<int:idx>", methods=["POST"])
    def save(idx):
        """Validate and store the submitted scores, then redirect.

        Responds 404 for an unknown item index and 400 for a missing,
        non-integer or out-of-scale score.
        """
        if idx >= len(data):
            abort(404)
        item = data[idx]
        uuid = item["uuid"]

        scores = {}
        for key, metric in METRICS.items():
            try:
                score = int(request.form[key])
            except (KeyError, ValueError):
                abort(400, description=f"Missing or non-integer score for {key!r}")
            if score not in metric["scale"]:
                abort(400, description=f"Score {score} is not on the scale of {key!r}")
            scores[key] = score

        # Same "...Z" text format as before, without the deprecated utcnow().
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

        annotation = {
            "uuid": uuid,
            "domain": item.get("domain"),
            "type": item.get("type"),
            "question_type": item.get("question_type"),
            "query": item.get("query"),
            "notes": request.form.get("notes", ""),
            "updated_at": now_utc.isoformat() + "Z",
        }
        annotation.update(scores)

        # 0 means "not applicable" for the retrieval metrics; keep it out of
        # the mean so the value stays within 1-4.
        annotation["mean_quality_score_1_to_4"] = mean_applicable_score(
            annotation, QUALITY_KEYS
        )

        results[uuid] = annotation
        save_json(output_path, results)

        action = request.form.get("action")
        if action == "prev":
            return redirect(url_for("annotate", idx=max(0, idx - 1)))
        if action == "next":
            return redirect(url_for("annotate", idx=min(len(data) - 1, idx + 1)))
        return redirect(url_for("annotate", idx=idx))

    return app


def main():
    """Parse the command-line options and run the development server."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default="task4_data.json", help="Input JSON file")
    parser.add_argument("--out", default="task4_annotations.json", help="Output JSON file")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", default=5001, type=int)
    # The interactive debugger can execute arbitrary code, so it is opt-in
    # rather than always on (the host is configurable).
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable Flask debug mode (local development only)",
    )
    args = parser.parse_args()

    app = make_app(Path(args.data), Path(args.out))
    app.run(host=args.host, port=args.port, debug=args.debug)


if __name__ == "__main__":
    main()