-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.py
More file actions
140 lines (109 loc) · 5.36 KB
/
eval.py
File metadata and controls
140 lines (109 loc) · 5.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from __future__ import annotations
import json
from collections import Counter, defaultdict
from pathlib import Path
# Absolute directory that contains this script; the result files below are
# located relative to it rather than to the current working directory.
ROOT = Path(__file__).resolve().parents[0]

# Result files loaded by main(): the A4VL file sits next to the script, the
# LVAgent file has the same name inside the "results" subdirectory.
A4VL_PATH = ROOT.joinpath("egoschema_results.json")
LVAGENT_PATH = ROOT.joinpath("results", "egoschema_results.json")
def load_results(path: Path) -> list[dict]:
    """Load a results file and return its parsed JSON content.

    Args:
        path: Location of the JSON results file.

    Returns:
        Whatever ``json.load`` produces for the file. The rest of this
        script iterates over it as a list of per-item result dicts.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification. Pinning the encoding keeps parsing
    # independent of the platform's locale default (e.g. cp1252 on Windows),
    # which would otherwise mangle or reject non-ASCII content.
    with path.open("r", encoding="utf-8") as fp:
        return json.load(fp)
def gold_answer(item: dict) -> str:
    """Return the ground-truth option letter for *item*.

    ``item["correct_choice"]`` is converted with ``int`` and used as an
    offset from ``"A"``, so 0 maps to "A", 1 to "B", and so on.

    Raises:
        KeyError: If *item* has no ``"correct_choice"`` entry.
    """
    choice_index = int(item["correct_choice"])
    first_letter_code = ord("A")
    return chr(first_letter_code + choice_index)
def final_answer(item: dict) -> str:
    """Return the first character of the item's ``"final_answer"`` value.

    The stored value is passed through ``str`` before slicing, so non-string
    values are accepted. A missing key yields the empty string.
    """
    raw_answer = item.get("final_answer", "")
    answer_text = str(raw_answer)
    return answer_text[:1]
def round_name(item: dict) -> str:
    """Name the latest round recorded on *item*.

    The markers are checked from latest to earliest, so an item carrying
    several of them is attributed to the latest round. An item with none of
    the markers is reported as ``"unfinished"``.
    """
    # Ordered (key present in item, round label) pairs; the first match wins.
    round_markers = (
        ("third_samp", "third"),
        ("second_samp", "second"),
        ("final_answer", "first"),
    )
    for marker_key, label in round_markers:
        if marker_key in item:
            return label
    return "unfinished"
def summarize(name: str, data: list[dict]) -> list[dict]:
    """Print accuracy statistics for one result set and return its finished items.

    An item counts as finished when it has a ``"final_answer"`` key. The
    report printed to stdout contains overall counts, accuracy over finished
    and over all items, a per-round breakdown, and (when any votes exist)
    the accuracy of each agent's first-round vote.

    Args:
        name: Heading printed above the report.
        data: Result items to summarize.

    Returns:
        The finished items, in their original order.
    """
    finished = [entry for entry in data if "final_answer" in entry]

    n_correct = 0
    for entry in finished:
        if final_answer(entry) == gold_answer(entry):
            n_correct += 1

    print(f"\n{name}")
    print(f"Items: {len(data)}")
    print(f"Finished: {len(finished)}")
    print(f"Correct: {n_correct}")
    if finished:
        print(f"Accuracy on finished: {n_correct / len(finished):.4f}")
    else:
        print("Accuracy on finished: n/a")
    if data:
        print(f"Accuracy on all items: {n_correct / len(data):.4f}")
    else:
        print("Accuracy on all items: n/a")

    # Break the finished items down by the round that produced the answer.
    for stage in ("first", "second", "third"):
        outcomes = [
            final_answer(entry) == gold_answer(entry)
            for entry in finished
            if round_name(entry) == stage
        ]
        stage_hits = sum(outcomes)
        stage_rate = stage_hits / len(outcomes) if outcomes else 0.0
        print(f"{stage.title()} round finals: {stage_hits}/{len(outcomes)} ({stage_rate:.4f})")

    # Per agent: [votes matching the gold answer, votes cast].
    vote_tally: dict[str, list[int]] = defaultdict(lambda: [0, 0])
    for entry in finished:
        ballots = entry.get("first_round", {}).get("answer_dict", {})
        for agent, vote in ballots.items():
            counts = vote_tally[agent]
            counts[1] += 1
            counts[0] += str(vote)[:1] == gold_answer(entry)

    if vote_tally:
        print("First-round model vote accuracy:")
        for agent in sorted(vote_tally):
            agent_hits, agent_total = vote_tally[agent]
            print(f" {agent}: {agent_hits}/{agent_total} ({agent_hits / agent_total:.4f})")

    return finished
def compare_aligned(a4vl_done: list[dict], lvagent_done: list[dict]) -> None:
    """Print a head-to-head comparison on the videos present in both result sets.

    Items are matched by their ``"video_path"`` value; if a path occurs more
    than once within one list, the last occurrence is used. For every shared
    video the gold answer is taken from the A4VL item and both final answers
    are scored against it. The report printed to stdout contains overall
    accuracy for each system, the agreement breakdown, per-agent first-round
    vote accuracy for each system, and the most common
    ``(a4vl_round, lvagent_round, a4vl_ok, lvagent_ok)`` patterns.

    Args:
        a4vl_done: Finished A4VL items.
        lvagent_done: Finished LVAgent items.

    Raises:
        KeyError: If an item lacks ``"video_path"``.
    """
    a4vl_by_video = {item["video_path"]: item for item in a4vl_done}
    lvagent_by_video = {item["video_path"]: item for item in lvagent_done}
    shared_videos = sorted(set(a4vl_by_video) & set(lvagent_by_video))
    shared_total = len(shared_videos)

    a4vl_correct = 0
    lvagent_correct = 0
    a4vl_only = 0
    lvagent_only = 0
    both_correct = 0
    both_wrong = 0
    round_counter = Counter()
    # Per agent: [A4VL hits, A4VL votes, LVAgent hits, LVAgent votes].
    first_vote_stats: dict[str, list[int]] = defaultdict(lambda: [0, 0, 0, 0])

    for video_path in shared_videos:
        a4vl_item = a4vl_by_video[video_path]
        lvagent_item = lvagent_by_video[video_path]
        gold = gold_answer(a4vl_item)
        a4vl_ok = final_answer(a4vl_item) == gold
        lvagent_ok = final_answer(lvagent_item) == gold

        # Booleans add as 0/1, so each counter just accumulates its predicate.
        a4vl_correct += a4vl_ok
        lvagent_correct += lvagent_ok
        a4vl_only += a4vl_ok and not lvagent_ok
        lvagent_only += lvagent_ok and not a4vl_ok
        both_correct += a4vl_ok and lvagent_ok
        both_wrong += not a4vl_ok and not lvagent_ok
        round_counter[(round_name(a4vl_item), round_name(lvagent_item), a4vl_ok, lvagent_ok)] += 1

        a4vl_first = a4vl_item.get("first_round", {}).get("answer_dict", {})
        lvagent_first = lvagent_item.get("first_round", {}).get("answer_dict", {})
        for agent_name in sorted(set(a4vl_first) | set(lvagent_first)):
            stats = first_vote_stats[agent_name]
            if agent_name in a4vl_first:
                stats[1] += 1
                stats[0] += str(a4vl_first[agent_name])[:1] == gold
            if agent_name in lvagent_first:
                stats[3] += 1
                stats[2] += str(lvagent_first[agent_name])[:1] == gold

    # With no shared videos the accuracy is undefined; report "n/a" instead
    # of dividing by zero, matching the headline accuracy lines of summarize().
    if shared_total:
        a4vl_acc_text = f"{a4vl_correct / shared_total:.4f}"
        lvagent_acc_text = f"{lvagent_correct / shared_total:.4f}"
    else:
        a4vl_acc_text = lvagent_acc_text = "n/a"

    print("\nAligned Comparison")
    print(f"Shared finished videos: {shared_total}")
    print(f"A4VL correct: {a4vl_correct}/{shared_total} ({a4vl_acc_text})")
    print(f"LVAgent correct: {lvagent_correct}/{shared_total} ({lvagent_acc_text})")
    print(f"Both correct: {both_correct}")
    print(f"Both wrong: {both_wrong}")
    print(f"A4VL only correct: {a4vl_only}")
    print(f"LVAgent only correct: {lvagent_only}")

    print("Aligned first-round model vote accuracy:")
    for agent_name in sorted(first_vote_stats):
        a4vl_hits, a4vl_total, lv_hits, lv_total = first_vote_stats[agent_name]
        # An agent may have voted in only one system; its other side is 0/0.
        a4vl_acc = a4vl_hits / a4vl_total if a4vl_total else 0.0
        lv_acc = lv_hits / lv_total if lv_total else 0.0
        print(
            f" {agent_name}: "
            f"A4VL {a4vl_hits}/{a4vl_total} ({a4vl_acc:.4f}), "
            f"LVAgent {lv_hits}/{lv_total} ({lv_acc:.4f})"
        )

    print("Top round/correctness patterns:")
    for key, count in round_counter.most_common(12):
        print(f" {key}: {count}")
def main() -> None:
    """Summarize the A4VL and LVAgent result files, then compare them.

    Each file is loaded and summarized in turn (A4VL first), and the finished
    items returned by the two summaries are passed to the aligned comparison.
    """
    sources = (("A4VL", A4VL_PATH), ("LVAgent", LVAGENT_PATH))
    finished_by_system = {
        label: summarize(label, load_results(results_path))
        for label, results_path in sources
    }
    compare_aligned(finished_by_system["A4VL"], finished_by_system["LVAgent"])
# Produce the report only when this file is executed as a script, so that
# importing the module for its helpers has no side effects.
if __name__ == "__main__":
    main()