-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.py
More file actions
116 lines (75 loc) · 4.33 KB
/
eval.py
File metadata and controls
116 lines (75 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Script to evaluate the answers generated by different LLMs
from clusterAnswers import execEvalOfQuestions,applyBootstrapping
import promptLLM as promptLLM
from CONFIG import ANSWER_CLUSTER_FOLDER
import sys
# Model to evaluate, taken from the first command-line argument.
# The argument is validated explicitly rather than with `assert`: asserts are
# stripped when Python runs with -O, and a missing argument would otherwise
# surface as a bare IndexError instead of a usage hint.
_SUPPORTED_MODELS = ("GPT4o", "GPT4.1")
if len(sys.argv) < 2 or sys.argv[1] not in _SUPPORTED_MODELS:
    sys.exit("usage: " + sys.argv[0] + " <" + " | ".join(_SUPPORTED_MODELS) + ">")
MODEL = sys.argv[1]
# Value passed as numBootstraps to every applyBootstrapping call below
# (open question from the author: change to 1M?).
NUM_BOOTSTRAP = 100000

# Evaluation CSV file names for GPT4o; addFolder adds the folder prefix later.
VQAMED2019_LIST_GPT4O = ["EVAL_VQAMed2019_GPT4o.csv"]
RAD_DATASET_WITH_CONTEXT_LIST_GPT4O = ["EVAL_RadDataset_withContext_GPT4o.csv"]
# Both datasets together: VQAMed2019 first, then the Rad dataset.
FULL_LIST_GPT4O = VQAMED2019_LIST_GPT4O + RAD_DATASET_WITH_CONTEXT_LIST_GPT4O

# Evaluation CSV file names for GPT4.1, laid out like the GPT4o lists.
VQAMED2019_LIST_GPT4_1 = ["EVAL_VQAMed2019_GPT4_1.csv"]
RAD_DATASET_WITH_CONTEXT_LIST_GPT4_1 = ["EVAL_RadDataset_withContext_GPT4_1.csv"]
FULL_LIST_GPT4_1 = VQAMED2019_LIST_GPT4_1 + RAD_DATASET_WITH_CONTEXT_LIST_GPT4_1

# Values passed as `subtopic` to execEvalOfQuestions for each dataset.
RAD_DATASET_SUBTOPICS = [
    "Angiography",
    "ComputedTomography",
    "MagneticResonanceImaging",
    "Radiography",
]
VQAMED2019_SUBTOPICS = ["modality", "plane", "organ", "abnormality"]
def addFolder(lst):
    """Return a new list with ANSWER_CLUSTER_FOLDER prepended to each entry of *lst*.

    The prefix is joined by plain string concatenation; no path separator is
    inserted between the folder and the file name.
    """
    prefixed = []
    for fileName in lst:
        prefixed.append(ANSWER_CLUSTER_FOLDER + fileName)
    return prefixed
#fullClusterAnswerListGPT4o = execEvalOfQuestions(addFolder(FULL_LIST_GPT4O),promptFunc=promptLLM.promptGPT4o,noNewPrompts=True)
if MODEL == "GPT4.1":
    # One entry per report section: (CSV file names, subtopics evaluated
    # separately after the section's bootstrapping). The combined list has no
    # per-subtopic pass.
    gpt41Sections = (
        (FULL_LIST_GPT4_1, ()),
        (RAD_DATASET_WITH_CONTEXT_LIST_GPT4_1, RAD_DATASET_SUBTOPICS),
        (VQAMED2019_LIST_GPT4_1, VQAMED2019_SUBTOPICS),
    )
    for sectionIdx, (csvNames, subtopics) in enumerate(gpt41Sections):
        # Separator line between consecutive sections of the output.
        if sectionIdx > 0:
            print("+"*60)
        clusterAnswers = execEvalOfQuestions(
            addFolder(csvNames),
            promptFunc=promptLLM.promptGPT4_1,
            noNewPrompts=True,
        )
        # Bootstrapping is run twice per section, with entropyVal 0.6 then 0.3.
        for entropyThreshold in (0.6, 0.3):
            applyBootstrapping(
                clusterAnswers,
                confP=0.05,
                entropyVal=entropyThreshold,
                numBootstraps=NUM_BOOTSTRAP,
            )
        # Per-subtopic evaluation; the return value is not used.
        for subtopic in subtopics:
            execEvalOfQuestions(
                addFolder(csvNames),
                promptFunc=promptLLM.promptGPT4_1,
                noNewPrompts=True,
                subtopic=subtopic,
            )
if MODEL == "GPT4o":
    # One entry per report section: (CSV file names, subtopics evaluated
    # separately after the section's bootstrapping). The combined list has no
    # per-subtopic pass.
    gpt4oSections = (
        (FULL_LIST_GPT4O, ()),
        (RAD_DATASET_WITH_CONTEXT_LIST_GPT4O, RAD_DATASET_SUBTOPICS),
        (VQAMED2019_LIST_GPT4O, VQAMED2019_SUBTOPICS),
    )
    for sectionIdx, (csvNames, subtopics) in enumerate(gpt4oSections):
        # Separator line between consecutive sections of the output.
        if sectionIdx > 0:
            print("+"*60)
        clusterAnswers = execEvalOfQuestions(
            addFolder(csvNames),
            promptFunc=promptLLM.promptGPT4o,
            noNewPrompts=True,
        )
        # Bootstrapping is run twice per section, with entropyVal 0.6 then 0.3.
        for entropyThreshold in (0.6, 0.3):
            applyBootstrapping(
                clusterAnswers,
                confP=0.05,
                entropyVal=entropyThreshold,
                numBootstraps=NUM_BOOTSTRAP,
            )
        # Per-subtopic evaluation. The result is deliberately not stored:
        # nothing reads it, and the earlier code overwrote the section-level
        # (and, for VQAMed2019, the full-list) result variable with it, unlike
        # the GPT4.1 branch.
        for subtopic in subtopics:
            execEvalOfQuestions(
                addFolder(csvNames),
                promptFunc=promptLLM.promptGPT4o,
                noNewPrompts=True,
                subtopic=subtopic,
            )