VisionSemanticEntropy/eval.py at main · TruhnLab/VisionSemanticEntropy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Script to evaluate the answers generated by different LLMs
from clusterAnswers import execEvalOfQuestions,applyBootstrapping
import promptLLM as promptLLM
from CONFIG import ANSWER_CLUSTER_FOLDER
import sys

# The model to evaluate is selected by the first command-line argument.
# It is validated explicitly instead of with `assert`: assertions are removed
# under `python -O`, in which case an unsupported name would match neither
# `if MODEL == ...` branch below and the script would do nothing without
# reporting an error. A missing argument is reported the same way instead of
# surfacing as an IndexError.
SUPPORTED_MODELS = ("GPT4o", "GPT4.1")
if len(sys.argv) < 2 or sys.argv[1] not in SUPPORTED_MODELS:
    sys.exit("usage: " + sys.argv[0] + " {" + "|".join(SUPPORTED_MODELS) + "}")
MODEL = sys.argv[1]

# Value passed as `numBootstraps` to every applyBootstrapping call.
NUM_BOOTSTRAP = 100_000  # change to 1M ?

# --- Evaluation CSV file names for GPT4o ------------------------------------
# Bare file names only; addFolder() puts ANSWER_CLUSTER_FOLDER in front of them.
VQAMED2019_LIST_GPT4O = ["EVAL_VQAMed2019_GPT4o.csv"]
RAD_DATASET_WITH_CONTEXT_LIST_GPT4O = ["EVAL_RadDataset_withContext_GPT4o.csv"]
# Both datasets together: VQA-Med 2019 first, then RadDataset.
FULL_LIST_GPT4O = VQAMED2019_LIST_GPT4O + RAD_DATASET_WITH_CONTEXT_LIST_GPT4O

# --- Evaluation CSV file names for GPT4.1 -----------------------------------
VQAMED2019_LIST_GPT4_1 = ["EVAL_VQAMed2019_GPT4_1.csv"]
RAD_DATASET_WITH_CONTEXT_LIST_GPT4_1 = ["EVAL_RadDataset_withContext_GPT4_1.csv"]
# Both datasets together: VQA-Med 2019 first, then RadDataset.
FULL_LIST_GPT4_1 = VQAMED2019_LIST_GPT4_1 + RAD_DATASET_WITH_CONTEXT_LIST_GPT4_1


# Values passed one at a time as the `subtopic` argument of
# execEvalOfQuestions for the RadDataset evaluation files.
RAD_DATASET_SUBTOPICS = [
    "Angiography", "ComputedTomography", "MagneticResonanceImaging", "Radiography",
]

# Values passed one at a time as the `subtopic` argument of
# execEvalOfQuestions for the VQA-Med 2019 evaluation files.
VQAMED2019_SUBTOPICS = ["modality", "plane", "organ", "abnormality"]


def addFolder(lst):
    """Return a new list with ANSWER_CLUSTER_FOLDER prepended to each entry.

    The prefix is added by plain string concatenation; no path separator is
    inserted between the folder and the file name.

    Args:
        lst: Iterable of file-name strings.

    Returns:
        A list of the prefixed strings, in the same order as ``lst``.
    """
    return [ANSWER_CLUSTER_FOLDER + x for x in lst]

#fullClusterAnswerListGPT4o = execEvalOfQuestions(addFolder(FULL_LIST_GPT4O),promptFunc=promptLLM.promptGPT4o,noNewPrompts=True)
if MODEL == "GPT4.1":
    # GPT4.1 run: the combined file list first, then each dataset on its own
    # followed by one execEvalOfQuestions call per subtopic. Every call passes
    # noNewPrompts=True.
    fullClusterAnswerListGPT4_1 = execEvalOfQuestions(
        addFolder(FULL_LIST_GPT4_1),
        promptFunc=promptLLM.promptGPT4_1,
        noNewPrompts=True,
    )
    # Bootstrap the combined result with entropyVal 0.6, then 0.3.
    for entropyValue in (0.6, 0.3):
        applyBootstrapping(
            fullClusterAnswerListGPT4_1,
            confP=0.05,
            entropyVal=entropyValue,
            numBootstraps=NUM_BOOTSTRAP,
        )

    print("+" * 60)

    radDatasetClusterAnswerListGPT4_1 = execEvalOfQuestions(
        addFolder(RAD_DATASET_WITH_CONTEXT_LIST_GPT4_1),
        promptFunc=promptLLM.promptGPT4_1,
        noNewPrompts=True,
    )
    for entropyValue in (0.6, 0.3):
        applyBootstrapping(
            radDatasetClusterAnswerListGPT4_1,
            confP=0.05,
            entropyVal=entropyValue,
            numBootstraps=NUM_BOOTSTRAP,
        )

    # Per-subtopic calls; their return values are not kept.
    for radSubtopic in RAD_DATASET_SUBTOPICS:
        execEvalOfQuestions(
            addFolder(RAD_DATASET_WITH_CONTEXT_LIST_GPT4_1),
            promptFunc=promptLLM.promptGPT4_1,
            noNewPrompts=True,
            subtopic=radSubtopic,
        )

    print("+" * 60)

    vqamed2019ClusterAnswerListGPT4_1 = execEvalOfQuestions(
        addFolder(VQAMED2019_LIST_GPT4_1),
        promptFunc=promptLLM.promptGPT4_1,
        noNewPrompts=True,
    )
    for entropyValue in (0.6, 0.3):
        applyBootstrapping(
            vqamed2019ClusterAnswerListGPT4_1,
            confP=0.05,
            entropyVal=entropyValue,
            numBootstraps=NUM_BOOTSTRAP,
        )

    # Per-subtopic calls; their return values are not kept.
    for vqaSubtopic in VQAMED2019_SUBTOPICS:
        execEvalOfQuestions(
            addFolder(VQAMED2019_LIST_GPT4_1),
            promptFunc=promptLLM.promptGPT4_1,
            noNewPrompts=True,
            subtopic=vqaSubtopic,
        )


if MODEL == "GPT4o":
    # GPT4o run: the combined file list first, then each dataset on its own
    # followed by one execEvalOfQuestions call per subtopic. Every call passes
    # noNewPrompts=True.
    fullClusterAnswerListGPT4o = execEvalOfQuestions(
        addFolder(FULL_LIST_GPT4O),
        promptFunc=promptLLM.promptGPT4o,
        noNewPrompts=True,
    )
    # Bootstrap the combined result with entropyVal 0.6, then 0.3.
    for entropyValue in (0.6, 0.3):
        applyBootstrapping(
            fullClusterAnswerListGPT4o,
            confP=0.05,
            entropyVal=entropyValue,
            numBootstraps=NUM_BOOTSTRAP,
        )

    print("+" * 60)

    radDatasetClusterAnswerListGPT4o = execEvalOfQuestions(
        addFolder(RAD_DATASET_WITH_CONTEXT_LIST_GPT4O),
        promptFunc=promptLLM.promptGPT4o,
        noNewPrompts=True,
    )
    for entropyValue in (0.6, 0.3):
        applyBootstrapping(
            radDatasetClusterAnswerListGPT4o,
            confP=0.05,
            entropyVal=entropyValue,
            numBootstraps=NUM_BOOTSTRAP,
        )

    # Per-subtopic calls. The return value is intentionally not assigned, so
    # radDatasetClusterAnswerListGPT4o keeps the whole-dataset result instead
    # of being overwritten by the result for the last subtopic.
    for radSubtopic in RAD_DATASET_SUBTOPICS:
        execEvalOfQuestions(
            addFolder(RAD_DATASET_WITH_CONTEXT_LIST_GPT4O),
            promptFunc=promptLLM.promptGPT4o,
            noNewPrompts=True,
            subtopic=radSubtopic,
        )

    print("+" * 60)

    vqamed2019ClusterAnswerListGPT4o = execEvalOfQuestions(
        addFolder(VQAMED2019_LIST_GPT4O),
        promptFunc=promptLLM.promptGPT4o,
        noNewPrompts=True,
    )
    for entropyValue in (0.6, 0.3):
        applyBootstrapping(
            vqamed2019ClusterAnswerListGPT4o,
            confP=0.05,
            entropyVal=entropyValue,
            numBootstraps=NUM_BOOTSTRAP,
        )

    # Per-subtopic calls. The return value is intentionally not assigned, so
    # fullClusterAnswerListGPT4o keeps the combined-file-list result instead
    # of being overwritten by a VQA-Med 2019 subtopic result.
    for vqaSubtopic in VQAMED2019_SUBTOPICS:
        execEvalOfQuestions(
            addFolder(VQAMED2019_LIST_GPT4O),
            promptFunc=promptLLM.promptGPT4o,
            noNewPrompts=True,
            subtopic=vqaSubtopic,
        )