From b7b22136b310804e7b0927e07d73c270f1749308 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Fri, 15 May 2026 14:52:59 +0000 Subject: [PATCH 1/8] feat: implement MedXpert-QA-Text benchmark scenario and update dependencies --- pyproject.toml | 2 +- requirements.txt | 2 +- .../benchmark/run_specs/medhelm_run_specs.py | 28 +++++++ .../scenarios/medxpert_qa_text_scenario.py | 82 +++++++++++++++++++ src/helm/benchmark/static/schema_medhelm.yaml | 18 ++++ 5 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py diff --git a/pyproject.toml b/pyproject.toml index d6fda049d7b..d2930373a91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ dependencies = [ "sqlitedict>=2.1.0,<3.0", "bottle~=0.12.23", # Basic Scenarios - "datasets~=3.1", + "datasets>=3.1", "pyarrow>=11.0.0", # Pinned transitive dependency for datasets; workaround for #1026 "pyarrow-hotfix~=0.6", # Hotfix for CVE-2023-47248 # Basic metrics diff --git a/requirements.txt b/requirements.txt index ae9d4a70793..defd7e93c57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,7 +27,7 @@ cryptography==46.0.5 cycler==0.12.1 cymem==2.0.13 dacite==1.9.2 -datasets==3.6.0 +datasets==4.8.5 dill==0.3.8 distlib==0.4.0 distro==1.9.0 diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py index 31daa321592..2841905057a 100644 --- a/src/helm/benchmark/run_specs/medhelm_run_specs.py +++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py @@ -1732,3 +1732,31 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non metric_specs=metric_specs, groups=["health_bench_professional"], ) + +@run_spec_function("medxpert_qa_text") +def get_medxpert_qa_text_spec(jury_config_path: Optional[str] = None) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario", + args={}, + ) + + 
adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + instructions=( + "You are a medical expert assistant. Answer the following medical question with single letter from options." + ), + input_noun="Question", + output_noun="Answer", + max_tokens=1, + max_train_instances=0, + ) + + metric_specs = get_exact_match_metric_specs() + + return RunSpec( + name="medxpert_qa_text", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=["medxpert_qa_text"], + ) \ No newline at end of file diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py new file mode 100644 index 00000000000..8c3baeb76c2 --- /dev/null +++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py @@ -0,0 +1,82 @@ +import os +from typing import List +import pandas as pd + +from datasets import DatasetDict, load_dataset + +from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo +from helm.common.general import ensure_directory_exists, ensure_file_downloaded + +from helm.benchmark.scenarios.scenario import ( + CORRECT_TAG, + TEST_SPLIT, + Input, + Instance, + Output, + Reference, + Scenario, + ScenarioMetadata, +) + +class MedXpertQATextScenario(Scenario): + """ + The MedXpertQA dataset introduced in the MedXpert paper by Li et al: + @article{zuo2025medxpertqa, + title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding}, + author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen}, + journal={arXiv preprint arXiv:2501.18362}, + year={2025} + } + """ + + HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA" + + name = "medxpert_qa" + description = ( + "MedXpertQA is a benchmark designed to evaluate the medical reasoning and understanding capabilities of" + "language models. 
Each instance in the dataset consists of a medical question and its corresponding " + "expert-level answer. The benchmark assesses a model's ability to comprehend complex medical information, reason through clinical scenarios, and provide accurate and informative responses that align with expert knowledge in the field of medicine." + ) + tags = ["knowledge", "generation", "question_answering", "biomedical"] + + def get_instances(self, output_path: str) -> List[Instance]: + data_path: str = os.path.join(output_path, "data") + ensure_directory_exists(data_path) + dataset: DatasetDict = load_dataset( + self.HF_DATASET_NAME, + "Text", + cache_dir=data_path) + + # split the dataset into train, validation, and test splits + splits = {TEST_SPLIT: ["test"]} + instances: List[Instance] = [] + for ( + helm_split_name, + dataset_splits_name, + ) in splits.items(): # Iterate over the splits + for dataset_split_name in dataset_splits_name: + split_data = dataset[dataset_split_name] + + for example in split_data: + question = example["question"] + answer = example["label"] + + instance = Instance( + input=Input(text=question), + references=[ + Reference( + Output(text=option), + tags=[CORRECT_TAG] if alpha == answer else [], + ) + for alpha, option in example['options'].items() + ], + split=helm_split_name, + extra_data={ + "id": example["id"], + "medical_task": example["medical_task"], + "body_system": example["body_system"], + "question_type": example["question_type"], + }, + ) + instances.append(instance) + return instances \ No newline at end of file diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml index 34e5dbdc99c..a5182e93d53 100644 --- a/src/helm/benchmark/static/schema_medhelm.yaml +++ b/src/helm/benchmark/static/schema_medhelm.yaml @@ -517,6 +517,7 @@ run_groups: - sct_bench - health_bench - health_bench_professional + - medxpert_qa_text - name: clinical_note_generation display_name: Clinical Note Generation @@ 
-722,6 +723,23 @@ run_groups: when: Any language: English + - name: medxpert_qa_text + display_name: MedXpert-QA-Text + description: MedXpert-QA-Text is a benchmark designed to evaluate a model's ability to answer medical questions based on unstructured clinical text. Each instance includes a clinical note and a related question, requiring the model to extract relevant information from the text to provide an accurate answer [(Zhang et al., 2024)](https://arxiv.org/abs/2406.12036). + metric_groups: + - accuracy + - efficiency + - general_information + environment: + main_name: exact_match + main_split: test + taxonomy: + task: Question answering + what: Medical knowledge testing + who: Medical student, Researcher + when: Any + language: English + - name: medbullets display_name: Medbullets description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com). 
From 530ad78a7730f6cdb296c89b34b8195931a5012d Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 18 May 2026 13:36:04 +0000 Subject: [PATCH 2/8] fix: update package specifications for numba and together; add openpyxl to dependencies --- pyproject.toml | 6 +++--- requirements.txt | 4 ++-- uv.lock | 7 ++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d2930373a91..0bd78a98972 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,7 +84,7 @@ scenarios = [ metrics = [ "google-api-python-client~=2.64", # For perspective_api_client via toxicity_metrics - "numba~=0.56", # For copyright_metrics + "numba>=0.56", # For copyright_metrics "sacrebleu~=2.2", # For disinformation_metrics, machine_translation_metrics "langdetect~=1.0", # For ifeval_metrics "immutabledict~=4.2", # For ifeval_metrics @@ -197,7 +197,7 @@ google = [ ] together = [ - "together~=1.1", + "together>=1.1", ] yandex = [ @@ -487,7 +487,7 @@ conflicts = [ # openai-whisper build needs setuptools + wheel; it doesn't declare them (CI uses --no-build-isolation-package + venv with both) [tool.uv.extra-build-dependencies] -openai-whisper = ["setuptools", "wheel"] +openai-whisper = ["setuptools<82", "wheel"] [tool.setuptools] package-dir = { "" = "src" } diff --git a/requirements.txt b/requirements.txt index defd7e93c57..24b694c4019 100644 --- a/requirements.txt +++ b/requirements.txt @@ -79,7 +79,7 @@ mypy-extensions==1.1.0 networkx==3.4.2 nltk==3.9.2 nodeenv==1.10.0 -numba==0.63.1 +numba numpy==2.2.6 nvidia-cublas-cu12==12.8.4.1 nvidia-cuda-cupti-cu12==12.8.90 @@ -153,7 +153,7 @@ tabulate==0.9.0 thinc==8.3.10 threadpoolctl==3.6.0 tiktoken==0.12.0 -together==1.5.29 +together tokenizers==0.21.4 toml==0.10.2 tomli==2.4.0 diff --git a/uv.lock b/uv.lock index 620636cfa4b..857dff48161 100644 --- a/uv.lock +++ b/uv.lock @@ -4234,6 +4234,7 @@ all = [ { name = "opencc" }, { name = "opencv-python" }, { name = "opencv-python-headless" }, + { name = 
"openpyxl" }, { name = "pdf2image" }, { name = "pillow" }, { name = "pycocoevalcap" }, @@ -4587,7 +4588,7 @@ requires-dist = [ { name = "colorcet", marker = "extra == 'plots'", specifier = "~=3.0" }, { name = "colorlog", specifier = "~=6.9" }, { name = "dacite", specifier = "~=1.6" }, - { name = "datasets", specifier = "~=3.1" }, + { name = "datasets", specifier = ">=3.1" }, { name = "diffusers", marker = "extra == 'heim'", specifier = "~=0.34.0" }, { name = "dspy", marker = "extra == 'dspy'", specifier = "~=3.0" }, { name = "einops", marker = "extra == 'audiolm'", specifier = "~=0.7.0" }, @@ -4687,7 +4688,7 @@ requires-dist = [ { name = "nltk", specifier = "~=3.7,!=3.9.0" }, { name = "nltk", marker = "extra == 'summarization'", specifier = "~=3.7,!=3.9.0" }, { name = "nudenet", marker = "extra == 'heim'", specifier = "~=2.0" }, - { name = "numba", marker = "extra == 'metrics'", specifier = "~=0.56" }, + { name = "numba", marker = "extra == 'metrics'", specifier = ">=0.56" }, { name = "numpy", specifier = ">=1.26,<3" }, { name = "numpy", marker = "extra == 'heim'", specifier = ">=1.26" }, { name = "omegaconf", marker = "extra == 'heim'", specifier = "~=2.3" }, @@ -4751,7 +4752,7 @@ requires-dist = [ { name = "tensorflow", marker = "extra == 'heim'", specifier = "~=2.11" }, { name = "tiktoken", marker = "extra == 'openai'", specifier = "~=0.7" }, { name = "timm", marker = "extra == 'heim'", specifier = "~=0.6.12" }, - { name = "together", marker = "extra == 'together'", specifier = "~=1.1" }, + { name = "together", marker = "extra == 'together'", specifier = ">=1.1" }, { name = "tokenizers", marker = "extra == 'aleph-alpha'", specifier = ">=0.13.3" }, { name = "torch", specifier = ">=1.13.1,<3.0.0" }, { name = "torch", marker = "extra == 'vlm'", specifier = "~=2.1" }, From 63cec538ccbc23bd1a3ff78475b3f4bd55618ce5 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 18 May 2026 13:36:20 +0000 Subject: [PATCH 3/8] feat: implement MedXpertQA Text scenario with 
detailed documentation and metadata --- .../benchmark/run_specs/medhelm_run_specs.py | 5 +- .../scenarios/medxpert_qa_text_scenario.py | 47 ++++++++++++++----- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py index 2841905057a..922d2e207c1 100644 --- a/src/helm/benchmark/run_specs/medhelm_run_specs.py +++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py @@ -1733,8 +1733,9 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non groups=["health_bench_professional"], ) + @run_spec_function("medxpert_qa_text") -def get_medxpert_qa_text_spec(jury_config_path: Optional[str] = None) -> RunSpec: +def get_medxpert_qa_text_spec() -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario", args={}, @@ -1759,4 +1760,4 @@ def get_medxpert_qa_text_spec(jury_config_path: Optional[str] = None) -> RunSpec adapter_spec=adapter_spec, metric_specs=metric_specs, groups=["medxpert_qa_text"], - ) \ No newline at end of file + ) diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py index 8c3baeb76c2..50eda469002 100644 --- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py +++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py @@ -1,6 +1,5 @@ import os from typing import List -import pandas as pd from datasets import DatasetDict, load_dataset @@ -18,9 +17,23 @@ ScenarioMetadata, ) + class MedXpertQATextScenario(Scenario): """ - The MedXpertQA dataset introduced in the MedXpert paper by Li et al: + From "MedXpertQA: Benchmarking Expert-Level Medical Knowledge and Reasoning" (2025), + MedXpertQA is a highly challenging benchmark designed to evaluate expert-level medical knowledge, + clinical reasoning, and advanced problem-solving abilities in large language models. 
+ The benchmark contains 4,460 questions spanning 17 medical specialties and 11 body systems, + with a dedicated Text subset for text-only evaluation and an MM subset for multimodal clinical reasoning. + + The dataset includes rigorously curated specialty board-style questions enriched with detailed clinical contexts, + patient records, and examination findings. MedXpertQA applies filtering, augmentation, and data synthesis + techniques to improve difficulty, reduce data leakage risks, and ensure strong clinical relevance through + multiple rounds of expert review. + + HuggingFace Dataset: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA + ArXiv Paper: https://arxiv.org/abs/2501.18362 + @article{zuo2025medxpertqa, title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding}, author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen}, @@ -33,20 +46,18 @@ class MedXpertQATextScenario(Scenario): name = "medxpert_qa" description = ( - "MedXpertQA is a benchmark designed to evaluate the medical reasoning and understanding capabilities of" - "language models. Each instance in the dataset consists of a medical question and its corresponding " - "expert-level answer. The benchmark assesses a model's ability to comprehend complex medical information, reason through clinical scenarios, and provide accurate and informative responses that align with expert knowledge in the field of medicine." + "MedXpertQA Text is a text-only benchmark designed to evaluate expert-level medical knowledge, clinical reasoning," + " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems." + " It features rigorously curated and clinically relevant board-style questions, enhanced through expert review and data synthesis " + "techniques to ensure high difficulty, reliability, and minimal data leakage." 
) tags = ["knowledge", "generation", "question_answering", "biomedical"] def get_instances(self, output_path: str) -> List[Instance]: data_path: str = os.path.join(output_path, "data") ensure_directory_exists(data_path) - dataset: DatasetDict = load_dataset( - self.HF_DATASET_NAME, - "Text", - cache_dir=data_path) - + dataset: DatasetDict = load_dataset(self.HF_DATASET_NAME, "Text", cache_dir=data_path) + # split the dataset into train, validation, and test splits splits = {TEST_SPLIT: ["test"]} instances: List[Instance] = [] @@ -68,7 +79,7 @@ def get_instances(self, output_path: str) -> List[Instance]: Output(text=option), tags=[CORRECT_TAG] if alpha == answer else [], ) - for alpha, option in example['options'].items() + for alpha, option in example["options"].items() ], split=helm_split_name, extra_data={ @@ -79,4 +90,16 @@ def get_instances(self, output_path: str) -> List[Instance]: }, ) instances.append(instance) - return instances \ No newline at end of file + return instances + + def get_metadata(self) -> ScenarioMetadata: + return ScenarioMetadata( + name=self.name, + description=self.description, + tags=self.tags, + taxonomy_info=TaxonomyInfo( + subjects=["medicine"], + objects=["question_answering"], + domains=["biomedical"], + ), + ) \ No newline at end of file From e6df289570b27267c59b18a2677d6dcfc42db7a7 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 18 May 2026 13:36:49 +0000 Subject: [PATCH 4/8] fix: remove unused import for ensure_file_downloaded in MedXpertQATextScenario --- src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py index 50eda469002..0bdad444b68 100644 --- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py +++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py @@ -4,7 +4,7 @@ from datasets import DatasetDict, 
load_dataset from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo -from helm.common.general import ensure_directory_exists, ensure_file_downloaded +from helm.common.general import ensure_directory_exists from helm.benchmark.scenarios.scenario import ( CORRECT_TAG, From 781d2a697d5e2bc63e5b2653cf425515e89a85a8 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 18 May 2026 13:40:03 +0000 Subject: [PATCH 5/8] feat: enhance MedXpertQATextScenario metadata with display name and taxonomy details --- .../scenarios/medxpert_qa_text_scenario.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py index 0bdad444b68..83dcc9eb3f8 100644 --- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py +++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py @@ -95,11 +95,15 @@ def get_instances(self, output_path: str) -> List[Instance]: def get_metadata(self) -> ScenarioMetadata: return ScenarioMetadata( name=self.name, + display_name="MedXpertQA Text", description=self.description, - tags=self.tags, - taxonomy_info=TaxonomyInfo( - subjects=["medicine"], - objects=["question_answering"], - domains=["biomedical"], + taxonomy=TaxonomyInfo( + task="Question answering", + what="Answer expert-level medical questions across diverse specialties and body systems", + when="Any", + who="Medical professionals, Medical students", + language="English", ), - ) \ No newline at end of file + main_metric="exact_match", + main_split=TEST_SPLIT, + ) From 36811ac8c9fd27f3472396f069deff821646249c Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Tue, 19 May 2026 12:52:22 +0000 Subject: [PATCH 6/8] feat: update MedXpertQATextScenario name and add tiktoken dependency --- .gitignore | 1 + pyproject.toml | 1 + src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py | 2 +- uv.lock | 2 ++ 4 files changed, 5 
insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7e37430952e..78569780175 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # General Python stuff venv +**/**.sqlite __pycache__ *.egg-info .mypy_cache diff --git a/pyproject.toml b/pyproject.toml index 0bd78a98972..69fe520a6d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers "torch>=1.13.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) "torchvision>=0.14.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) + "tiktoken~=0.7", ] [project.optional-dependencies] diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py index 83dcc9eb3f8..cf18eaa4a0f 100644 --- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py +++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py @@ -44,7 +44,7 @@ class MedXpertQATextScenario(Scenario): HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA" - name = "medxpert_qa" + name = "medxpert_qa_text" description = ( "MedXpertQA Text is a text-only benchmark designed to evaluate expert-level medical knowledge, clinical reasoning," " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems." 
diff --git a/uv.lock b/uv.lock index 857dff48161..dfcad91467b 100644 --- a/uv.lock +++ b/uv.lock @@ -4166,6 +4166,7 @@ dependencies = [ { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-7-medhelm-human-evaluation' and extra == 'extra-7-medhelm-ibm')" }, { name = "spacy" }, { name = "sqlitedict" }, + { name = "tiktoken" }, { name = "torch" }, { name = "torchvision" }, { name = "tqdm" }, @@ -4750,6 +4751,7 @@ requires-dist = [ { name = "summ-eval", marker = "extra == 'summarization'", specifier = "~=0.892" }, { name = "surge-api", marker = "extra == 'human-evaluation'", specifier = "~=1.1" }, { name = "tensorflow", marker = "extra == 'heim'", specifier = "~=2.11" }, + { name = "tiktoken", specifier = "~=0.7" }, { name = "tiktoken", marker = "extra == 'openai'", specifier = "~=0.7" }, { name = "timm", marker = "extra == 'heim'", specifier = "~=0.6.12" }, { name = "together", marker = "extra == 'together'", specifier = ">=1.1" }, From 5477d6962dbd5d9e81e7b4c5b45d32b66c9e2d5e Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Thu, 21 May 2026 10:56:58 +0000 Subject: [PATCH 7/8] feat: update MedXpertQA descriptions and requirements for improved clarity and accuracy --- requirements.txt | 4 ++-- src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py | 4 ++-- src/helm/benchmark/static/schema_medhelm.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 24b694c4019..4629a8486f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -79,7 +79,7 @@ mypy-extensions==1.1.0 networkx==3.4.2 nltk==3.9.2 nodeenv==1.10.0 -numba +numba>=0.56 numpy==2.2.6 nvidia-cublas-cu12==12.8.4.1 nvidia-cuda-cupti-cu12==12.8.90 @@ -153,7 +153,7 @@ tabulate==0.9.0 thinc==8.3.10 threadpoolctl==3.6.0 tiktoken==0.12.0 -together +together==2.15.0 tokenizers==0.21.4 toml==0.10.2 tomli==2.4.0 diff --git 
a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py index cf18eaa4a0f..b6b0addc4f4 100644 --- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py +++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py @@ -46,7 +46,7 @@ class MedXpertQATextScenario(Scenario): name = "medxpert_qa_text" description = ( - "MedXpertQA Text is a text-only benchmark designed to evaluate expert-level medical knowledge, clinical reasoning," + "MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning," " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems." " It features rigorously curated and clinically relevant board-style questions, enhanced through expert review and data synthesis " "techniques to ensure high difficulty, reliability, and minimal data leakage." @@ -95,7 +95,7 @@ def get_instances(self, output_path: str) -> List[Instance]: def get_metadata(self) -> ScenarioMetadata: return ScenarioMetadata( name=self.name, - display_name="MedXpertQA Text", + display_name="MedXpertQA", description=self.description, taxonomy=TaxonomyInfo( task="Question answering", diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml index a5182e93d53..1b5119b1280 100644 --- a/src/helm/benchmark/static/schema_medhelm.yaml +++ b/src/helm/benchmark/static/schema_medhelm.yaml @@ -725,7 +725,7 @@ run_groups: - name: medxpert_qa_text display_name: MedXpert-QA-Text - description: MedXpert-QA-Text is a benchmark designed to evaluate a model's ability to answer medical questions based on unstructured clinical text. Each instance includes a clinical note and a related question, requiring the model to extract relevant information from the text to provide an accurate answer [(Zhang et al., 2024)](https://arxiv.org/abs/2406.12036). 
+ description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Note that only the Text subset is used [(Zuo et al., 2025)](https://arxiv.org/abs/2501.18362). metric_groups: - accuracy - efficiency From 9c755d06ce52c7a4004c6c3dd4cc54904b6d0d68 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Thu, 21 May 2026 13:56:52 +0000 Subject: [PATCH 8/8] feat: update display name for MedXpertQA Text scenario for consistency --- src/helm/benchmark/static/schema_medhelm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml index 1b5119b1280..adb3774b667 100644 --- a/src/helm/benchmark/static/schema_medhelm.yaml +++ b/src/helm/benchmark/static/schema_medhelm.yaml @@ -724,7 +724,7 @@ run_groups: language: English - name: medxpert_qa_text - display_name: MedXpert-QA-Text + display_name: MedXpertQA description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Note that only the Text subset is used [(Zuo et al., 2025)](https://arxiv.org/abs/2501.18362). metric_groups: - accuracy