Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# General Python stuff
venv
**/**.sqlite
__pycache__
*.egg-info
.mypy_cache
Expand Down
9 changes: 5 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ dependencies = [
"sqlitedict>=2.1.0,<3.0",
"bottle~=0.12.23",
# Basic Scenarios
"datasets~=3.1",
"datasets>=3.1",
"pyarrow>=11.0.0", # Pinned transitive dependency for datasets; workaround for #1026
"pyarrow-hotfix~=0.6", # Hotfix for CVE-2023-47248
# Basic metrics
Expand All @@ -60,6 +60,7 @@ dependencies = [
# TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
"torch>=1.13.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
"torchvision>=0.14.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
"tiktoken~=0.7",
]

[project.optional-dependencies]
Expand All @@ -84,7 +85,7 @@ scenarios = [

metrics = [
"google-api-python-client~=2.64", # For perspective_api_client via toxicity_metrics
"numba~=0.56", # For copyright_metrics
"numba>=0.56", # For copyright_metrics
"sacrebleu~=2.2", # For disinformation_metrics, machine_translation_metrics
"langdetect~=1.0", # For ifeval_metrics
"immutabledict~=4.2", # For ifeval_metrics
Expand Down Expand Up @@ -197,7 +198,7 @@ google = [
]

together = [
"together~=1.1",
"together>=1.1",
]

yandex = [
Expand Down Expand Up @@ -487,7 +488,7 @@ conflicts = [

# openai-whisper build needs setuptools + wheel; it doesn't declare them (CI uses --no-build-isolation-package + venv with both)
[tool.uv.extra-build-dependencies]
openai-whisper = ["setuptools", "wheel"]
openai-whisper = ["setuptools<82", "wheel"]

[tool.setuptools]
package-dir = { "" = "src" }
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cryptography==46.0.5
cycler==0.12.1
cymem==2.0.13
dacite==1.9.2
datasets==3.6.0
datasets==4.8.5
dill==0.3.8
distlib==0.4.0
distro==1.9.0
Expand Down Expand Up @@ -79,7 +79,7 @@ mypy-extensions==1.1.0
networkx==3.4.2
nltk==3.9.2
nodeenv==1.10.0
numba==0.63.1
numba>=0.56
numpy==2.2.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
Expand Down Expand Up @@ -153,7 +153,7 @@ tabulate==0.9.0
thinc==8.3.10
threadpoolctl==3.6.0
tiktoken==0.12.0
together==1.5.29
together==2.15.0
tokenizers==0.21.4
toml==0.10.2
tomli==2.4.0
Expand Down
29 changes: 29 additions & 0 deletions src/helm/benchmark/run_specs/medhelm_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1732,3 +1732,32 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non
metric_specs=metric_specs,
groups=["health_bench_professional"],
)


@run_spec_function("medxpert_qa_text")
def get_medxpert_qa_text_spec() -> RunSpec:
    """Build the run spec registered under the name ``medxpert_qa_text``.

    The spec combines:

    * the ``MedXpertQATextScenario`` scenario class, constructed without arguments;
    * a joint multiple-choice adapter with ``max_train_instances=0`` and
      ``max_tokens=1``, using "Question" / "Answer" as the input / output nouns;
    * the metric specs returned by ``get_exact_match_metric_specs()``.

    Returns:
        A ``RunSpec`` named ``medxpert_qa_text`` that belongs to the
        ``medxpert_qa_text`` group.
    """
    return RunSpec(
        name="medxpert_qa_text",
        scenario_spec=ScenarioSpec(
            class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario",
            args={},
        ),
        adapter_spec=get_multiple_choice_adapter_spec(
            method=ADAPT_MULTIPLE_CHOICE_JOINT,
            instructions=(
                "You are a medical expert assistant. Answer the following medical question with single letter from options."
            ),
            input_noun="Question",
            output_noun="Answer",
            max_tokens=1,
            max_train_instances=0,
        ),
        metric_specs=get_exact_match_metric_specs(),
        groups=["medxpert_qa_text"],
    )
109 changes: 109 additions & 0 deletions src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import os
from typing import List

from datasets import DatasetDict, load_dataset

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.common.general import ensure_directory_exists

from helm.benchmark.scenarios.scenario import (
CORRECT_TAG,
TEST_SPLIT,
Input,
Instance,
Output,
Reference,
Scenario,
ScenarioMetadata,
)


class MedXpertQATextScenario(Scenario):
    """
    Multiple-choice scenario backed by the "Text" configuration of MedXpertQA.

    MedXpertQA comes from "MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding" (2025).
    It is a highly challenging benchmark for evaluating expert-level medical knowledge, clinical reasoning,
    and advanced problem-solving abilities in large language models. The benchmark contains 4,460 questions
    spanning 17 medical specialties and 11 body systems, with a dedicated Text subset for text-only
    evaluation and an MM subset for multimodal clinical reasoning.

    The questions are rigorously curated, specialty board-style items enriched with detailed clinical
    contexts, patient records, and examination findings. Filtering, augmentation, and data synthesis are
    applied to raise difficulty and reduce data-leakage risk, and multiple rounds of expert review ensure
    strong clinical relevance.

    This scenario loads only the "Text" configuration and only its "test" split. Each dataset row becomes
    one instance: the row's question is the input, every entry of the row's options mapping becomes a
    reference, and the reference whose option letter equals the row's label is tagged as correct.

    HuggingFace Dataset: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA
    ArXiv Paper: https://arxiv.org/abs/2501.18362

    @article{zuo2025medxpertqa,
        title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding},
        author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
        journal={arXiv preprint arXiv:2501.18362},
        year={2025}
    }
    """

    # Hugging Face Hub identifier passed to load_dataset.
    HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA"

    name = "medxpert_qa_text"
    description = (
        "MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning,"
        " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems."
        " It features rigorously curated and clinically relevant board-style questions, enhanced through expert review and data synthesis "
        "techniques to ensure high difficulty, reliability, and minimal data leakage."
    )
    tags = ["knowledge", "generation", "question_answering", "biomedical"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the "Text" configuration and convert its "test" split into instances.

        Args:
            output_path: Scenario output directory. A "data" subdirectory is created
                inside it and used as the Hugging Face cache directory.

        Returns:
            One ``Instance`` per dataset row, all assigned to ``TEST_SPLIT``. The row's
            id, medical task, body system, and question type are carried in ``extra_data``.
        """
        cache_dir: str = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        dataset: DatasetDict = load_dataset(self.HF_DATASET_NAME, "Text", cache_dir=cache_dir)

        # HELM split -> upstream split names. Only the upstream "test" split is read.
        helm_to_hf_splits = {TEST_SPLIT: ["test"]}
        # Row fields copied verbatim into each instance's extra_data.
        extra_fields = ("id", "medical_task", "body_system", "question_type")

        instances: List[Instance] = []
        for helm_split, hf_split_names in helm_to_hf_splits.items():
            for hf_split_name in hf_split_names:
                for row in dataset[hf_split_name]:
                    prompt = row["question"]
                    correct_letter = row["label"]

                    # The options mapping is keyed by option letter; the letter that
                    # equals the label marks the correct reference.
                    references = [
                        Reference(
                            Output(text=option_text),
                            tags=[CORRECT_TAG] if letter == correct_letter else [],
                        )
                        for letter, option_text in row["options"].items()
                    ]

                    instances.append(
                        Instance(
                            input=Input(text=prompt),
                            references=references,
                            split=helm_split,
                            extra_data={field: row[field] for field in extra_fields},
                        )
                    )
        return instances

    def get_metadata(self) -> ScenarioMetadata:
        """Return the display metadata for this scenario.

        Returns:
            A ``ScenarioMetadata`` that reuses the class-level ``name`` and ``description``
            and declares ``exact_match`` on ``TEST_SPLIT`` as the main metric and split.
        """
        taxonomy = TaxonomyInfo(
            task="Question answering",
            what="Answer expert-level medical questions across diverse specialties and body systems",
            when="Any",
            who="Medical professionals, Medical students",
            language="English",
        )
        return ScenarioMetadata(
            name=self.name,
            display_name="MedXpertQA",
            description=self.description,
            taxonomy=taxonomy,
            main_metric="exact_match",
            main_split=TEST_SPLIT,
        )
18 changes: 18 additions & 0 deletions src/helm/benchmark/static/schema_medhelm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,7 @@ run_groups:
- sct_bench
- health_bench
- health_bench_professional
- medxpert_qa_text

- name: clinical_note_generation
display_name: Clinical Note Generation
Expand Down Expand Up @@ -722,6 +723,23 @@ run_groups:
when: Any
language: English

- name: medxpert_qa_text
display_name: MedXpertQA
description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Only the text-only (Text) subset is used [(MedXpertQA, 2025)](https://arxiv.org/abs/2501.18362).
metric_groups:
- accuracy
- efficiency
- general_information
environment:
main_name: exact_match
main_split: test
taxonomy:
task: Question answering
what: Medical knowledge testing
who: Medical student, Researcher
when: Any
language: English

- name: medbullets
display_name: Medbullets
description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).
Expand Down
9 changes: 6 additions & 3 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading