diff --git a/.gitignore b/.gitignore
index 7e37430952..7856978017 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # General Python stuff
 venv
+**/**.sqlite
 __pycache__
 *.egg-info
 .mypy_cache
diff --git a/pyproject.toml b/pyproject.toml
index d6fda049d7..69fe520a6d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
     "sqlitedict>=2.1.0,<3.0",
     "bottle~=0.12.23",
     # Basic Scenarios
-    "datasets~=3.1",
+    "datasets>=3.1",
     "pyarrow>=11.0.0",  # Pinned transitive dependency for datasets; workaround for #1026
     "pyarrow-hotfix~=0.6",  # Hotfix for CVE-2023-47248
     # Basic metrics
@@ -60,6 +60,7 @@ dependencies = [
     # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
     "torch>=1.13.1,<3.0.0",  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
     "torchvision>=0.14.1,<3.0.0",  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
+    "tiktoken~=0.7",
 ]
 
 [project.optional-dependencies]
@@ -84,7 +85,7 @@ scenarios = [
 
 metrics = [
     "google-api-python-client~=2.64",  # For perspective_api_client via toxicity_metrics
-    "numba~=0.56",  # For copyright_metrics
+    "numba>=0.56",  # For copyright_metrics
     "sacrebleu~=2.2",  # For disinformation_metrics, machine_translation_metrics
     "langdetect~=1.0",  # For ifeval_metrics
     "immutabledict~=4.2",  # For ifeval_metrics
@@ -197,7 +198,7 @@ google = [
 ]
 
 together = [
-    "together~=1.1",
+    "together>=1.1",
 ]
 
 yandex = [
@@ -487,7 +488,7 @@ conflicts = [
 
 # openai-whisper build needs setuptools + wheel; it doesn't declare them (CI uses --no-build-isolation-package + venv with both)
 [tool.uv.extra-build-dependencies]
-openai-whisper = ["setuptools", "wheel"]
+openai-whisper = ["setuptools<82", "wheel"]
 
 [tool.setuptools]
 package-dir = { "" = "src" }
diff --git a/requirements.txt b/requirements.txt
index ae9d4a7079..4629a8486f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,7 +27,7 @@ cryptography==46.0.5
 cycler==0.12.1
 cymem==2.0.13
 dacite==1.9.2
-datasets==3.6.0
+datasets==4.8.5
 dill==0.3.8
 distlib==0.4.0
 distro==1.9.0
@@ -153,7 +153,7 @@ tabulate==0.9.0
 thinc==8.3.10
 threadpoolctl==3.6.0
 tiktoken==0.12.0
-together==1.5.29
+together==2.15.0
 tokenizers==0.21.4
 toml==0.10.2
 tomli==2.4.0
diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
index 31daa32159..922d2e207c 100644
--- a/src/helm/benchmark/run_specs/medhelm_run_specs.py
+++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -1732,3 +1732,33 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non
         metric_specs=metric_specs,
         groups=["health_bench_professional"],
     )
+
+
+@run_spec_function("medxpert_qa_text")
+def get_medxpert_qa_text_spec() -> RunSpec:
+    """Build the zero-shot, joint multiple-choice run spec for MedXpertQA (Text), scored by exact match."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario",
+        args={},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "You are a medical expert assistant. Answer the following medical question with single letter from options."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+        max_tokens=1,
+        max_train_instances=0,
+    )
+
+    metric_specs = get_exact_match_metric_specs()
+
+    return RunSpec(
+        name="medxpert_qa_text",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["medxpert_qa_text"],
+    )
diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
new file mode 100644
index 0000000000..b6b0addc4f
--- /dev/null
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -0,0 +1,116 @@
+import os
+from typing import List
+
+from datasets import DatasetDict, load_dataset
+
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.common.general import ensure_directory_exists
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+    ScenarioMetadata,
+)
+
+
+class MedXpertQATextScenario(Scenario):
+    """
+    From "MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding" (2025),
+    MedXpertQA is a highly challenging benchmark designed to evaluate expert-level medical knowledge,
+    clinical reasoning, and advanced problem-solving abilities in large language models.
+    The benchmark contains 4,460 questions spanning 17 medical specialties and 11 body systems,
+    with a dedicated Text subset for text-only evaluation and an MM subset for multimodal clinical reasoning.
+
+    The dataset includes rigorously curated specialty board-style questions enriched with detailed clinical contexts,
+    patient records, and examination findings. MedXpertQA applies filtering, augmentation, and data synthesis
+    techniques to improve difficulty, reduce data leakage risks, and ensure strong clinical relevance through
+    multiple rounds of expert review.
+
+    HuggingFace Dataset: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA
+    ArXiv Paper: https://arxiv.org/abs/2501.18362
+
+    @article{zuo2025medxpertqa,
+        title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding},
+        author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
+        journal={arXiv preprint arXiv:2501.18362},
+        year={2025}
+    }
+    """
+
+    HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA"
+
+    name = "medxpert_qa_text"
+    description = (
+        "MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning,"
+        " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems."
+        " It features rigorously curated and clinically relevant board-style questions, enhanced through expert review and data synthesis "
+        "techniques to ensure high difficulty, reliability, and minimal data leakage."
+    )
+    tags = ["knowledge", "generation", "question_answering", "biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Load the MedXpertQA "Text" config and turn its test split into HELM instances.
+
+        `<output_path>/data` is passed to `load_dataset` as the cache directory. Each entry of an
+        example's `options` (assumed to be a mapping of option key to option text) becomes a
+        `Reference`; the entry whose key equals the example's `label` is tagged `CORRECT_TAG`.
+        """
+        data_path: str = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+        dataset: DatasetDict = load_dataset(self.HF_DATASET_NAME, "Text", cache_dir=data_path)
+
+        # Only the dataset's "test" split is used; it is mapped to HELM's TEST_SPLIT.
+        splits = {TEST_SPLIT: ["test"]}
+        instances: List[Instance] = []
+        for (
+            helm_split_name,
+            dataset_splits_name,
+        ) in splits.items():  # Iterate over the splits
+            for dataset_split_name in dataset_splits_name:
+                split_data = dataset[dataset_split_name]
+
+                for example in split_data:
+                    question = example["question"]
+                    answer = example["label"]
+
+                    instance = Instance(
+                        input=Input(text=question),
+                        references=[
+                            Reference(
+                                Output(text=option),
+                                tags=[CORRECT_TAG] if alpha == answer else [],
+                            )
+                            for alpha, option in example["options"].items()
+                        ],
+                        split=helm_split_name,
+                        extra_data={
+                            "id": example["id"],
+                            "medical_task": example["medical_task"],
+                            "body_system": example["body_system"],
+                            "question_type": example["question_type"],
+                        },
+                    )
+                    instances.append(instance)
+        return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        """Return the scenario's display metadata: taxonomy, main metric and main split."""
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="MedXpertQA",
+            description=self.description,
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer expert-level medical questions across diverse specialties and body systems",
+                when="Any",
+                who="Medical professionals, Medical students",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split=TEST_SPLIT,
+        )
diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml
index 34e5dbdc99..adb3774b66 100644
--- a/src/helm/benchmark/static/schema_medhelm.yaml
+++ b/src/helm/benchmark/static/schema_medhelm.yaml
@@ -517,6 +517,7 @@ run_groups:
       - sct_bench
       - health_bench
       - health_bench_professional
+      - medxpert_qa_text
 
   - name: clinical_note_generation
     display_name: Clinical Note Generation
@@ -722,6 +723,23 @@ run_groups:
       when: Any
       language: English
 
+  - name: medxpert_qa_text
+    display_name: MedXpertQA
+    description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Note:- Only text split is used [(Medxpertqa)](https://arxiv.org/abs/2501.18362).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: Question answering
+      what: Medical knowledge testing
+      who: Medical student, Researcher
+      when: Any
+      language: English
+
   - name: medbullets
     display_name: Medbullets
     description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).
diff --git a/uv.lock b/uv.lock
index 620636cfa4..dfcad91467 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4166,6 +4166,7 @@ dependencies = [
     { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-7-medhelm-human-evaluation' and extra == 'extra-7-medhelm-ibm')" },
     { name = "spacy" },
     { name = "sqlitedict" },
+    { name = "tiktoken" },
     { name = "torch" },
     { name = "torchvision" },
     { name = "tqdm" },
@@ -4234,6 +4235,7 @@ all = [
     { name = "opencc" },
     { name = "opencv-python" },
     { name = "opencv-python-headless" },
+    { name = "openpyxl" },
     { name = "pdf2image" },
     { name = "pillow" },
     { name = "pycocoevalcap" },
@@ -4587,7 +4589,7 @@ requires-dist = [
     { name = "colorcet", marker = "extra == 'plots'", specifier = "~=3.0" },
     { name = "colorlog", specifier = "~=6.9" },
     { name = "dacite", specifier = "~=1.6" },
-    { name = "datasets", specifier = "~=3.1" },
+    { name = "datasets", specifier = ">=3.1" },
     { name = "diffusers", marker = "extra == 'heim'", specifier = "~=0.34.0" },
     { name = "dspy", marker = "extra == 'dspy'", specifier = "~=3.0" },
     { name = "einops", marker = "extra == 'audiolm'", specifier = "~=0.7.0" },
@@ -4687,7 +4689,7 @@ requires-dist = [
     { name = "nltk", specifier = "~=3.7,!=3.9.0" },
     { name = "nltk", marker = "extra == 'summarization'", specifier = "~=3.7,!=3.9.0" },
     { name = "nudenet", marker = "extra == 'heim'", specifier = "~=2.0" },
-    { name = "numba", marker = "extra == 'metrics'", specifier = "~=0.56" },
+    { name = "numba", marker = "extra == 'metrics'", specifier = ">=0.56" },
     { name = "numpy", specifier = ">=1.26,<3" },
     { name = "numpy", marker = "extra == 'heim'", specifier = ">=1.26" },
     { name = "omegaconf", marker = "extra == 'heim'", specifier = "~=2.3" },
@@ -4749,9 +4751,10 @@ requires-dist = [
     { name = "summ-eval", marker = "extra == 'summarization'", specifier = "~=0.892" },
     { name = "surge-api", marker = "extra == 'human-evaluation'", specifier = "~=1.1" },
     { name = "tensorflow", marker = "extra == 'heim'", specifier = "~=2.11" },
+    { name = "tiktoken", specifier = "~=0.7" },
     { name = "tiktoken", marker = "extra == 'openai'", specifier = "~=0.7" },
     { name = "timm", marker = "extra == 'heim'", specifier = "~=0.6.12" },
-    { name = "together", marker = "extra == 'together'", specifier = "~=1.1" },
+    { name = "together", marker = "extra == 'together'", specifier = ">=1.1" },
     { name = "tokenizers", marker = "extra == 'aleph-alpha'", specifier = ">=0.13.3" },
     { name = "torch", specifier = ">=1.13.1,<3.0.0" },
     { name = "torch", marker = "extra == 'vlm'", specifier = "~=2.1" },