PacificAI · chakravarthik27 · May 27, 2026 · May 15, 2026 · May 18, 2026 · May 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 # General Python stuff
 venv
+**/**.sqlite
 __pycache__
 *.egg-info
 .mypy_cache

diff --git a/pyproject.toml b/pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
     "sqlitedict>=2.1.0,<3.0",
     "bottle~=0.12.23",
     # Basic Scenarios
-    "datasets~=3.1",
+    "datasets>=3.1",
     "pyarrow>=11.0.0", # Pinned transitive dependency for datasets; workaround for #1026
     "pyarrow-hotfix~=0.6", # Hotfix for CVE-2023-47248
     # Basic metrics
@@ -60,6 +60,7 @@ dependencies = [
     # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
     "torch>=1.13.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
     "torchvision>=0.14.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
+    "tiktoken~=0.7",
 ]
 
 [project.optional-dependencies]
@@ -84,7 +85,7 @@ scenarios = [
 
 metrics = [
     "google-api-python-client~=2.64",  # For perspective_api_client via toxicity_metrics
-    "numba~=0.56",  # For copyright_metrics
+    "numba>=0.56",  # For copyright_metrics
     "sacrebleu~=2.2",  # For disinformation_metrics, machine_translation_metrics
     "langdetect~=1.0",  # For ifeval_metrics
     "immutabledict~=4.2",  # For ifeval_metrics
@@ -197,7 +198,7 @@ google = [
 ]
 
 together = [
-    "together~=1.1",
+    "together>=1.1",
 ]
 
 yandex = [
@@ -487,7 +488,7 @@ conflicts = [
 
 # openai-whisper build needs setuptools + wheel; it doesn't declare them (CI uses --no-build-isolation-package + venv with both)
 [tool.uv.extra-build-dependencies]
-openai-whisper = ["setuptools", "wheel"]
+openai-whisper = ["setuptools<82", "wheel"]
 
 [tool.setuptools]
 package-dir = { "" = "src" }

diff --git a/requirements.txt b/requirements.txt
@@ -27,7 +27,7 @@ cryptography==46.0.5
 cycler==0.12.1
 cymem==2.0.13
 dacite==1.9.2
-datasets==3.6.0
+datasets==4.8.5
 dill==0.3.8
 distlib==0.4.0
 distro==1.9.0
@@ -79,7 +79,7 @@ mypy-extensions==1.1.0
 networkx==3.4.2
 nltk==3.9.2
 nodeenv==1.10.0
-numba==0.63.1
+numba>=0.56
 numpy==2.2.6
 nvidia-cublas-cu12==12.8.4.1
 nvidia-cuda-cupti-cu12==12.8.90
@@ -153,7 +153,7 @@ tabulate==0.9.0
 thinc==8.3.10
 threadpoolctl==3.6.0
 tiktoken==0.12.0
-together==1.5.29
+together==2.15.0
 tokenizers==0.21.4
 toml==0.10.2
 tomli==2.4.0

diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -1732,3 +1732,32 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non
         metric_specs=metric_specs,
         groups=["health_bench_professional"],
     )
+
+
+@run_spec_function("medxpert_qa_text")
+def get_medxpert_qa_text_spec() -> RunSpec:
+    """Build the zero-shot run spec for the text-only MedXpertQA multiple-choice scenario.
+
+    The joint multiple-choice adapter is configured with max_train_instances=0 and max_tokens=1,
+    and results are scored with the metric specs from get_exact_match_metric_specs().
+    """
+    instructions = (
+        "You are a medical expert assistant. Answer the following medical question with single letter from options."
+    )
+    return RunSpec(
+        name="medxpert_qa_text",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario",
+            args={},
+        ),
+        adapter_spec=get_multiple_choice_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT,
+            instructions=instructions,
+            input_noun="Question",
+            output_noun="Answer",
+            max_tokens=1,
+            max_train_instances=0,
+        ),
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["medxpert_qa_text"],
+    )
diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -0,0 +1,109 @@
+import os
+from typing import List
+
+from datasets import DatasetDict, load_dataset
+
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.common.general import ensure_directory_exists
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+    ScenarioMetadata,
+)
+
+
+class MedXpertQATextScenario(Scenario):
+    """
+    Text-only subset of MedXpertQA, from "MedXpertQA: Benchmarking Expert-Level Medical Reasoning and
+    Understanding" (2025).
+
+    MedXpertQA is a highly challenging benchmark for evaluating expert-level medical knowledge, clinical
+    reasoning, and advanced problem-solving abilities in large language models. It contains 4,460 questions
+    spanning 17 medical specialties and 11 body systems, with a dedicated Text subset for text-only
+    evaluation and an MM subset for multimodal clinical reasoning. This scenario loads the Text subset.
+
+    The dataset consists of rigorously curated specialty board-style questions enriched with detailed
+    clinical contexts, patient records, and examination findings. Filtering, augmentation, and data
+    synthesis, together with multiple rounds of expert review, are applied to improve difficulty, reduce
+    data leakage risks, and ensure strong clinical relevance.
+
+    HuggingFace Dataset: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA
+    ArXiv Paper: https://arxiv.org/abs/2501.18362
+
+    @article{zuo2025medxpertqa,
+    title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding},
+    author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and
+            Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
+    journal={arXiv preprint arXiv:2501.18362},
+    year={2025}
+    }
+    """
+
+    HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA"
+
+    name = "medxpert_qa_text"
+    description = (
+        "MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, "
+        "and advanced problem-solving capabilities in large language models across diverse medical specialties "
+        "and body systems. It features rigorously curated and clinically relevant board-style questions, "
+        "enhanced through expert review and data synthesis techniques to ensure high difficulty, reliability, "
+        "and minimal data leakage."
+    )
+    tags = ["knowledge", "generation", "question_answering", "biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Load the "Text" subset and convert each example of its "test" split into an Instance.
+
+        Every entry of an example's "options" mapping becomes one Reference, and the entry whose key equals
+        the example's "label" is tagged as correct. The dataset is cached under <output_path>/data.
+        """
+        cache_dir: str = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset: DatasetDict = load_dataset(self.HF_DATASET_NAME, "Text", cache_dir=cache_dir)
+
+        # Only the dataset's "test" split is read; every instance is assigned to TEST_SPLIT.
+        instances: List[Instance] = []
+        for row in dataset["test"]:
+            question_text = row["question"]
+            correct_key = row["label"]
+
+            references: List[Reference] = []
+            for option_key, option_text in row["options"].items():
+                tags = [CORRECT_TAG] if option_key == correct_key else []
+                references.append(Reference(Output(text=option_text), tags=tags))
+
+            # Copy selected fields of the example through unchanged as extra_data.
+            extra_data = {key: row[key] for key in ("id", "medical_task", "body_system", "question_type")}
+            instances.append(
+                Instance(
+                    input=Input(text=question_text),
+                    references=references,
+                    split=TEST_SPLIT,
+                    extra_data=extra_data,
+                )
+            )
+        return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        """Return the display name, description, taxonomy, main metric and main split of this scenario."""
+        taxonomy = TaxonomyInfo(
+            task="Question answering",
+            what="Answer expert-level medical questions across diverse specialties and body systems",
+            when="Any",
+            who="Medical professionals, Medical students",
+            language="English",
+        )
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="MedXpertQA",
+            description=self.description,
+            taxonomy=taxonomy,
+            main_metric="exact_match",
+            main_split=TEST_SPLIT,
+        )
diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml
@@ -517,6 +517,7 @@ run_groups:
       - sct_bench
       - health_bench
       - health_bench_professional
+      - medxpert_qa_text
 
   - name: clinical_note_generation
     display_name: Clinical Note Generation
@@ -722,6 +723,23 @@ run_groups:
       when: Any
       language: English
 
+  - name: medxpert_qa_text
+    display_name: MedXpertQA
+    description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Only the Text subset is used [(MedXpertQA, 2025)](https://arxiv.org/abs/2501.18362).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: Question answering
+      what: Medical knowledge testing
+      who: Medical student, Researcher
+      when: Any
+      language: English
+
   - name: medbullets
     display_name: Medbullets
     description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).

diff --git a/uv.lock b/uv.lock