From b7b22136b310804e7b0927e07d73c270f1749308 Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Fri, 15 May 2026 14:52:59 +0000
Subject: [PATCH 1/8] feat: implement MedXpert-QA-Text benchmark scenario and
 update dependencies

---
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 .../benchmark/run_specs/medhelm_run_specs.py  | 28 +++++++
 .../scenarios/medxpert_qa_text_scenario.py    | 82 +++++++++++++++++++
 src/helm/benchmark/static/schema_medhelm.yaml | 18 ++++
 5 files changed, 130 insertions(+), 2 deletions(-)
 create mode 100644 src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py

diff --git a/pyproject.toml b/pyproject.toml
index d6fda049d7b..d2930373a91 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
     "sqlitedict>=2.1.0,<3.0",
     "bottle~=0.12.23",
     # Basic Scenarios
-    "datasets~=3.1",
+    "datasets>=3.1",
     "pyarrow>=11.0.0", # Pinned transitive dependency for datasets; workaround for #1026
     "pyarrow-hotfix~=0.6", # Hotfix for CVE-2023-47248
     # Basic metrics
diff --git a/requirements.txt b/requirements.txt
index ae9d4a70793..defd7e93c57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,7 +27,7 @@ cryptography==46.0.5
 cycler==0.12.1
 cymem==2.0.13
 dacite==1.9.2
-datasets==3.6.0
+datasets==4.8.5
 dill==0.3.8
 distlib==0.4.0
 distro==1.9.0
diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
index 31daa321592..2841905057a 100644
--- a/src/helm/benchmark/run_specs/medhelm_run_specs.py
+++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -1732,3 +1732,31 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non
         metric_specs=metric_specs,
         groups=["health_bench_professional"],
     )
+
+@run_spec_function("medxpert_qa_text")
+def get_medxpert_qa_text_spec(jury_config_path: Optional[str] = None) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario",
+        args={},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "You are a medical expert assistant. Answer the following medical question with single letter from options."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+        max_tokens=1,
+        max_train_instances=0,
+    )
+
+    metric_specs = get_exact_match_metric_specs()
+
+    return RunSpec(
+        name="medxpert_qa_text",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["medxpert_qa_text"],
+    )
\ No newline at end of file
diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
new file mode 100644
index 00000000000..8c3baeb76c2
--- /dev/null
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -0,0 +1,82 @@
+import os
+from typing import List
+import pandas as pd
+
+from datasets import DatasetDict, load_dataset
+
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+    ScenarioMetadata,
+)
+
+class MedXpertQATextScenario(Scenario):
+    """
+    The MedXpertQA dataset introduced in the MedXpert paper by Li et al:
+    @article{zuo2025medxpertqa,
+    title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding},
+    author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
+    journal={arXiv preprint arXiv:2501.18362},
+    year={2025}
+    }
+    """
+
+    HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA"
+
+    name = "medxpert_qa"
+    description = (
+        "MedXpertQA is a benchmark designed to evaluate the medical reasoning and understanding capabilities of"
+        "language models. Each instance in the dataset consists of a medical question and its corresponding "
+        "expert-level answer. The benchmark assesses a model's ability to comprehend complex medical information, reason through clinical scenarios, and provide accurate and informative responses that align with expert knowledge in the field of medicine."
+    )
+    tags = ["knowledge", "generation", "question_answering", "biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+        dataset: DatasetDict = load_dataset(
+            self.HF_DATASET_NAME,
+            "Text",
+            cache_dir=data_path)
+        
+        # split the dataset into train, validation, and test splits
+        splits = {TEST_SPLIT: ["test"]}
+        instances: List[Instance] = []
+        for (
+            helm_split_name,
+            dataset_splits_name,
+        ) in splits.items():  # Iterate over the splits
+            for dataset_split_name in dataset_splits_name:
+                split_data = dataset[dataset_split_name]
+
+                for example in split_data:
+                    question = example["question"]
+                    answer = example["label"]
+
+                    instance = Instance(
+                        input=Input(text=question),
+                        references=[
+                            Reference(
+                                Output(text=option),
+                                tags=[CORRECT_TAG] if alpha == answer else [],
+                            )
+                            for alpha, option in example['options'].items()
+                        ],
+                        split=helm_split_name,
+                        extra_data={
+                            "id": example["id"],
+                            "medical_task": example["medical_task"],
+                            "body_system": example["body_system"],
+                            "question_type": example["question_type"],
+                        },
+                    )
+                    instances.append(instance)
+        return instances
\ No newline at end of file
diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml
index 34e5dbdc99c..a5182e93d53 100644
--- a/src/helm/benchmark/static/schema_medhelm.yaml
+++ b/src/helm/benchmark/static/schema_medhelm.yaml
@@ -517,6 +517,7 @@ run_groups:
       - sct_bench
       - health_bench
       - health_bench_professional
+      - medxpert_qa_text
 
   - name: clinical_note_generation
     display_name: Clinical Note Generation
@@ -722,6 +723,23 @@ run_groups:
       when: Any
       language: English
 
+  - name: medxpert_qa_text
+    display_name: MedXpert-QA-Text
+    description: MedXpert-QA-Text is a benchmark designed to evaluate a model's ability to answer medical questions based on unstructured clinical text. Each instance includes a clinical note and a related question, requiring the model to extract relevant information from the text to provide an accurate answer [(Zhang et al., 2024)](https://arxiv.org/abs/2406.12036).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: Question answering
+      what: Medical knowledge testing
+      who: Medical student, Researcher
+      when: Any
+      language: English
+
   - name: medbullets
     display_name: Medbullets
     description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).

From 530ad78a7730f6cdb296c89b34b8195931a5012d Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Mon, 18 May 2026 13:36:04 +0000
Subject: [PATCH 2/8] fix: update package specifications for numba and
 together; add openpyxl to dependencies

---
 pyproject.toml   | 6 +++---
 requirements.txt | 4 ++--
 uv.lock          | 7 ++++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d2930373a91..0bd78a98972 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,7 +84,7 @@ scenarios = [
 
 metrics = [
     "google-api-python-client~=2.64",  # For perspective_api_client via toxicity_metrics
-    "numba~=0.56",  # For copyright_metrics
+    "numba>=0.56",  # For copyright_metrics
     "sacrebleu~=2.2",  # For disinformation_metrics, machine_translation_metrics
     "langdetect~=1.0",  # For ifeval_metrics
     "immutabledict~=4.2",  # For ifeval_metrics
@@ -197,7 +197,7 @@ google = [
 ]
 
 together = [
-    "together~=1.1",
+    "together>=1.1",
 ]
 
 yandex = [
@@ -487,7 +487,7 @@ conflicts = [
 
 # openai-whisper build needs setuptools + wheel; it doesn't declare them (CI uses --no-build-isolation-package + venv with both)
 [tool.uv.extra-build-dependencies]
-openai-whisper = ["setuptools", "wheel"]
+openai-whisper = ["setuptools<82", "wheel"]
 
 [tool.setuptools]
 package-dir = { "" = "src" }
diff --git a/requirements.txt b/requirements.txt
index defd7e93c57..24b694c4019 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -79,7 +79,7 @@ mypy-extensions==1.1.0
 networkx==3.4.2
 nltk==3.9.2
 nodeenv==1.10.0
-numba==0.63.1
+numba
 numpy==2.2.6
 nvidia-cublas-cu12==12.8.4.1
 nvidia-cuda-cupti-cu12==12.8.90
@@ -153,7 +153,7 @@ tabulate==0.9.0
 thinc==8.3.10
 threadpoolctl==3.6.0
 tiktoken==0.12.0
-together==1.5.29
+together
 tokenizers==0.21.4
 toml==0.10.2
 tomli==2.4.0
diff --git a/uv.lock b/uv.lock
index 620636cfa4b..857dff48161 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4234,6 +4234,7 @@ all = [
     { name = "opencc" },
     { name = "opencv-python" },
     { name = "opencv-python-headless" },
+    { name = "openpyxl" },
     { name = "pdf2image" },
     { name = "pillow" },
     { name = "pycocoevalcap" },
@@ -4587,7 +4588,7 @@ requires-dist = [
     { name = "colorcet", marker = "extra == 'plots'", specifier = "~=3.0" },
     { name = "colorlog", specifier = "~=6.9" },
     { name = "dacite", specifier = "~=1.6" },
-    { name = "datasets", specifier = "~=3.1" },
+    { name = "datasets", specifier = ">=3.1" },
     { name = "diffusers", marker = "extra == 'heim'", specifier = "~=0.34.0" },
     { name = "dspy", marker = "extra == 'dspy'", specifier = "~=3.0" },
     { name = "einops", marker = "extra == 'audiolm'", specifier = "~=0.7.0" },
@@ -4687,7 +4688,7 @@ requires-dist = [
     { name = "nltk", specifier = "~=3.7,!=3.9.0" },
     { name = "nltk", marker = "extra == 'summarization'", specifier = "~=3.7,!=3.9.0" },
     { name = "nudenet", marker = "extra == 'heim'", specifier = "~=2.0" },
-    { name = "numba", marker = "extra == 'metrics'", specifier = "~=0.56" },
+    { name = "numba", marker = "extra == 'metrics'", specifier = ">=0.56" },
     { name = "numpy", specifier = ">=1.26,<3" },
     { name = "numpy", marker = "extra == 'heim'", specifier = ">=1.26" },
     { name = "omegaconf", marker = "extra == 'heim'", specifier = "~=2.3" },
@@ -4751,7 +4752,7 @@ requires-dist = [
     { name = "tensorflow", marker = "extra == 'heim'", specifier = "~=2.11" },
     { name = "tiktoken", marker = "extra == 'openai'", specifier = "~=0.7" },
     { name = "timm", marker = "extra == 'heim'", specifier = "~=0.6.12" },
-    { name = "together", marker = "extra == 'together'", specifier = "~=1.1" },
+    { name = "together", marker = "extra == 'together'", specifier = ">=1.1" },
     { name = "tokenizers", marker = "extra == 'aleph-alpha'", specifier = ">=0.13.3" },
     { name = "torch", specifier = ">=1.13.1,<3.0.0" },
     { name = "torch", marker = "extra == 'vlm'", specifier = "~=2.1" },

From 63cec538ccbc23bd1a3ff78475b3f4bd55618ce5 Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Mon, 18 May 2026 13:36:20 +0000
Subject: [PATCH 3/8] feat: implement MedXpertQA Text scenario with detailed
 documentation and metadata

---
 .../benchmark/run_specs/medhelm_run_specs.py  |  5 +-
 .../scenarios/medxpert_qa_text_scenario.py    | 47 ++++++++++++++-----
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
index 2841905057a..922d2e207c1 100644
--- a/src/helm/benchmark/run_specs/medhelm_run_specs.py
+++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -1733,8 +1733,9 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non
         groups=["health_bench_professional"],
     )
 
+
 @run_spec_function("medxpert_qa_text")
-def get_medxpert_qa_text_spec(jury_config_path: Optional[str] = None) -> RunSpec:
+def get_medxpert_qa_text_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario",
         args={},
@@ -1759,4 +1760,4 @@ def get_medxpert_qa_text_spec(jury_config_path: Optional[str] = None) -> RunSpec
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
         groups=["medxpert_qa_text"],
-    )
\ No newline at end of file
+    )
diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
index 8c3baeb76c2..50eda469002 100644
--- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -1,6 +1,5 @@
 import os
 from typing import List
-import pandas as pd
 
 from datasets import DatasetDict, load_dataset
 
@@ -18,9 +17,23 @@
     ScenarioMetadata,
 )
 
+
 class MedXpertQATextScenario(Scenario):
     """
-    The MedXpertQA dataset introduced in the MedXpert paper by Li et al:
+    From "MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding" (2025),
+    MedXpertQA is a highly challenging benchmark designed to evaluate expert-level medical knowledge,
+    clinical reasoning, and advanced problem-solving abilities in large language models.
+    The benchmark contains 4,460 questions spanning 17 medical specialties and 11 body systems,
+    with a dedicated Text subset for text-only evaluation and an MM subset for multimodal clinical reasoning.
+
+    The dataset includes rigorously curated specialty board-style questions enriched with detailed clinical contexts,
+    patient records, and examination findings. MedXpertQA applies filtering, augmentation, and data synthesis
+    techniques to improve difficulty, reduce data leakage risks, and ensure strong clinical relevance through
+    multiple rounds of expert review.
+
+    HuggingFace Dataset: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA
+    ArXiv Paper: https://arxiv.org/abs/2501.18362
+
     @article{zuo2025medxpertqa,
     title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding},
     author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
@@ -33,20 +46,18 @@ class MedXpertQATextScenario(Scenario):
 
     name = "medxpert_qa"
     description = (
-        "MedXpertQA is a benchmark designed to evaluate the medical reasoning and understanding capabilities of"
-        "language models. Each instance in the dataset consists of a medical question and its corresponding "
-        "expert-level answer. The benchmark assesses a model's ability to comprehend complex medical information, reason through clinical scenarios, and provide accurate and informative responses that align with expert knowledge in the field of medicine."
+        "MedXpertQA Text is a text-only benchmark designed to evaluate expert-level medical knowledge, clinical reasoning,"
+        " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems."
+        " It features rigorously curated and clinically relevant board-style questions, enhanced through expert review and data synthesis "
+        "techniques to ensure high difficulty, reliability, and minimal data leakage."
     )
     tags = ["knowledge", "generation", "question_answering", "biomedical"]
 
     def get_instances(self, output_path: str) -> List[Instance]:
         data_path: str = os.path.join(output_path, "data")
         ensure_directory_exists(data_path)
-        dataset: DatasetDict = load_dataset(
-            self.HF_DATASET_NAME,
-            "Text",
-            cache_dir=data_path)
-        
+        dataset: DatasetDict = load_dataset(self.HF_DATASET_NAME, "Text", cache_dir=data_path)
+
         # split the dataset into train, validation, and test splits
         splits = {TEST_SPLIT: ["test"]}
         instances: List[Instance] = []
@@ -68,7 +79,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
                                 Output(text=option),
                                 tags=[CORRECT_TAG] if alpha == answer else [],
                             )
-                            for alpha, option in example['options'].items()
+                            for alpha, option in example["options"].items()
                         ],
                         split=helm_split_name,
                         extra_data={
@@ -79,4 +90,16 @@ def get_instances(self, output_path: str) -> List[Instance]:
                         },
                     )
                     instances.append(instance)
-        return instances
\ No newline at end of file
+        return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            description=self.description,
+            tags=self.tags,
+            taxonomy_info=TaxonomyInfo(
+                subjects=["medicine"],
+                objects=["question_answering"],
+                domains=["biomedical"],
+            ),
+        )
\ No newline at end of file

From e6df289570b27267c59b18a2677d6dcfc42db7a7 Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Mon, 18 May 2026 13:36:49 +0000
Subject: [PATCH 4/8] fix: remove unused import for ensure_file_downloaded in
 MedXpertQATextScenario

---
 src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
index 50eda469002..0bdad444b68 100644
--- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -4,7 +4,7 @@
 from datasets import DatasetDict, load_dataset
 
 from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
-from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.common.general import ensure_directory_exists
 
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,

From 781d2a697d5e2bc63e5b2653cf425515e89a85a8 Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Mon, 18 May 2026 13:40:03 +0000
Subject: [PATCH 5/8] feat: enhance MedXpertQATextScenario metadata with
 display name and taxonomy details

---
 .../scenarios/medxpert_qa_text_scenario.py       | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
index 0bdad444b68..83dcc9eb3f8 100644
--- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -95,11 +95,15 @@ def get_instances(self, output_path: str) -> List[Instance]:
     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
             name=self.name,
+            display_name="MedXpertQA Text",
             description=self.description,
-            tags=self.tags,
-            taxonomy_info=TaxonomyInfo(
-                subjects=["medicine"],
-                objects=["question_answering"],
-                domains=["biomedical"],
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer expert-level medical questions across diverse specialties and body systems",
+                when="Any",
+                who="Medical professionals, Medical students",
+                language="English",
             ),
-        )
\ No newline at end of file
+            main_metric="exact_match",
+            main_split=TEST_SPLIT,
+        )

From 36811ac8c9fd27f3472396f069deff821646249c Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Tue, 19 May 2026 12:52:22 +0000
Subject: [PATCH 6/8] feat: update MedXpertQATextScenario name and add tiktoken
 dependency

---
 .gitignore                                                | 1 +
 pyproject.toml                                            | 1 +
 src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py | 2 +-
 uv.lock                                                   | 2 ++
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 7e37430952e..78569780175 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # General Python stuff
 venv
+**/**.sqlite
 __pycache__
 *.egg-info
 .mypy_cache
diff --git a/pyproject.toml b/pyproject.toml
index 0bd78a98972..69fe520a6d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,7 @@ dependencies = [
     # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
     "torch>=1.13.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
     "torchvision>=0.14.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
+    "tiktoken~=0.7",
 ]
 
 [project.optional-dependencies]
diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
index 83dcc9eb3f8..cf18eaa4a0f 100644
--- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -44,7 +44,7 @@ class MedXpertQATextScenario(Scenario):
 
     HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA"
 
-    name = "medxpert_qa"
+    name = "medxpert_qa_text"
     description = (
         "MedXpertQA Text is a text-only benchmark designed to evaluate expert-level medical knowledge, clinical reasoning,"
         " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems."
diff --git a/uv.lock b/uv.lock
index 857dff48161..dfcad91467b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4166,6 +4166,7 @@ dependencies = [
     { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-7-medhelm-human-evaluation' and extra == 'extra-7-medhelm-ibm')" },
     { name = "spacy" },
     { name = "sqlitedict" },
+    { name = "tiktoken" },
     { name = "torch" },
     { name = "torchvision" },
     { name = "tqdm" },
@@ -4750,6 +4751,7 @@ requires-dist = [
     { name = "summ-eval", marker = "extra == 'summarization'", specifier = "~=0.892" },
     { name = "surge-api", marker = "extra == 'human-evaluation'", specifier = "~=1.1" },
     { name = "tensorflow", marker = "extra == 'heim'", specifier = "~=2.11" },
+    { name = "tiktoken", specifier = "~=0.7" },
     { name = "tiktoken", marker = "extra == 'openai'", specifier = "~=0.7" },
     { name = "timm", marker = "extra == 'heim'", specifier = "~=0.6.12" },
     { name = "together", marker = "extra == 'together'", specifier = ">=1.1" },

From 5477d6962dbd5d9e81e7b4c5b45d32b66c9e2d5e Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Thu, 21 May 2026 10:56:58 +0000
Subject: [PATCH 7/8] feat: update MedXpertQA descriptions and requirements for
 improved clarity and accuracy

---
 requirements.txt                                          | 4 ++--
 src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py | 4 ++--
 src/helm/benchmark/static/schema_medhelm.yaml             | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 24b694c4019..4629a8486f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -79,7 +79,7 @@ mypy-extensions==1.1.0
 networkx==3.4.2
 nltk==3.9.2
 nodeenv==1.10.0
-numba
+numba>=0.56
 numpy==2.2.6
 nvidia-cublas-cu12==12.8.4.1
 nvidia-cuda-cupti-cu12==12.8.90
@@ -153,7 +153,7 @@ tabulate==0.9.0
 thinc==8.3.10
 threadpoolctl==3.6.0
 tiktoken==0.12.0
-together
+together==2.15.0
 tokenizers==0.21.4
 toml==0.10.2
 tomli==2.4.0
diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
index cf18eaa4a0f..b6b0addc4f4 100644
--- a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -46,7 +46,7 @@ class MedXpertQATextScenario(Scenario):
 
     name = "medxpert_qa_text"
     description = (
-        "MedXpertQA Text is a text-only benchmark designed to evaluate expert-level medical knowledge, clinical reasoning,"
+        "MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning,"
         " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems."
         " It features rigorously curated and clinically relevant board-style questions, enhanced through expert review and data synthesis "
         "techniques to ensure high difficulty, reliability, and minimal data leakage."
@@ -95,7 +95,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
     def get_metadata(self) -> ScenarioMetadata:
         return ScenarioMetadata(
             name=self.name,
-            display_name="MedXpertQA Text",
+            display_name="MedXpertQA",
             description=self.description,
             taxonomy=TaxonomyInfo(
                 task="Question answering",
diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml
index a5182e93d53..1b5119b1280 100644
--- a/src/helm/benchmark/static/schema_medhelm.yaml
+++ b/src/helm/benchmark/static/schema_medhelm.yaml
@@ -725,7 +725,7 @@ run_groups:
 
   - name: medxpert_qa_text
     display_name: MedXpert-QA-Text
-    description: MedXpert-QA-Text is a benchmark designed to evaluate a model's ability to answer medical questions based on unstructured clinical text. Each instance includes a clinical note and a related question, requiring the model to extract relevant information from the text to provide an accurate answer [(Zhang et al., 2024)](https://arxiv.org/abs/2406.12036).
+    description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Note:- Only text split is used [(Medxpertqa)](https://arxiv.org/abs/2501.18362).
     metric_groups:
       - accuracy
       - efficiency

From 9c755d06ce52c7a4004c6c3dd4cc54904b6d0d68 Mon Sep 17 00:00:00 2001
From: Kalyan Chakravarthy <chakravarthik27@gmail.com>
Date: Thu, 21 May 2026 13:56:52 +0000
Subject: [PATCH 8/8] feat: update display name for MedXpertQA Text scenario
 for consistency

---
 src/helm/benchmark/static/schema_medhelm.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml
index 1b5119b1280..adb3774b667 100644
--- a/src/helm/benchmark/static/schema_medhelm.yaml
+++ b/src/helm/benchmark/static/schema_medhelm.yaml
@@ -724,7 +724,7 @@ run_groups:
       language: English
 
   - name: medxpert_qa_text
-    display_name: MedXpert-QA-Text
+    display_name: MedXpertQA
     description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Note:- Only text split is used [(Medxpertqa)](https://arxiv.org/abs/2501.18362).
     metric_groups:
       - accuracy