diff --git a/.gitignore b/.gitignore
index 7e37430952..7856978017 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # General Python stuff
 venv
+*.sqlite
 __pycache__
 *.egg-info
 .mypy_cache
diff --git a/pyproject.toml b/pyproject.toml
index d6fda049d7..69fe520a6d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
     "sqlitedict>=2.1.0,<3.0",
     "bottle~=0.12.23",
     # Basic Scenarios
-    "datasets~=3.1",
+    "datasets>=3.1",
     "pyarrow>=11.0.0", # Pinned transitive dependency for datasets; workaround for #1026
     "pyarrow-hotfix~=0.6", # Hotfix for CVE-2023-47248
     # Basic metrics
@@ -60,6 +60,7 @@ dependencies = [
     # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
     "torch>=1.13.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
     "torchvision>=0.14.1,<3.0.0", # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
+    "tiktoken~=0.7",
 ]
 
 [project.optional-dependencies]
@@ -84,7 +85,7 @@ scenarios = [
 
 metrics = [
     "google-api-python-client~=2.64",  # For perspective_api_client via toxicity_metrics
-    "numba~=0.56",  # For copyright_metrics
+    "numba>=0.56",  # For copyright_metrics
     "sacrebleu~=2.2",  # For disinformation_metrics, machine_translation_metrics
     "langdetect~=1.0",  # For ifeval_metrics
     "immutabledict~=4.2",  # For ifeval_metrics
@@ -197,7 +198,7 @@ google = [
 ]
 
 together = [
-    "together~=1.1",
+    "together>=1.1",
 ]
 
 yandex = [
@@ -487,7 +488,7 @@ conflicts = [
 
 # openai-whisper build needs setuptools + wheel; it doesn't declare them (CI uses --no-build-isolation-package + venv with both)
 [tool.uv.extra-build-dependencies]
-openai-whisper = ["setuptools", "wheel"]
+openai-whisper = ["setuptools<82", "wheel"]
 
 [tool.setuptools]
 package-dir = { "" = "src" }
diff --git a/requirements.txt b/requirements.txt
index ae9d4a7079..4629a8486f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,7 +27,7 @@ cryptography==46.0.5
 cycler==0.12.1
 cymem==2.0.13
 dacite==1.9.2
-datasets==3.6.0
+datasets==4.8.5
 dill==0.3.8
 distlib==0.4.0
 distro==1.9.0
@@ -79,7 +79,7 @@ mypy-extensions==1.1.0
 networkx==3.4.2
 nltk==3.9.2
 nodeenv==1.10.0
-numba==0.63.1
+numba==0.63.1
 numpy==2.2.6
 nvidia-cublas-cu12==12.8.4.1
 nvidia-cuda-cupti-cu12==12.8.90
@@ -153,7 +153,7 @@ tabulate==0.9.0
 thinc==8.3.10
 threadpoolctl==3.6.0
 tiktoken==0.12.0
-together==1.5.29
+together==2.15.0
 tokenizers==0.21.4
 toml==0.10.2
 tomli==2.4.0
diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py
index 31daa32159..922d2e207c 100644
--- a/src/helm/benchmark/run_specs/medhelm_run_specs.py
+++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -1732,3 +1732,32 @@ def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = Non
         metric_specs=metric_specs,
         groups=["health_bench_professional"],
     )
+
+
+@run_spec_function("medxpert_qa_text")
+def get_medxpert_qa_text_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.medxpert_qa_text_scenario.MedXpertQATextScenario",
+        args={},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "You are a medical expert assistant. Answer the following medical question with single letter from options."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+        max_tokens=1,  # the instructions ask for a single-letter answer
+        max_train_instances=0,  # no training instances are requested for the prompt
+    )
+
+    metric_specs = get_exact_match_metric_specs()
+
+    return RunSpec(
+        name="medxpert_qa_text",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["medxpert_qa_text"],
+    )
diff --git a/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
new file mode 100644
index 0000000000..b6b0addc4f
--- /dev/null
+++ b/src/helm/benchmark/scenarios/medxpert_qa_text_scenario.py
@@ -0,0 +1,109 @@
+import os
+from typing import List
+
+from datasets import DatasetDict, load_dataset
+
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.common.general import ensure_directory_exists
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+    ScenarioMetadata,
+)
+
+
+class MedXpertQATextScenario(Scenario):
+    """
+    From "MedXpertQA: Benchmarking Expert-Level Medical Reasoning and Understanding" (2025),
+    MedXpertQA is a highly challenging benchmark designed to evaluate expert-level medical knowledge,
+    clinical reasoning, and advanced problem-solving abilities in large language models.
+    The benchmark contains 4,460 questions spanning 17 medical specialties and 11 body systems,
+    with a dedicated Text subset for text-only evaluation and an MM subset for multimodal clinical reasoning.
+
+    The dataset includes rigorously curated specialty board-style questions enriched with detailed clinical contexts,
+    patient records, and examination findings. MedXpertQA applies filtering, augmentation, and data synthesis
+    techniques to improve difficulty, reduce data leakage risks, and ensure strong clinical relevance through
+    multiple rounds of expert review.
+
+    HuggingFace Dataset: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA
+    ArXiv Paper: https://arxiv.org/abs/2501.18362
+
+    @article{zuo2025medxpertqa,
+    title={Medxpertqa: Benchmarking expert-level medical reasoning and understanding},
+    author={Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
+    journal={arXiv preprint arXiv:2501.18362},
+    year={2025}
+    }
+    """
+
+    HF_DATASET_NAME = "TsinghuaC3I/MedXpertQA"
+
+    name = "medxpert_qa_text"
+    description = (
+        "MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning,"
+        " and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems."
+        " It features rigorously curated and clinically relevant board-style questions, enhanced through expert review and data synthesis "
+        "techniques to ensure high difficulty, reliability, and minimal data leakage."
+    )
+    tags = ["knowledge", "generation", "question_answering", "biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+        dataset: DatasetDict = load_dataset(self.HF_DATASET_NAME, "Text", cache_dir=data_path)
+
+        # Only the dataset's "test" split is used; it is mapped to HELM's TEST_SPLIT.
+        splits = {TEST_SPLIT: ["test"]}
+        instances: List[Instance] = []
+        for (
+            helm_split_name,
+            dataset_splits_name,
+        ) in splits.items():  # HELM split name -> list of dataset split names
+            for dataset_split_name in dataset_splits_name:
+                split_data = dataset[dataset_split_name]
+
+                for example in split_data:
+                    question = example["question"]
+                    answer = example["label"]  # key of the correct entry in example["options"]
+
+                    instance = Instance(
+                        input=Input(text=question),
+                        references=[
+                            Reference(
+                                Output(text=option),
+                                tags=[CORRECT_TAG] if alpha == answer else [],
+                            )
+                            for alpha, option in example["options"].items()
+                        ],
+                        split=helm_split_name,
+                        extra_data={
+                            "id": example["id"],
+                            "medical_task": example["medical_task"],
+                            "body_system": example["body_system"],
+                            "question_type": example["question_type"],
+                        },
+                    )
+                    instances.append(instance)
+        return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="MedXpertQA",
+            description=self.description,
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer expert-level medical questions across diverse specialties and body systems",
+                when="Any",
+                who="Medical professionals, Medical students",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split=TEST_SPLIT,
+        )
diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml
index 34e5dbdc99..adb3774b66 100644
--- a/src/helm/benchmark/static/schema_medhelm.yaml
+++ b/src/helm/benchmark/static/schema_medhelm.yaml
@@ -517,6 +517,7 @@ run_groups:
       - sct_bench
       - health_bench
       - health_bench_professional
+      - medxpert_qa_text
 
   - name: clinical_note_generation
     display_name: Clinical Note Generation
@@ -722,6 +723,23 @@ run_groups:
       when: Any
       language: English
 
+  - name: medxpert_qa_text
+    display_name: MedXpertQA
+    description: MedXpertQA is a benchmark designed to evaluate expert-level medical knowledge, clinical reasoning, and advanced problem-solving capabilities in large language models across diverse medical specialties and body systems. Only the Text (text-only) subset is used [(MedXpertQA, 2025)](https://arxiv.org/abs/2501.18362).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: Question answering
+      what: Medical knowledge testing
+      who: Medical student, Researcher
+      when: Any
+      language: English
+
   - name: medbullets
     display_name: Medbullets
     description: Medbullets is a benchmark of USMLE-style medical questions designed to assess a model's ability to understand and apply clinical knowledge. Each question is accompanied by a patient scenario and five multiple-choice options, similar to those found on Step 2 and Step 3 board exams [(MedBullets, 2025)](https://step2.medbullets.com).
diff --git a/uv.lock b/uv.lock
index 620636cfa4..dfcad91467 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4166,6 +4166,7 @@ dependencies = [
     { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-7-medhelm-human-evaluation' and extra == 'extra-7-medhelm-ibm')" },
     { name = "spacy" },
     { name = "sqlitedict" },
+    { name = "tiktoken" },
     { name = "torch" },
     { name = "torchvision" },
     { name = "tqdm" },
@@ -4234,6 +4235,7 @@ all = [
     { name = "opencc" },
     { name = "opencv-python" },
     { name = "opencv-python-headless" },
+    { name = "openpyxl" },
     { name = "pdf2image" },
     { name = "pillow" },
     { name = "pycocoevalcap" },
@@ -4587,7 +4589,7 @@ requires-dist = [
     { name = "colorcet", marker = "extra == 'plots'", specifier = "~=3.0" },
     { name = "colorlog", specifier = "~=6.9" },
     { name = "dacite", specifier = "~=1.6" },
-    { name = "datasets", specifier = "~=3.1" },
+    { name = "datasets", specifier = ">=3.1" },
     { name = "diffusers", marker = "extra == 'heim'", specifier = "~=0.34.0" },
     { name = "dspy", marker = "extra == 'dspy'", specifier = "~=3.0" },
     { name = "einops", marker = "extra == 'audiolm'", specifier = "~=0.7.0" },
@@ -4687,7 +4689,7 @@ requires-dist = [
     { name = "nltk", specifier = "~=3.7,!=3.9.0" },
     { name = "nltk", marker = "extra == 'summarization'", specifier = "~=3.7,!=3.9.0" },
     { name = "nudenet", marker = "extra == 'heim'", specifier = "~=2.0" },
-    { name = "numba", marker = "extra == 'metrics'", specifier = "~=0.56" },
+    { name = "numba", marker = "extra == 'metrics'", specifier = ">=0.56" },
     { name = "numpy", specifier = ">=1.26,<3" },
     { name = "numpy", marker = "extra == 'heim'", specifier = ">=1.26" },
     { name = "omegaconf", marker = "extra == 'heim'", specifier = "~=2.3" },
@@ -4749,9 +4751,10 @@ requires-dist = [
     { name = "summ-eval", marker = "extra == 'summarization'", specifier = "~=0.892" },
     { name = "surge-api", marker = "extra == 'human-evaluation'", specifier = "~=1.1" },
     { name = "tensorflow", marker = "extra == 'heim'", specifier = "~=2.11" },
+    { name = "tiktoken", specifier = "~=0.7" },
     { name = "tiktoken", marker = "extra == 'openai'", specifier = "~=0.7" },
     { name = "timm", marker = "extra == 'heim'", specifier = "~=0.6.12" },
-    { name = "together", marker = "extra == 'together'", specifier = "~=1.1" },
+    { name = "together", marker = "extra == 'together'", specifier = ">=1.1" },
     { name = "tokenizers", marker = "extra == 'aleph-alpha'", specifier = ">=0.13.3" },
     { name = "torch", specifier = ">=1.13.1,<3.0.0" },
     { name = "torch", marker = "extra == 'vlm'", specifier = "~=2.1" },