PacificAI · blidiselalin · May 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 # General Python stuff
+.venv
 venv
 __pycache__
 *.egg-info

diff --git a/README.md b/README.md
@@ -92,14 +92,12 @@ Adds heavy libraries (bert-score, rouge-score, nltk). **Install can take 2–3 m
 
 Scenarios: **DischargeMe** (hospital course summaries; requires PhysioNet `data_path`), **ACI-Bench** (clinical transcripts), **Patient-Edu** (simplifying medical jargon).
 
-**⚠️ Note on macOS with Python 3.12:** Use `pip` instead of `uv` for this tier due to build compatibility issues with `pyemd`.
-
-Install (use `pip` on Intel Mac):
+Install:
 ```sh
 pip install "medhelm[summarization]"
 ```
 
-Or with `uv` (Linux/other platforms):
+Or with `uv`:
 ```sh
 uv pip install "medhelm[summarization]"
 ```

diff --git a/docs/installation.md b/docs/installation.md
@@ -61,8 +61,6 @@ Scenarios: **DischargeMe** (hospital course summaries), **ACI-Bench** (clinical
 pip install "medhelm[summarization]"
 ```
 
-**Note on macOS with Python 3.12:** Use `pip` (not `uv`) for this tier due to build compatibility with `pyemd`.
-
 ### Gated / licensing tier (`[gated]`)
 
 Adds **gdown** so the code can download data from Google Drive. Install can also take longer.
@@ -92,18 +90,6 @@ pip install "medhelm[summarization,gated]"
 
 See [Quick Start](/quick_start) for running benchmarks with `medhelm-run` (after activating your environment).
 
-## Troubleshooting
-
-### macOS with Python 3.12 and the summarization tier
-
-If you encounter build errors with `pyemd` when installing `medhelm[summarization]` on macOS:
-
-```bash
-pip install "medhelm[summarization]"
-```
-
-This is a known compatibility issue. Use `pip` instead of `uv` for this tier.
-
 ## Install Multimodal Support
 
 Additional steps are required for multimodal evaluations:

diff --git a/docs/medhelm.md b/docs/medhelm.md
@@ -181,7 +181,6 @@ source .venv/bin/activate
 pip install "medhelm[summarization,gated]"
 ```
 
-**Note on macOS/Python 3.12:** Use `pip` (not `uv`) for the `[summarization]` tier due to compatibility with `pyemd`.
 
 ### Alternative: Using conda
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -98,8 +98,9 @@ ranking = [
 summarization = [
     # For summarization_metrics and MedHELM Clinical NLP tier (DischargeMe, ACI-Bench, Patient-Edu)
     # Use: pip install "medhelm[summarization]" — install can take 2–3 minutes
-    "summ-eval~=0.892",
-    "bert-score~=0.3",
+    # Note: extractive stats (coverage/density/compression) are vendored in helm.benchmark.metrics.data_stats
+    # instead of summ-eval, which pins pyemd==0.5.1 (fails on Python 3.12+) and torch<2.0 via blanc.
+    "bert-score~=0.3.13",
     "rouge-score~=0.1.2",
     "nltk~=3.7,!=3.9.0",
     "sentencepiece~=0.2.0",
@@ -468,13 +469,6 @@ crfm-proxy-cli = "helm.proxy.cli:main"
 default-groups = ["dev", "types"]
 
 override-dependencies = [
-    # override torch dependency for summ-eval in "summarization" extra:
-    # - summ-eval depends on blanc 
-    # - blanc depends on torch<2.0
-    "torch~=2.1",
-    # override torch dependency for summ-eval in "summarization" extra:
-    # - summ-eval depends on blanc 
-    # - blanc depends on numpy<2.0
     "numpy>=1.26,<3",
 ]
 

diff --git a/src/helm/benchmark/metrics/data_stats/__init__.py b/src/helm/benchmark/metrics/data_stats/__init__.py
@@ -0,0 +1,3 @@
+from helm.benchmark.metrics.data_stats.data_stats_metric import DataStatsMetric
+
+__all__ = ["DataStatsMetric"]
diff --git a/src/helm/benchmark/metrics/data_stats/data_stats_metric.py b/src/helm/benchmark/metrics/data_stats/data_stats_metric.py
@@ -0,0 +1,92 @@
+# Adapted from summ_eval.data_stats_metric (Apache 2.0 / Newsroom fragments).
+# Vendored so medhelm[summarization] does not depend on summ-eval (pyemd==0.5.1, torch<2.0).
+
+from collections import Counter
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+from helm.benchmark.metrics.data_stats.fragments import Fragments
+
+# spaCy components skipped while tokenizing; only the token texts are read afterwards.
+_DISABLED_PIPES: List[str] = ["tagger", "parser", "ner", "textcat"]
+
+# Process-wide cache for the spaCy pipeline, filled by _get_spacy_en() on first use.
+_en: Optional[Any] = None
+
+
+def _get_spacy_en() -> Any:
+    """Return the cached ``en_core_web_sm`` pipeline, loading it on first use.
+
+    spaCy is imported here rather than at module level so that importing this
+    module, or using ``DataStatsMetric(tokenize=False)``, does not require spaCy.
+    """
+    global _en
+    if _en is None:
+        import spacy
+
+        _en = spacy.load("en_core_web_sm")
+    return _en
+
+
+def _find_ngrams(input_list: List[str], n: int) -> Iterator[Tuple[str, ...]]:
+    """Return an iterator over the contiguous ``n``-grams of ``input_list`` as tuples."""
+    return zip(*[input_list[i:] for i in range(n)])
+
+
+class DataStatsMetric:
+    """Extractive summarization statistics (coverage, density, compression)."""
+
+    def __init__(self, n_gram: int = 3, case: bool = False, tokenize: bool = True):
+        """Configure the metric.
+
+        Args:
+            n_gram: Largest n-gram order for the novelty/repetition statistics.
+            case: Passed to ``Fragments``; when False tokens are compared case-insensitively.
+            tokenize: When True, tokenize both texts with spaCy; otherwise ``Fragments``
+                splits them on whitespace.
+        """
+        self.n_gram = n_gram
+        self.case = case
+        self.tokenize = tokenize
+
+    def evaluate_example(self, summary: str, input_text: str) -> Dict[str, float]:
+        """Compute extractive statistics of ``summary`` with respect to ``input_text``.
+
+        Returns:
+            A dict with ``coverage``, ``density``, ``compression`` and ``summary_length``,
+            plus ``percentage_novel_{i}-gram`` and ``percentage_repeated_{i}-gram_in_summ``
+            for each ``i`` in ``1..n_gram``. The n-gram keys for an order are omitted
+            when the summary has no n-grams of that order.
+        """
+        # Keep the str parameters untouched and hold the token lists in their own names.
+        text_tokens: Union[str, List[str]] = input_text
+        summary_tokens: Union[str, List[str]] = summary
+        if self.tokenize:
+            nlp = _get_spacy_en()
+            text_tokens = [tok.text for tok in nlp(input_text, disable=_DISABLED_PIPES)]
+            summary_tokens = [tok.text for tok in nlp(summary, disable=_DISABLED_PIPES)]
+        fragments = Fragments(summary_tokens, text_tokens, case=self.case)
+        score_dict: Dict[str, float] = {
+            "coverage": fragments.coverage(),
+            "density": fragments.density(),
+            "compression": fragments.compression(),
+            "summary_length": float(len(fragments.summary)),
+        }
+        tokenized_summary = fragments.summary
+        tokenized_text = fragments.text
+        for i in range(1, self.n_gram + 1):
+            summ_ngrams = list(_find_ngrams(tokenized_summary, i))
+            summ_ngrams_set = set(summ_ngrams)
+            if not summ_ngrams_set:
+                # Summary is shorter than i tokens: the ratios are undefined, so skip the keys.
+                continue
+            input_ngrams_set = set(_find_ngrams(tokenized_text, i))
+            novel = summ_ngrams_set - input_ngrams_set
+            score_dict[f"percentage_novel_{i}-gram"] = len(novel) / len(summ_ngrams_set)
+            repeated = [ngram for ngram, count in Counter(summ_ngrams).items() if count > 1]
+            score_dict[f"percentage_repeated_{i}-gram_in_summ"] = len(repeated) / len(summ_ngrams_set)
+        return score_dict
+
+    @property
+    def supports_multi_ref(self) -> bool:
+        """Whether multiple references are supported; always False for this metric."""
+        return False
diff --git a/src/helm/benchmark/metrics/data_stats/fragments.py b/src/helm/benchmark/metrics/data_stats/fragments.py
@@ -0,0 +1,93 @@
+# Adapted from https://github.com/lil-lab/newsroom/blob/master/newsroom/analyze/fragments.py
+# (used by summ_eval DataStatsMetric; vendored to avoid summ-eval's pyemd/torch pins)
+
+from collections import namedtuple
+
+
+def normalize(tokens, case: bool = False) -> list[str]:
+    """Return ``tokens`` as strings, lowercased unless ``case`` is True."""
+    if case:
+        return [str(t) for t in tokens]
+    return [str(t).lower() for t in tokens]
+
+
+def _ratio(numerator: int, denominator: int) -> float:
+    """Return ``numerator / denominator``, or 0.0 when ``denominator`` is zero."""
+    if denominator == 0:
+        return 0.0
+    return numerator / denominator
+
+
+class Fragments:
+    """Shared token fragments between a summary and its source text.
+
+    ``summary`` and ``text`` may be strings (split on whitespace) or token sequences.
+    Matching uses normalized tokens; ``self.summary`` and ``self.text`` keep the
+    tokens as given.
+    """
+
+    # Fields: start index in the summary, start index in the text, matched token count.
+    Match = namedtuple("Match", ("summary", "text", "length"))
+
+    def __init__(self, summary, text, case: bool = False):
+        """Split string inputs on whitespace, normalize, and compute the matches."""
+        self.summary = summary.split() if isinstance(summary, str) else summary
+        self.text = text.split() if isinstance(text, str) else text
+
+        self._norm_summary = normalize(self.summary, case)
+        self._norm_text = normalize(self.text, case)
+        self._matches: list[Fragments.Match] = []
+        self._match(self._norm_summary, self._norm_text)
+
+    def overlaps(self) -> list[Match]:
+        """Return the recorded matches, ordered by their position in the summary."""
+        return self._matches
+
+    def coverage(self, summary_base: bool = True) -> float:
+        """Return matched tokens divided by the summary length (text length if not ``summary_base``)."""
+        matched = sum(o.length for o in self.overlaps())
+        return _ratio(matched, len(self.summary) if summary_base else len(self.text))
+
+    def density(self, summary_base: bool = True) -> float:
+        """Return the sum of squared match lengths divided by the same base as ``coverage``."""
+        weighted = sum(o.length**2 for o in self.overlaps())
+        return _ratio(weighted, len(self.summary) if summary_base else len(self.text))
+
+    def compression(self, text_to_summary: bool = True) -> float:
+        """Return text length / summary length, or the inverse if not ``text_to_summary``."""
+        if text_to_summary:
+            return _ratio(len(self.text), len(self.summary))
+        return _ratio(len(self.summary), len(self.text))
+
+    def _match(self, a: list[str], b: list[str]) -> None:
+        """Record matches of ``a`` against ``b`` in ``self._matches``.
+
+        For each position in ``a``, ``b`` is scanned left to right (resuming after each
+        candidate run) and the first longest common run found is recorded; ``a`` then
+        advances past it, or by one token when nothing matched.
+        """
+        a_start = 0
+        while a_start < len(a):
+            best_match = None
+            best_match_length = 0
+            b_start = 0
+            while b_start < len(b):
+                if a[a_start] != b[b_start]:
+                    b_start += 1
+                    continue
+                # Extend the run while both sequences keep agreeing.
+                a_end, b_end = a_start, b_start
+                while a_end < len(a) and b_end < len(b) and b[b_end] == a[a_end]:
+                    a_end += 1
+                    b_end += 1
+                length = a_end - a_start
+                if length > best_match_length:
+                    best_match = Fragments.Match(a_start, b_start, length)
+                    best_match_length = length
+                b_start = b_end
+            if best_match is None:
+                a_start += 1
+            else:
+                # A recorded match always has length >= 1, so this makes progress.
+                self._matches.append(best_match)
+                a_start += best_match_length
diff --git a/src/helm/benchmark/metrics/summarization_metrics.py b/src/helm/benchmark/metrics/summarization_metrics.py
@@ -20,6 +20,7 @@
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.data_stats import DataStatsMetric
 from helm.benchmark.metrics.summac.model_summac import SummaCZS
 
 try:
@@ -67,11 +68,6 @@ def __init__(
         if not spacy.util.is_package("en_core_web_sm"):
             spacy.cli.download("en_core_web_sm")
 
-        try:
-            from summ_eval.data_stats_metric import DataStatsMetric  # type: ignore
-        except ModuleNotFoundError as e:
-            handle_module_not_found_error(e, ["summarization"])
-
         self.data_stats_metric = DataStatsMetric()
         self.task: str = task
         self.qa_fact_eval: Optional[Dict] = None
-Original file line number
+Diff line change
@@ Expand Up / @@ -181,7 +181,6 @@ source .venv/bin/activate @@
     pip install "medhelm[summarization,gated]"
     ```
-    **Note on macOS/Python 3.12:** Use `pip` (not `uv`) for the `[summarization]` tier due to compatibility with `pyemd`.
     ### Alternative: Using conda
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from helm.benchmark.metrics.data_stats.data_stats_metric import DataStatsMetric

		__all__ = ["DataStatsMetric"]