Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# General Python stuff
.venv
venv
__pycache__
*.egg-info
Expand Down
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,12 @@ Adds heavy libraries (bert-score, rouge-score, nltk). **Install can take 2–3 m

Scenarios: **DischargeMe** (hospital course summaries; requires PhysioNet `data_path`), **ACI-Bench** (clinical transcripts), **Patient-Edu** (simplifying medical jargon).

**⚠️ Note on macOS with Python 3.12:** Use `pip` instead of `uv` for this tier due to build compatibility issues with `pyemd`.

Install (use `pip` on Intel Mac):
Install:
```sh
pip install "medhelm[summarization]"
```

Or with `uv` (Linux/other platforms):
Or with `uv`:
```sh
uv pip install "medhelm[summarization]"
```
Expand Down
14 changes: 0 additions & 14 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ Scenarios: **DischargeMe** (hospital course summaries), **ACI-Bench** (clinical
pip install "medhelm[summarization]"
```

**Note on macOS with Python 3.12:** Use `pip` (not `uv`) for this tier due to build compatibility with `pyemd`.

### Gated / licensing tier (`[gated]`)

Adds **gdown** so the code can download data from Google Drive. Install can also take longer.
Expand Down Expand Up @@ -92,18 +90,6 @@ pip install "medhelm[summarization,gated]"

See [Quick Start](/quick_start) for running benchmarks with `medhelm-run` (after activating your environment).

## Troubleshooting

### macOS with Python 3.12 and the summarization tier

If you encounter build errors with `pyemd` when installing `medhelm[summarization]` on macOS:

```bash
pip install "medhelm[summarization]"
```

This is a known compatibility issue. Use `pip` instead of `uv` for this tier.

## Install Multimodal Support

Additional steps are required for multimodal evaluations:
Expand Down
1 change: 0 additions & 1 deletion docs/medhelm.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@ source .venv/bin/activate
pip install "medhelm[summarization,gated]"
```

**Note on macOS/Python 3.12:** Use `pip` (not `uv`) for the `[summarization]` tier due to compatibility with `pyemd`.

### Alternative: Using conda

Expand Down
12 changes: 3 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,9 @@ ranking = [
summarization = [
# For summarization_metrics and MedHELM Clinical NLP tier (DischargeMe, ACI-Bench, Patient-Edu)
# Use: pip install "medhelm[summarization]" — install can take 2–3 minutes
"summ-eval~=0.892",
"bert-score~=0.3",
# Note: extractive stats (coverage/density/compression) are vendored in helm.benchmark.metrics.data_stats
# instead of summ-eval, which pins pyemd==0.5.1 (fails on Python 3.12+) and torch<2.0 via blanc.
"bert-score~=0.3.13",
"rouge-score~=0.1.2",
"nltk~=3.7,!=3.9.0",
"sentencepiece~=0.2.0",
Expand Down Expand Up @@ -468,13 +469,6 @@ crfm-proxy-cli = "helm.proxy.cli:main"
default-groups = ["dev", "types"]

override-dependencies = [
# override torch dependency for summ-eval in "summarization" extra:
# - summ-eval depends on blanc
# - blanc depends on torch<2.0
"torch~=2.1",
# override torch dependency for summ-eval in "summarization" extra:
# - summ-eval depends on blanc
# - blanc depends on numpy<2.0
"numpy>=1.26,<3",
]

Expand Down
3 changes: 3 additions & 0 deletions src/helm/benchmark/metrics/data_stats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from helm.benchmark.metrics.data_stats.data_stats_metric import DataStatsMetric

# Re-export DataStatsMetric at the package level so that it can be imported
# from helm.benchmark.metrics.data_stats directly.
__all__ = ["DataStatsMetric"]
66 changes: 66 additions & 0 deletions src/helm/benchmark/metrics/data_stats/data_stats_metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Adapted from summ_eval.data_stats_metric (Apache 2.0 / Newsroom fragments).
# Vendored so medhelm[summarization] does not depend on summ-eval (pyemd==0.5.1, torch<2.0).

from collections import Counter
from typing import Any, Dict, List, Optional

import spacy

from helm.benchmark.metrics.data_stats.fragments import Fragments

# Module-level cache for the spaCy English pipeline; stays None until the
# first call to _get_spacy_en().
_en: Optional[Any] = None


def _get_spacy_en():
    """Return the cached ``en_core_web_sm`` pipeline, loading it on first use."""
    global _en
    if _en is not None:
        return _en
    _en = spacy.load("en_core_web_sm")
    return _en


def _find_ngrams(input_list: List[str], n: int):
    """Return an iterator over the consecutive ``n``-grams of ``input_list``.

    Each n-gram is a tuple of ``n`` tokens. The iterator is empty when the
    list holds fewer than ``n`` tokens or when ``n`` is 0.
    """
    # One view of the list per offset 0..n-1; zipping them in lockstep walks
    # a window of width n over the tokens.
    shifted_views = (input_list[offset:] for offset in range(n))
    return zip(*shifted_views)


class DataStatsMetric:
    """Extractive summarization statistics (coverage, density, compression).

    Attributes are plain configuration values:
        n_gram: Largest n-gram order for the novelty/repetition statistics.
        case: Forwarded to ``Fragments``; when False, tokens are lowercased
            before fragment matching.
        tokenize: When True, inputs are tokenized with the spaCy English
            tokenizer; otherwise they are handed to ``Fragments`` unchanged.
    """

    def __init__(self, n_gram: int = 3, case: bool = False, tokenize: bool = True):
        self.n_gram = n_gram
        self.case = case
        self.tokenize = tokenize

    def evaluate_example(self, summary: str, input_text: str) -> Dict[str, float]:
        """Compute extractive statistics for one (summary, source text) pair.

        Args:
            summary: The generated summary.
            input_text: The source document the summary was produced from.

        Returns:
            A dict that always contains ``coverage``, ``density``,
            ``compression`` and ``summary_length``. For each ``i`` in
            ``1..n_gram`` it also contains ``percentage_novel_{i}-gram`` and
            ``percentage_repeated_{i}-gram_in_summ``, except when the summary
            has no ``i``-grams (fewer than ``i`` tokens), in which case both
            keys for that ``i`` are omitted.
        """
        if self.tokenize:
            nlp = _get_spacy_en()
            # Only the token text is needed, so run the tokenizer alone via
            # make_doc() instead of the full pipeline with a partial disable
            # list. The tokens are the same; the unused components are skipped.
            text_tokens = [tok.text for tok in nlp.make_doc(input_text)]
            summary_tokens = [tok.text for tok in nlp.make_doc(summary)]
            fragments = Fragments(summary_tokens, text_tokens, case=self.case)
        else:
            fragments = Fragments(summary, input_text, case=self.case)

        score_dict: Dict[str, float] = {
            "coverage": fragments.coverage(),
            "density": fragments.density(),
            "compression": fragments.compression(),
            "summary_length": float(len(fragments.summary)),
        }

        # The n-gram statistics use the tokens as stored by Fragments
        # (original casing), not the normalized copies used for matching.
        tokenized_summary = fragments.summary
        tokenized_text = fragments.text
        for i in range(1, self.n_gram + 1):
            summ_ngrams = list(_find_ngrams(tokenized_summary, i))
            summ_ngrams_set = set(summ_ngrams)
            if not summ_ngrams_set:
                # Summary is shorter than i tokens: the ratios are undefined,
                # so no keys are emitted for this order.
                continue
            input_ngrams_set = set(_find_ngrams(tokenized_text, i))
            novel = summ_ngrams_set - input_ngrams_set
            score_dict[f"percentage_novel_{i}-gram"] = len(novel) / float(len(summ_ngrams_set))
            # Distinct n-grams that occur more than once in the summary.
            repeated = sum(1 for count in Counter(summ_ngrams).values() if count > 1)
            score_dict[f"percentage_repeated_{i}-gram_in_summ"] = repeated / float(len(summ_ngrams_set))
        return score_dict

    @property
    def supports_multi_ref(self) -> bool:
        """Always False: only a single source text is scored per summary."""
        return False
81 changes: 81 additions & 0 deletions src/helm/benchmark/metrics/data_stats/fragments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Adapted from https://github.com/lil-lab/newsroom/blob/master/newsroom/analyze/fragments.py
# (used by summ_eval DataStatsMetric; vendored to avoid summ-eval's pyemd/torch pins)

from collections import namedtuple


def normalize(tokens, case: bool = False) -> list[str]:
    """Convert every token to ``str``, lowercasing unless ``case`` is true."""
    as_text = [str(token) for token in tokens]
    if case:
        return as_text
    return [text.lower() for text in as_text]


class Fragments:
    """Greedy alignment of a summary's tokens against a source text.

    On construction the summary is scanned left to right; at each position
    the longest run of tokens that also appears contiguously in the text is
    recorded as a ``Match``. ``coverage``, ``density`` and ``compression``
    are derived from those matches and from the token counts.
    """

    # summary / text: start index of the run in each token list;
    # length: number of tokens in the run.
    Match = namedtuple("Match", ("summary", "text", "length"))

    def __init__(self, summary, text, case: bool = False):
        """Store both token sequences and compute the matches.

        Args:
            summary: A string (split on whitespace) or a token sequence
                (kept as given).
            text: A string (split on whitespace) or a token sequence
                (kept as given).
            case: Passed to ``normalize``; when False, matching is done on
                lowercased tokens.
        """
        self.summary = summary.split() if isinstance(summary, str) else summary
        self.text = text.split() if isinstance(text, str) else text

        self._norm_summary = normalize(self.summary, case)
        self._norm_text = normalize(self.text, case)
        self._matches: list[Fragments.Match] = []
        self._match(self._norm_summary, self._norm_text)

    def overlaps(self) -> list[Match]:
        """Return the recorded matches, in summary order."""
        return self._matches

    def _base_length(self, summary_base: bool) -> int:
        """Token count of the summary, or of the text when ``summary_base`` is false."""
        return len(self.summary if summary_base else self.text)

    def coverage(self, summary_base: bool = True) -> float:
        """Total matched length divided by the base length (0.0 if the base is empty)."""
        base = self._base_length(summary_base)
        if base == 0:
            return 0.0
        return sum(match.length for match in self.overlaps()) / base

    def density(self, summary_base: bool = True) -> float:
        """Sum of squared match lengths divided by the base length (0.0 if the base is empty)."""
        base = self._base_length(summary_base)
        if base == 0:
            return 0.0
        return sum(match.length**2 for match in self.overlaps()) / base

    def compression(self, text_to_summary: bool = True) -> float:
        """Ratio of text length to summary length, or the inverse.

        Returns 0.0 when the denominator would be zero.
        """
        text_len, summary_len = len(self.text), len(self.summary)
        if text_to_summary:
            numerator, denominator = text_len, summary_len
        else:
            numerator, denominator = summary_len, text_len
        return numerator / denominator if denominator else 0.0

    def _match(self, a: list[str], b: list[str]) -> None:
        """Append to ``self._matches`` the greedy longest matches of ``a`` in ``b``.

        For each position in ``a`` not already consumed by a previous match,
        ``b`` is scanned from its start for the longest common run beginning
        at that position. Ties keep the earliest occurrence in ``b``.
        """
        a_len, b_len = len(a), len(b)
        a_pos = 0
        while a_pos < a_len:
            longest = None
            longest_len = 0
            b_pos = 0
            while b_pos < b_len:
                if a[a_pos] != b[b_pos]:
                    b_pos += 1
                    continue
                # Extend the run while both sequences keep agreeing.
                run = 0
                while a_pos + run < a_len and b_pos + run < b_len and b[b_pos + run] == a[a_pos + run]:
                    run += 1
                if run > longest_len:
                    longest = Fragments.Match(a_pos, b_pos, run)
                    longest_len = run
                # Resume the scan just past the run that was examined.
                b_pos += run
            if longest is None:
                # Token does not occur in the text at all.
                a_pos += 1
            else:
                self._matches.append(longest)
                a_pos += longest_len
6 changes: 1 addition & 5 deletions src/helm/benchmark/metrics/summarization_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.benchmark.metrics.data_stats import DataStatsMetric
from helm.benchmark.metrics.summac.model_summac import SummaCZS

try:
Expand Down Expand Up @@ -67,11 +68,6 @@ def __init__(
if not spacy.util.is_package("en_core_web_sm"):
spacy.cli.download("en_core_web_sm")

try:
from summ_eval.data_stats_metric import DataStatsMetric # type: ignore
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["summarization"])

self.data_stats_metric = DataStatsMetric()
self.task: str = task
self.qa_fact_eval: Optional[Dict] = None
Expand Down
Loading