Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions every_eval_ever/converters/lm_eval/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def _build_evaluation_results(
is_higher_better = higher_is_better.get(metric_name, True)

bounds = KNOWN_METRIC_BOUNDS.get(metric_name)

min_score = bounds[0] if bounds else None
max_score = bounds[1] if bounds else None

Expand All @@ -267,11 +268,15 @@ def _build_evaluation_results(
or task_results.get('samples')
or task_results.get('sample_len')
)
if stderr_val is not None or num_samples:
# Only use stderr_val if it's a valid number (not 'N/A' or other strings)
valid_stderr = (
isinstance(stderr_val, (int, float)) and stderr_val is not None
)
if valid_stderr or num_samples:
uncertainty = Uncertainty(
standard_error=(
StandardError(value=stderr_val, method='bootstrap')
if stderr_val is not None
if valid_stderr
else None
),
num_samples=num_samples,
Expand Down
4 changes: 4 additions & 0 deletions every_eval_ever/converters/lm_eval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,8 @@ def find_samples_file(output_dir: Path, task_name: str) -> Optional[Path]:
'rougeLsum': (0.0, 1.0),
'ter': (0.0, None),
'brier_score': (0.0, 1.0),
'prompt_level_strict_acc': (0.0, 1.0),
'inst_level_strict_acc': (0.0, 1.0),
'prompt_level_loose_acc': (0.0, 1.0),
'inst_level_loose_acc': (0.0, 1.0),
}