neuralmagic · Chibukach · May 11, 2026
diff --git a/every_eval_ever/converters/lm_eval/adapter.py b/every_eval_ever/converters/lm_eval/adapter.py
@@ -246,6 +246,7 @@ def _build_evaluation_results(
             is_higher_better = higher_is_better.get(metric_name, True)
 
             bounds = KNOWN_METRIC_BOUNDS.get(metric_name)
+
             min_score = bounds[0] if bounds else None
             max_score = bounds[1] if bounds else None
 
@@ -267,11 +268,15 @@ def _build_evaluation_results(
                 or task_results.get('samples')
                 or task_results.get('sample_len')
             )
-            if stderr_val is not None or num_samples:
+            # Use stderr_val only if it is an int/float; skip 'N/A' and other strings.
+            valid_stderr = (
+                isinstance(stderr_val, (int, float)) and stderr_val is not None
+            )
+            if valid_stderr or num_samples:
                 uncertainty = Uncertainty(
                     standard_error=(
                         StandardError(value=stderr_val, method='bootstrap')
-                        if stderr_val is not None
+                        if valid_stderr
                         else None
                     ),
                     num_samples=num_samples,

diff --git a/every_eval_ever/converters/lm_eval/utils.py b/every_eval_ever/converters/lm_eval/utils.py
@@ -73,4 +73,8 @@ def find_samples_file(output_dir: Path, task_name: str) -> Optional[Path]:
     'rougeLsum': (0.0, 1.0),
     'ter': (0.0, None),
     'brier_score': (0.0, 1.0),
+    'prompt_level_strict_acc': (0.0, 1.0),
+    'inst_level_strict_acc': (0.0, 1.0),
+    'prompt_level_loose_acc': (0.0, 1.0),
+    'inst_level_loose_acc': (0.0, 1.0),
 }