diff --git a/README.md b/README.md index 2220397..27253f3 100644 --- a/README.md +++ b/README.md @@ -242,14 +242,14 @@ EVAL_SKIP_AUTH_CHECK=1 eval-harness run --skill= --dry-run # Layer 1 — single check kind in isolation bash scripts/eval/lib/score.sh check -# Full test suite — 20 suites covering every primitive +# Full test suite — 21 suites covering every primitive for t in scripts/eval/tests/*.sh; do bash "$t"; done # → all should print PASS ``` If you need to know whether a specific factor is being checked, point at the case YAML — `.checks[]` is the complete list of factors that case enforces. There is no hidden scoring. -### Verified test suites (20/20 green on `main`) +### Verified test suites (21/21 green on `main`) | Suite | Covers | |---|---| @@ -269,6 +269,7 @@ If you need to know whether a specific factor is being checked, point at the cas | `fix_proposal_render.sh` | `fix_proposal` renders in `diff.md` (closes BLK-5) | | `bypass.sh` | `EVAL_BYPASS=1` exits 0 + writes bypass event (closes BLK-1) | | `shell_safety.sh` | `score_shell` filter accepts jq/pipes/wc; rejects rm/curl/`$()`/backtick/`>`; honors `unsafe_shell:` opt-in (closes BLK-2) | +| `shell_no_expectation.sh` | `score_shell` treats missing `expect_*` fields as harness errors, not ordinary FAILs | | `fixture_path_traversal.sh` | Fixture copy rejects absolute paths + `..` segments (closes BLK-3) | | `attribution_portable.sh` | Attribution works under GNU + BSD grep (closes BLK-4) | | `transcript_empty_guard.sh` | Missing/empty transcript → harness error not vacuous PASS (closes BLK-7) | diff --git a/scripts/eval/lib/score.sh b/scripts/eval/lib/score.sh index f03cd09..70ffed7 100755 --- a/scripts/eval/lib/score.sh +++ b/scripts/eval/lib/score.sh @@ -68,6 +68,21 @@ score_shell() { local expect_exact; expect_exact="$(yq -r '.expect_exact // empty' "$check_file")" local unsafe_opt_in; unsafe_opt_in="$(yq -r '.unsafe_shell // false' "$check_file" 2>/dev/null || echo false)" + if ! grep -qE '^[[:space:]]*expect_(regex|min|exact)[[:space:]]*:' "$check_file"; then + jq -n \ + --arg cmd "$cmd" \ + '{ + kind: "shell", + passed: false, + failed_check_id: ("shell:" + $cmd), + expected: "at least one expect_* field (expect_regex / expect_min / expect_exact)", + actual: "none set - check YAML for typo like expected_*", + diff_hint: "this check is misconfigured; treating as harness error", + error: true + }' + return 0 + fi + if [[ "$unsafe_opt_in" != "true" && "${EVAL_ALLOW_UNSAFE_SHELL:-0}" != "1" ]] && score_shell_is_unsafe "$cmd"; then jq -n \ --arg cmd "$cmd" \ diff --git a/scripts/eval/tests/shell_no_expectation.sh b/scripts/eval/tests/shell_no_expectation.sh new file mode 100755 index 0000000..6b3efe4 --- /dev/null +++ b/scripts/eval/tests/shell_no_expectation.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +source "$SCRIPT_DIR/../lib/yq-shim.sh" +source "$SCRIPT_DIR/../lib/llm_judge.sh" +source "$SCRIPT_DIR/../lib/autofix.sh" +source "$SCRIPT_DIR/../lib/score.sh" + +WORK="$(mktemp -d -t eval-harness-shell-no-expect.XXXXXX)" +trap 'rm -rf "$WORK"' EXIT + +cat > "$WORK/no-expectation.yaml" <&2; echo "$out" >&2; exit 1; } +[[ "$err" == "true" ]] || { echo "FAIL: missing expectations should set error=true, got $err" >&2; echo "$out" >&2; exit 1; } +[[ "$expected" == *"expect_regex"* ]] || { echo "FAIL: expected should mention expect_* fields, got: $expected" >&2; exit 1; } +[[ "$actual" == *"none set"* ]] || { echo "FAIL: actual should mention no expectations, got: $actual" >&2; exit 1; } +[[ "$hint" == *"misconfigured"* ]] || { echo "FAIL: hint should identify a misconfigured check, got: $hint" >&2; exit 1; } + +echo "PASS: shell checks with no expect_* fields surface as harness errors" +exit 0