From 1d15144341b116f5d05116d24e14945027b3e1dd Mon Sep 17 00:00:00 2001
From: Cody Hart <mrcodyhart@gmail.com>
Date: Thu, 29 Jan 2026 11:47:20 -0500
Subject: [PATCH] Migrate commands/ to skills/ format for reliable CLI
 resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Claude Code 2.1.x changed how commands/ are discovered, causing
multi-command plugins to misroute (e.g. /look:again running tidy).
The skills/ directory format uses the newer, supported resolution
path. See glittercowboy/get-shit-done#218 for background.

- Move commands/again.md → skills/again/SKILL.md
- Move commands/tidy.md → skills/tidy/SKILL.md
- Add disable-model-invocation: true to both skills
- Rename frontmatter field tools → allowed-tools (skills format)
- Remove commands array from plugin.json and marketplace.json
- Update test suite, build script, evals, and documentation
- Add integration test script
- Bump version to 0.4.0

No changes to user-facing command names: /look:again and /look:tidy
work as before.
---
 .claude-plugin/marketplace.json               |   5 +-
 CHANGELOG.md                                  |  10 +
 CONTRIBUTING.md                               |  48 ++-
 Makefile                                      |   3 +
 README.md                                     |  19 +-
 evals/promptfooconfig.yaml                    |  38 +--
 scripts/integration-test.sh                   | 321 ++++++++++++++++++
 scripts/package.sh                            |   1 -
 scripts/test.sh                               |  30 +-
 src/dot-claude-plugin/plugin.json             |   5 +-
 .../again.md => skills/again/SKILL.md}        |   1 +
 .../tidy.md => skills/tidy/SKILL.md}          |   3 +-
 12 files changed, 417 insertions(+), 67 deletions(-)
 create mode 100755 scripts/integration-test.sh
 rename src/{commands/again.md => skills/again/SKILL.md} (99%)
 rename src/{commands/tidy.md => skills/tidy/SKILL.md} (94%)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 029a9c4..343df47 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -11,14 +11,13 @@
       "name": "look",
       "source": "./src",
       "description": "Sequential code review with fresh agent contexts. Runs multiple independent review passes to catch more issues.",
-      "version": "0.3.0",
+      "version": "0.4.0",
       "author": { "name": "HartBrook" },
       "repository": "https://github.com/HartBrook/lookagain",
       "license": "MIT",
       "keywords": ["code-review", "quality", "iterative", "multi-pass"],
-      "commands": ["./commands/again.md", "./commands/tidy.md"],
       "agents": ["./agents/lookagain-reviewer.md"],
-      "skills": ["./skills/lookagain-output-format"],
+      "skills": ["./skills/again", "./skills/tidy", "./skills/lookagain-output-format"],
       "strict": false
     }
   ]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a83a8c6..11f57aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.4.0] - 2026-01-29
+
+### Changed
+
+- Migrated commands from `commands/` directory to `skills/` directory format (`SKILL.md` files) to fix CLI command resolution issues on newer Claude Code versions. See [GSD plugin issue #218](https://github.com/glittercowboy/get-shit-done/issues/218) for background.
+- Added `disable-model-invocation: true` to `/look:again` and `/look:tidy` to prevent unintended auto-invocation.
+- Renamed frontmatter field `tools` to `allowed-tools` per skills format.
+- Updated test suite, build script, evals, and documentation to reflect new file layout.
+- No changes to user-facing command names: `/look:again` and `/look:tidy` work as before.
+
 ## [0.3.0] - 2026-01-28
 
 ### Changed
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7cb8827..e819e1c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -17,18 +17,18 @@ lookagain/
 ├── .claude-plugin/
 │   └── marketplace.json    # Marketplace manifest
 ├── src/
-│   ├── commands/           # Plugin commands
-│   │   ├── again.md        # Main orchestrator
-│   │   └── tidy.md         # Tidy old review runs
 │   ├── agents/             # Subagent definitions
 │   │   └── lookagain-reviewer.md
-│   ├── skills/             # Output format specs
-│   │   └── lookagain-output-format/
+│   ├── skills/             # Plugin skills (commands + reference)
+│   │   ├── again/          # Main orchestrator (/look:again)
+│   │   ├── tidy/           # Tidy old review runs (/look:tidy)
+│   │   └── lookagain-output-format/  # Output format spec
 │   ├── dot-claude-plugin/  # Plugin manifest (becomes .claude-plugin/)
 │   └── dot-claude/         # Claude settings (becomes .claude/)
 ├── scripts/
 │   ├── package.sh          # Build script
-│   └── test.sh             # Plugin validation tests
+│   ├── test.sh             # Plugin validation tests
+│   └── integration-test.sh # End-to-end integration test
 ├── evals/                  # Behavioral evals (promptfoo)
 │   ├── promptfooconfig.yaml
 │   └── prompt-loader.js
@@ -65,13 +65,27 @@ make test
 # Behavioral evals — verifies models interpret prompt arguments correctly
 # Requires ANTHROPIC_API_KEY
 make eval
+
+# Integration test — end-to-end run of look:again with auto-fix
+# Requires ANTHROPIC_API_KEY
+make integration
 ```
 
-`make test` runs fast, offline checks that validate plugin structure: file existence, JSON validity, frontmatter fields, cross-references between manifests, and that commands accepting arguments use the correct pattern (`argument-hint` in frontmatter, `$ARGUMENTS` placeholder, and a defaults table in the body).
+The three test layers catch different kinds of regressions:
+
+| Layer | Command | What it catches | Cost |
+|-------|---------|----------------|------|
+| **Structure** | `make test` | Broken manifests, missing files, frontmatter drift, cross-reference mismatches | Free, offline, fast |
+| **Evals** | `make eval` | Model misinterpreting arguments (e.g. ignoring `auto-fix=false`, wrong pass count) | API calls (~$0.05/run) |
+| **Integration** | `make integration` | End-to-end failures: review not finding bugs, auto-fix not applying, output artifacts missing, tidy not cleaning up | API calls (~$0.50/run) |
+
+**When to run each:**
 
-`make eval` runs [promptfoo](https://promptfoo.dev) evals that send the interpolated prompts to Claude and assert on behavioral correctness. For example, it verifies that `auto-fix=false` causes the model to skip fixes, and that `passes=5` results in 5 planned passes.
+- **`make test`** — always, before every commit. It's fast and offline. Catches most structural mistakes from editing manifests, renaming files, or changing frontmatter.
+- **`make eval`** — after changing prompt wording or argument handling in skill files. The evals send the interpolated prompts to Claude and assert on behavioral correctness (e.g. that `auto-fix=false` causes the model to skip fixes, that `passes=5` results in 5 planned passes). Cheap enough to run liberally.
+- **`make integration`** — after changes that affect the review pipeline end-to-end (orchestration logic, agent prompts, output format). Spins up a temp git repo with contrived bugs, runs `/look:again` with auto-fix, and verifies that bugs are found, fixed, and that output artifacts are correct. Also tests `/look:tidy` cleanup. More expensive, so run when the cheaper layers aren't sufficient.
 
-Evals require an Anthropic API key and cost a small amount per run. Set the key before running:
+Both `make eval` and `make integration` require an Anthropic API key. Set it before running:
 
 ```bash
 # Option 1: export for the current shell session
@@ -109,22 +123,24 @@ You can also test the plugin through the marketplace install flow, which is clos
 
 ### Key Files
 
-- **[src/commands/again.md](src/commands/again.md)**: Main orchestrator logic. Controls pass execution, auto-fixing, and aggregation.
+- **[src/skills/again/SKILL.md](src/skills/again/SKILL.md)**: Main orchestrator logic. Controls pass execution, auto-fixing, and aggregation.
+- **[src/skills/tidy/SKILL.md](src/skills/tidy/SKILL.md)**: Tidy skill for pruning old review runs.
 - **[src/agents/lookagain-reviewer.md](src/agents/lookagain-reviewer.md)**: Reviewer subagent. Defines how individual review passes work.
 - **[src/skills/lookagain-output-format/SKILL.md](src/skills/lookagain-output-format/SKILL.md)**: JSON output format specification.
 - **[src/dot-claude-plugin/plugin.json](src/dot-claude-plugin/plugin.json)**: Plugin metadata and version.
-- **[src/commands/tidy.md](src/commands/tidy.md)**: Tidy command for pruning old review runs.
 - **[.claude-plugin/marketplace.json](.claude-plugin/marketplace.json)**: Marketplace manifest for plugin discovery and installation.
 
-### Writing Command Prompts
+### Writing Skill Prompts
 
-When editing or adding command prompts in `src/commands/`:
+When editing or adding skills in `src/skills/`:
 
-- **Use `$ARGUMENTS` for the raw string.** Claude Code replaces `$ARGUMENTS` with whatever the user typed after the command name. There is no `$ARGUMENTS.name` dot-access syntax — only `$ARGUMENTS` (whole string) and `$ARGUMENTS[N]` (positional). Do NOT use an `arguments:` array in frontmatter — it is not a supported Claude Code feature and will not be interpolated.
+- Each skill is a directory containing a `SKILL.md` file (e.g., `src/skills/again/SKILL.md`).
+- **Use `$ARGUMENTS` for the raw string.** Claude Code replaces `$ARGUMENTS` with whatever the user typed after the skill name. There is no `$ARGUMENTS.name` dot-access syntax — only `$ARGUMENTS` (whole string) and `$ARGUMENTS[N]` (positional). Do NOT use an `arguments:` array in frontmatter — it is not a supported Claude Code feature and will not be interpolated.
 - **Add `argument-hint`** in frontmatter to document expected input format (e.g., `argument-hint: "[key=value ...]"`).
+- **Add `disable-model-invocation: true`** for user-triggered actions to prevent Claude from auto-invoking them.
 - **Include a defaults table** in the body listing each key, its default, and a description. Instruct the agent to parse `key=value` pairs from `$ARGUMENTS` and fall back to defaults for missing keys.
 - **Log the resolved configuration** so it is visible in output and reviewable in evals.
-- `make test` enforces that commands using `$ARGUMENTS` have `argument-hint` in frontmatter, a defaults table in the body, and do NOT use the unsupported `arguments:` frontmatter array.
+- `make test` enforces that skills using `$ARGUMENTS` have `argument-hint` in frontmatter, a defaults table in the body, and do NOT use the unsupported `arguments:` frontmatter array.
 - After changing prompt logic, run `make eval` to verify models still interpret the arguments correctly.
 
 ## Pull Requests
@@ -138,4 +154,4 @@ When editing or adding command prompts in `src/commands/`:
 
 ## Versioning
 
-Update the version in `src/dot-claude-plugin/plugin.json` when making releases. The marketplace entry in `.claude-plugin/marketplace.json` also has a `version` field — keep both in sync. The test suite (`make test`) validates that commands, agents, and skills match between the two files.
+Update the version in `src/dot-claude-plugin/plugin.json` when making releases. The marketplace entry in `.claude-plugin/marketplace.json` also has a `version` field — keep both in sync. The test suite (`make test`) validates that agents and skills match between the two files.
diff --git a/Makefile b/Makefile
index d837424..8e41568 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,9 @@ dev: build ## Build and start Claude Code with plugin loaded
 eval: ## Run behavioral evals (requires ANTHROPIC_API_KEY)
 	@npx promptfoo@latest eval -c evals/promptfooconfig.yaml
 
+integration: build ## Run integration test (requires ANTHROPIC_API_KEY)
+	@./scripts/integration-test.sh
+
 clean: ## Remove build artifacts
 	@rm -rf dist/
 	@echo "Cleaned dist/"
diff --git a/README.md b/README.md
index 32f9839..f7df2c2 100644
--- a/README.md
+++ b/README.md
@@ -140,15 +140,26 @@ Previous runs are preserved. Use `/look:tidy` to prune old results:
 | should_fix | No          | Performance issues, poor error handling         |
 | suggestion | No          | Refactoring, documentation, style               |
 
+## Updating
+
+To update to the latest version:
+
+```bash
+/plugin marketplace update hartbrook-plugins
+/plugin uninstall look@hartbrook-plugins
+/plugin install look@hartbrook-plugins
+```
+
 ## Development
 
 ```bash
-make test   # Structural validation (offline, fast)
-make eval   # Behavioral evals via promptfoo (requires ANTHROPIC_API_KEY)
-make dev    # Build and start Claude Code with the plugin loaded
+make test          # Structural validation (offline, fast)
+make eval          # Behavioral evals via promptfoo (requires ANTHROPIC_API_KEY)
+make integration   # End-to-end integration test (requires ANTHROPIC_API_KEY)
+make dev           # Build and start Claude Code with the plugin loaded
 ```
 
-See [CONTRIBUTING.md](CONTRIBUTING.md) for full development setup and guidelines.
+Run `make test` before every commit (free, offline). Run `make eval` after changing prompt wording or argument handling. Run `make integration` after changes to the review pipeline or output format. See [CONTRIBUTING.md](CONTRIBUTING.md) for details on what each layer catches.
 
 ## License
 
diff --git a/evals/promptfooconfig.yaml b/evals/promptfooconfig.yaml
index 49f322d..af195f7 100644
--- a/evals/promptfooconfig.yaml
+++ b/evals/promptfooconfig.yaml
@@ -10,11 +10,11 @@ providers:
 
 tests:
   # ==================================================================
-  # again.md — no arguments (defaults)
+  # again — no arguments (defaults)
   # ==================================================================
   - description: "no arguments → model uses all defaults (auto-fix=true, passes=3, thorough)"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
     assert:
       - type: llm-rubric
         value: >
@@ -26,11 +26,11 @@ tests:
           simply means the user wants all defaults.
 
   # ==================================================================
-  # again.md — auto-fix interpretation
+  # again — auto-fix interpretation
   # ==================================================================
   - description: "auto-fix=true → model plans to apply fixes"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
       arg_passes: "3"
       arg_target: staged
       arg_auto-fix: "true"
@@ -45,7 +45,7 @@ tests:
 
   - description: "auto-fix=false → model skips fixes"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
       arg_passes: "3"
       arg_target: staged
       arg_auto-fix: "false"
@@ -59,11 +59,11 @@ tests:
           It should not describe applying any code fixes.
 
   # ==================================================================
-  # again.md — passes count
+  # again — passes count
   # ==================================================================
   - description: "passes=5 → model plans exactly 5 initial passes"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
       arg_passes: "5"
       arg_target: staged
       arg_auto-fix: "true"
@@ -79,11 +79,11 @@ tests:
           5 sequential passes before considering additional passes.
 
   # ==================================================================
-  # again.md — model resolution
+  # again — model resolution
   # ==================================================================
   - description: "model=fast → reviewer uses haiku"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
       arg_passes: "3"
       arg_target: staged
       arg_auto-fix: "true"
@@ -99,7 +99,7 @@ tests:
 
   - description: "model=thorough → no explicit model override"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
       arg_passes: "3"
       arg_target: staged
       arg_auto-fix: "true"
@@ -113,11 +113,11 @@ tests:
           current model). It should NOT set the model to haiku or sonnet.
 
   # ==================================================================
-  # again.md — scope resolution
+  # again — scope resolution
   # ==================================================================
   - description: "target=branch → branch-based diff scope"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
       arg_passes: "3"
       arg_target: branch
       arg_auto-fix: "true"
@@ -131,11 +131,11 @@ tests:
           branch. It should reference branch comparison or merge-base.
 
   # ==================================================================
-  # again.md — partial arguments (only auto-fix specified)
+  # again — partial arguments (only auto-fix specified)
   # ==================================================================
   - description: "only auto-fix=false → defaults for everything else, no fixing"
     vars:
-      prompt_file: src/commands/again.md
+      prompt_file: src/skills/again/SKILL.md
       arg_auto-fix: "false"
     assert:
       - type: llm-rubric
@@ -146,11 +146,11 @@ tests:
           auto-fix is explicitly false.
 
   # ==================================================================
-  # tidy.md — all flag
+  # tidy — all flag
   # ==================================================================
   - description: "all=true → removes all runs"
     vars:
-      prompt_file: src/commands/tidy.md
+      prompt_file: src/skills/tidy/SKILL.md
       arg_keep: "1"
       arg_all: "true"
     assert:
@@ -161,7 +161,7 @@ tests:
 
   - description: "all=false, keep=3 → date-based retention"
     vars:
-      prompt_file: src/commands/tidy.md
+      prompt_file: src/skills/tidy/SKILL.md
       arg_keep: "3"
       arg_all: "false"
     assert:
@@ -172,11 +172,11 @@ tests:
           It should keep runs from the last 3 days.
 
   # ==================================================================
-  # tidy.md — no arguments (defaults)
+  # tidy — no arguments (defaults)
   # ==================================================================
   - description: "tidy with no arguments → keep=1, all=false"
     vars:
-      prompt_file: src/commands/tidy.md
+      prompt_file: src/skills/tidy/SKILL.md
     assert:
       - type: llm-rubric
         value: >
diff --git a/scripts/integration-test.sh b/scripts/integration-test.sh
new file mode 100755
index 0000000..581e493
--- /dev/null
+++ b/scripts/integration-test.sh
@@ -0,0 +1,321 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+# ============================================================
+# Integration test for look:again
+#
+# Creates a temp git repo with a contrived buggy file, runs
+# the plugin with 2 passes (expecting pass 1 fixes the bugs
+# and pass 2 finds no must_fix issues), then chains a tidy
+# to verify cleanup works.
+#
+# Requires: ANTHROPIC_API_KEY, claude CLI
+#
+# Uses --dangerously-skip-permissions since this runs in an
+# isolated temp directory. Times out after 5 minutes per
+# claude invocation.
+# ============================================================
+
+TIMEOUT=300
+
+if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
+    echo "Error: ANTHROPIC_API_KEY is required"
+    exit 1
+fi
+
+if ! command -v claude >/dev/null 2>&1; then
+    echo "Error: claude CLI is required but not found"
+    exit 1
+fi
+
+# Build the plugin
+echo "Building plugin..."
+"$PROJECT_ROOT/scripts/package.sh" > /dev/null
+
+WORK_DIR=$(mktemp -d)
+cleanup() {
+    if [[ "${TEST_FAILED:-0}" -eq 1 ]]; then
+        echo ""
+        echo "Preserving work directory for debugging: $WORK_DIR"
+    else
+        rm -rf "$WORK_DIR"
+    fi
+}
+trap cleanup EXIT
+
+echo "Work directory: $WORK_DIR"
+
+# Helper: run claude with a timeout (perl, since macOS lacks GNU timeout)
+run_claude() {
+    local output_file="$1"
+    shift
+    perl -e '
+        alarm shift;
+        $SIG{ALRM} = sub { kill "TERM", $pid; exit 124 };
+        $pid = fork // die;
+        unless ($pid) { exec @ARGV; die "exec failed: $!" }
+        waitpid $pid, 0;
+        exit ($? >> 8);
+    ' "$TIMEOUT" \
+        claude -p "$@" \
+        --plugin-dir "$PROJECT_ROOT/dist/lookagain" \
+        --dangerously-skip-permissions \
+        > "$output_file" 2>&1
+    local exit_code=$?
+
+    if [[ $exit_code -eq 124 ]]; then
+        echo "Error: claude timed out after ${TIMEOUT}s"
+    elif [[ $exit_code -ne 0 ]]; then
+        echo "Warning: claude exited with code $exit_code (continuing to check output)"
+    fi
+    return 0
+}
+
+# ============================================================
+# Setup: create a git repo with a buggy file
+# ============================================================
+
+cd "$WORK_DIR"
+git init -q
+git config user.email "test@test.com"
+git config user.name "Test"
+
+# Create a Python file with obvious logic bugs:
+# 1. Variable is overwritten before use (result always 0.0)
+# 2. Accessing attribute on None without null check
+cat > buggy.py << 'PYEOF'
+def calculate_total(items: list[dict]) -> float:
+    """Calculate the total price of all items."""
+    total = 0.0
+    for item in items:
+        total += item["price"] * item["quantity"]
+    total = 0.0
+    return total
+
+
+def find_user_name(users: list[dict], user_id: str) -> str:
+    """Find a user's name by ID."""
+    result = None
+    for user in users:
+        if user["id"] == user_id:
+            result = user
+    return result["name"]
+PYEOF
+
+ORIGINAL_CONTENT=$(cat buggy.py)
+
+git add buggy.py
+git commit -q -m "Add buggy module"
+
+# ============================================================
+# Phase 1: run look:again (2 passes, fast model, auto-fix on)
+# ============================================================
+
+echo ""
+echo "Phase 1: Running look:again against buggy.py (2 passes, fast model, auto-fix on)..."
+echo ""
+
+run_claude "$WORK_DIR/again_output.txt" \
+    "/look:again target=buggy.py passes=2 auto-fix=true model=fast"
+
+echo "look:again output saved to $WORK_DIR/again_output.txt"
+
+# ============================================================
+# Verify look:again results
+#
+# Disable set -e so all checks run even if commands fail.
+# ============================================================
+
+set +e
+
+PASS=0
+FAIL=0
+
+pass() {
+    PASS=$((PASS + 1))
+    echo "  ✓ $1"
+}
+
+fail() {
+    FAIL=$((FAIL + 1))
+    echo "  ✗ $1"
+}
+
+echo ""
+echo "=== look:again checks ==="
+echo ""
+
+# 1. File was modified (auto-fix applied)
+CURRENT_CONTENT=$(cat buggy.py)
+if [[ "$CURRENT_CONTENT" != "$ORIGINAL_CONTENT" ]]; then
+    pass "buggy.py was modified (auto-fix applied)"
+else
+    fail "buggy.py was NOT modified (auto-fix did not apply)"
+fi
+
+# 2. The double-assignment bug is fixed
+if ! grep -q 'total = 0\.0' buggy.py || [[ $(grep -c 'total = 0\.0' buggy.py) -lt 2 ]]; then
+    pass "double-assignment bug appears fixed (total = 0.0 not duplicated)"
+else
+    fail "double-assignment bug still present"
+fi
+
+# 3. Output directory exists
+if [[ -d ".lookagain" ]]; then
+    pass ".lookagain/ directory created"
+else
+    fail ".lookagain/ directory not created"
+fi
+
+# 4. Run directory matches expected pattern
+RUN_DIR=""
+if [[ -d ".lookagain" ]]; then
+    RUN_DIRS=$(find .lookagain -maxdepth 1 -mindepth 1 -type d 2>/dev/null | sort)
+    RUN_COUNT=$(echo "$RUN_DIRS" | grep -c . 2>/dev/null || echo 0)
+    if [[ "$RUN_COUNT" -ge 1 ]]; then
+        pass "found $RUN_COUNT run directory(ies)"
+        RUN_DIR=$(echo "$RUN_DIRS" | head -1)
+    else
+        fail "no run directories found"
+    fi
+else
+    fail "no run directories found (no .lookagain/)"
+fi
+
+# 5. Both pass output files exist and are valid JSON
+for pass_num in 1 2; do
+    if [[ -n "$RUN_DIR" ]] && [[ -f "$RUN_DIR/pass-${pass_num}.json" ]]; then
+        if python3 -c "import json,sys; json.load(open(sys.argv[1]))" "$RUN_DIR/pass-${pass_num}.json" 2>/dev/null; then
+            pass "pass-${pass_num}.json exists and is valid JSON"
+        else
+            fail "pass-${pass_num}.json exists but is not valid JSON"
+        fi
+    else
+        fail "pass-${pass_num}.json not found"
+    fi
+done
+
+# 6. Pass 1 found must_fix issues (the contrived bugs)
+if [[ -n "$RUN_DIR" ]] && [[ -f "$RUN_DIR/pass-1.json" ]]; then
+    if python3 -c "
+import json, sys
+d = json.load(open(sys.argv[1]))
+issues = d.get('issues', [])
+must_fix = [i for i in issues if i.get('severity') == 'must_fix']
+sys.exit(0 if must_fix else 1)
+" "$RUN_DIR/pass-1.json" 2>/dev/null; then
+        pass "pass 1 found must_fix issues"
+    else
+        fail "pass 1 did not find must_fix issues"
+    fi
+else
+    fail "cannot check pass 1 must_fix issues (file missing)"
+fi
+
+# 7. Pass 2 found no must_fix issues (bugs were fixed after pass 1)
+if [[ -n "$RUN_DIR" ]] && [[ -f "$RUN_DIR/pass-2.json" ]]; then
+    if python3 -c "
+import json, sys
+d = json.load(open(sys.argv[1]))
+issues = d.get('issues', [])
+must_fix = [i for i in issues if i.get('severity') == 'must_fix']
+sys.exit(0 if not must_fix else 1)
+" "$RUN_DIR/pass-2.json" 2>/dev/null; then
+        pass "pass 2 found no must_fix issues (bugs were fixed)"
+    else
+        fail "pass 2 still found must_fix issues after auto-fix"
+    fi
+else
+    fail "cannot check pass 2 must_fix issues (file missing)"
+fi
+
+# 8. Aggregate JSON exists with issues array
+if [[ -n "$RUN_DIR" ]] && [[ -f "$RUN_DIR/aggregate.json" ]]; then
+    if python3 -c "
+import json, sys
+d = json.load(open(sys.argv[1]))
+assert isinstance(d.get('issues', d.get('findings', [])), list), 'no issues array'
+" "$RUN_DIR/aggregate.json" 2>/dev/null; then
+        pass "aggregate.json exists with issues array"
+    else
+        fail "aggregate.json missing issues array"
+    fi
+else
+    fail "aggregate.json not found"
+fi
+
+# 9. Aggregate markdown exists and is non-empty
+if [[ -n "$RUN_DIR" ]] && [[ -f "$RUN_DIR/aggregate.md" ]]; then
+    if [[ -s "$RUN_DIR/aggregate.md" ]]; then
+        pass "aggregate.md exists and is non-empty"
+    else
+        fail "aggregate.md exists but is empty"
+    fi
+else
+    fail "aggregate.md not found"
+fi
+
+# ============================================================
+# Phase 2: run look:tidy to clean up the run
+# ============================================================
+
+set -e
+
+echo ""
+echo "Phase 2: Running look:tidy all=true..."
+echo ""
+
+run_claude "$WORK_DIR/tidy_output.txt" \
+    "/look:tidy all=true"
+
+echo "look:tidy output saved to $WORK_DIR/tidy_output.txt"
+
+set +e
+
+echo ""
+echo "=== look:tidy checks ==="
+echo ""
+
+# 10. Run directory was removed by tidy
+if [[ -n "$RUN_DIR" ]] && [[ ! -d "$RUN_DIR" ]]; then
+    pass "run directory removed by tidy"
+elif [[ -n "$RUN_DIR" ]]; then
+    fail "run directory still exists after tidy"
+else
+    fail "cannot check tidy (no run directory to remove)"
+fi
+
+# 11. .lookagain/ is empty or gone
+if [[ ! -d ".lookagain" ]]; then
+    pass ".lookagain/ removed entirely"
+else
+    REMAINING=$(find .lookagain -maxdepth 1 -mindepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')
+    if [[ "$REMAINING" -eq 0 ]]; then
+        pass ".lookagain/ exists but is empty (all runs removed)"
+    else
+        fail ".lookagain/ still has $REMAINING run directories after tidy all=true"
+    fi
+fi
+
+# ============================================================
+# Summary
+# ============================================================
+
+echo ""
+echo "=== Results: $PASS passed, $FAIL failed ==="
+
+if [[ $FAIL -gt 0 ]]; then
+    TEST_FAILED=1
+    echo ""
+    echo "--- look:again output (first 200 lines) ---"
+    head -200 "$WORK_DIR/again_output.txt"
+    echo "--- end again output ---"
+    echo ""
+    echo "--- look:tidy output (first 100 lines) ---"
+    head -100 "$WORK_DIR/tidy_output.txt" 2>/dev/null || echo "(no tidy output)"
+    echo "--- end tidy output ---"
+    exit 1
+fi
diff --git a/scripts/package.sh b/scripts/package.sh
index 44940b3..ed4025d 100755
--- a/scripts/package.sh
+++ b/scripts/package.sh
@@ -32,7 +32,6 @@ cp -r "$PROJECT_ROOT/src/dot-claude" "$DIST_DIR/lookagain/.claude"
 cp -r "$PROJECT_ROOT/src/dot-claude-plugin" "$DIST_DIR/lookagain/.claude-plugin"
 # Overlay marketplace.json from repo root into the dist .claude-plugin/ directory
 cp "$PROJECT_ROOT/.claude-plugin/marketplace.json" "$DIST_DIR/lookagain/.claude-plugin/"
-cp -r "$PROJECT_ROOT/src/commands" "$DIST_DIR/lookagain/commands"
 cp -r "$PROJECT_ROOT/src/agents" "$DIST_DIR/lookagain/agents"
 cp -r "$PROJECT_ROOT/src/skills" "$DIST_DIR/lookagain/skills"
 
diff --git a/scripts/test.sh b/scripts/test.sh
index 87b8198..6eadb97 100755
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -82,7 +82,7 @@ test_plugin_json() {
     fi
 
     # Required fields
-    for field in name version description author commands agents; do
+    for field in name version description author agents skills; do
         check_json_field "$pjson" "$field"
     done
 
@@ -111,8 +111,8 @@ sys.exit(0 if isinstance(d.get('author'), dict) and 'name' in d['author'] else 1
 
 test_required_files() {
     check_file "src/dot-claude-plugin/plugin.json"
-    check_file "src/commands/again.md"
-    check_file "src/commands/tidy.md"
+    check_file "src/skills/again/SKILL.md"
+    check_file "src/skills/tidy/SKILL.md"
     check_file "src/agents/lookagain-reviewer.md"
     check_file "src/skills/lookagain-output-format/SKILL.md"
     check_file "src/dot-claude/settings.local.json"
@@ -123,21 +123,21 @@ test_required_files() {
 }
 
 test_frontmatter() {
-    check_frontmatter "$PROJECT_ROOT/src/commands/again.md" name description
-    check_frontmatter "$PROJECT_ROOT/src/commands/tidy.md" name description
+    check_frontmatter "$PROJECT_ROOT/src/skills/again/SKILL.md" name description
+    check_frontmatter "$PROJECT_ROOT/src/skills/tidy/SKILL.md" name description
     check_frontmatter "$PROJECT_ROOT/src/agents/lookagain-reviewer.md" name description tools
     check_frontmatter "$PROJECT_ROOT/src/skills/lookagain-output-format/SKILL.md" name description
 }
 
 test_argument_handling() {
-    # Verify that command files using arguments follow the correct pattern:
+    # Verify that skill files using arguments follow the correct pattern:
     # 1. Frontmatter has argument-hint (not the unsupported arguments: array)
     # 2. Body contains $ARGUMENTS placeholder for the raw argument string
     # 3. Body contains a defaults table with Key/Default columns
     # This ensures the agent receives and parses arguments at runtime
     # rather than relying on non-existent compile-time interpolation.
 
-    for file in "$PROJECT_ROOT"/src/commands/*.md; do
+    for file in "$PROJECT_ROOT"/src/skills/*/SKILL.md; do
         local relpath="${file#"$PROJECT_ROOT"/}"
 
         local frontmatter
@@ -150,7 +150,7 @@ test_argument_handling() {
             !in_fm { print }
         ' "$file")
 
-        # Skip commands that don't accept arguments
+        # Skip skills that don't accept arguments
         if ! echo "$body" | grep -qF '$ARGUMENTS'; then
             continue
         fi
@@ -180,16 +180,6 @@ test_argument_handling() {
 test_cross_references() {
     local pjson="$PROJECT_ROOT/src/dot-claude-plugin/plugin.json"
 
-    # Commands resolve
-    while IFS= read -r cmd; do
-        local resolved="$PROJECT_ROOT/src/${cmd#./}"
-        if [[ -f "$resolved" ]]; then
-            pass "command $cmd resolves"
-        else
-            fail "command $cmd not found at src/${cmd#./}"
-        fi
-    done < <(python3 -c "import json,sys; [print(c) for c in json.load(open(sys.argv[1])).get('commands', [])]" "$pjson")
-
     # Agents resolve
     while IFS= read -r agent; do
         local resolved="$PROJECT_ROOT/src/${agent#./}"
@@ -243,7 +233,7 @@ test_build() {
         fail "dist marketplace.json is invalid"
     fi
 
-    for f in commands/again.md commands/tidy.md agents/lookagain-reviewer.md skills/lookagain-output-format/SKILL.md README.md; do
+    for f in agents/lookagain-reviewer.md skills/again/SKILL.md skills/tidy/SKILL.md skills/lookagain-output-format/SKILL.md README.md; do
         if [[ -f "$dist/$f" ]]; then
             pass "dist/$f exists"
         else
@@ -339,7 +329,7 @@ sys.exit(0 if pv == mv else 1)
     fi
 
     # Commands, agents, skills match plugin.json
-    for field in commands agents skills; do
+    for field in agents skills; do
         if python3 -c "
 import json, sys
 p = sorted(json.load(open(sys.argv[1])).get(sys.argv[3], []))
diff --git a/src/dot-claude-plugin/plugin.json b/src/dot-claude-plugin/plugin.json
index a60ffe1..61ad0b7 100644
--- a/src/dot-claude-plugin/plugin.json
+++ b/src/dot-claude-plugin/plugin.json
@@ -1,11 +1,10 @@
 {
   "name": "look",
-  "version": "0.3.0",
+  "version": "0.4.0",
   "description": "Sequential code review with fresh agent contexts. Runs multiple independent review passes to catch more issues.",
   "author": { "name": "HartBrook" },
   "repository": "https://github.com/HartBrook/lookagain",
   "keywords": ["code-review", "quality", "iterative", "multi-pass"],
-  "commands": ["./commands/again.md", "./commands/tidy.md"],
   "agents": ["./agents/lookagain-reviewer.md"],
-  "skills": ["./skills/lookagain-output-format"]
+  "skills": ["./skills/again", "./skills/tidy", "./skills/lookagain-output-format"]
 }
diff --git a/src/commands/again.md b/src/skills/again/SKILL.md
similarity index 99%
rename from src/commands/again.md
rename to src/skills/again/SKILL.md
index 4c2f395..83a816e 100644
--- a/src/commands/again.md
+++ b/src/skills/again/SKILL.md
@@ -1,6 +1,7 @@
 ---
 name: again
 description: Run sequential code review passes with fresh contexts to catch more issues
+disable-model-invocation: true
 argument-hint: "[key=value ...]"
 ---
 
diff --git a/src/commands/tidy.md b/src/skills/tidy/SKILL.md
similarity index 94%
rename from src/commands/tidy.md
rename to src/skills/tidy/SKILL.md
index b79b221..d16aea5 100644
--- a/src/commands/tidy.md
+++ b/src/skills/tidy/SKILL.md
@@ -1,7 +1,8 @@
 ---
 name: tidy
 description: Remove old lookagain review runs, keeping today's results by default
-tools: Glob, Bash(rm -rf .lookagain/????-??-??T??-??-??)
+disable-model-invocation: true
+allowed-tools: Glob, Bash(rm -rf .lookagain/????-??-??T??-??-??)
 argument-hint: "[key=value ...]"
 ---