failurefirst · dependabot · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/.DS_Store b/.DS_Store
diff --git a/.github/workflows/history-cleanup-696.yml b/.github/workflows/history-cleanup-696.yml
@@ -0,0 +1,95 @@
+# One-off, manually triggered maintenance job: rewrites the repository history
+# with git-filter-repo to drop large blobs and media files, verifies the
+# result, and force-pushes the rewritten branches and tags.
+name: History cleanup 696
+
+on:
+  workflow_dispatch:
+
+# The final step force-pushes rewritten refs, which needs write access.
+permissions:
+  contents: write
+
+env:
+  # NOTE(review): FILTER_DATE is not referenced by any step below -- confirm
+  # whether it is still needed.
+  FILTER_DATE: '2026-05-08'
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-latest
+    timeout-minutes: 120
+    steps:
+      - name: Install git-filter-repo
+        run: |
+          set -euo pipefail
+          python3 -m pip install --user git-filter-repo
+          # Expose the user-level script directory to the following steps.
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Mirror, filter, verify, and force-push
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          REMOTE_URL="https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git"
+          git clone --mirror "$REMOTE_URL" ff-working.git
+          cd ff-working.git
+
+          # Print the total size in bytes of all regular files under "$1".
+          # find -printf avoids running stat with no operands on an empty
+          # tree, and %.0f keeps large sums in plain integer notation.
+          bytes_for_dir() {
+            find "$1" -type f -printf '%s\n' | awk '{s+=$1} END {printf "%.0f\n", s}'
+          }
+
+          # Snapshot before rewriting, for the sanity checks below.
+          BASELINE_COMMITS=$(git rev-list --all --count)
+          BASELINE_REFS=$(git for-each-ref --format='%(refname)' | wc -l | tr -d ' ')
+          BASELINE_BYTES=$(bytes_for_dir .)
+          printf 'BASELINE commits=%s refs=%s bytes=%s\n' "$BASELINE_COMMITS" "$BASELINE_REFS" "$BASELINE_BYTES"
+
+          # Pass 1: drop every blob larger than 10M.
+          git filter-repo --strip-blobs-bigger-than 10M --force
+          # Pass 2: drop the media directories and any mp4/m4a under images.
+          git filter-repo \
+            --invert-paths \
+            --path docs/video/ \
+            --path docs/audio/ \
+            --path site/public/video/ \
+            --path site/public/audio/ \
+            --path-glob 'docs/images/**/*.mp4' \
+            --path-glob 'docs/images/**/*.m4a' \
+            --path-glob 'site/public/images/**/*.mp4' \
+            --path-glob 'site/public/images/**/*.m4a' \
+            --force
+
+          POST_COMMITS=$(git rev-list --all --count)
+          POST_REFS=$(git for-each-ref --format='%(refname)' | wc -l | tr -d ' ')
+          POST_BYTES=$(bytes_for_dir .)
+          printf 'POST commits=%s refs=%s bytes=%s\n' "$POST_COMMITS" "$POST_REFS" "$POST_BYTES"
+          git count-objects -vH
+
+          # Abort unless the rewritten mirror is under 2,000,000,000 bytes, kept
+          # at least half of its commits, and has the same number of refs.
+          test "$POST_BYTES" -lt 2000000000
+          test "$POST_COMMITS" -le "$BASELINE_COMMITS"
+          test "$POST_COMMITS" -ge $(( BASELINE_COMMITS / 2 ))
+          test "$POST_REFS" -eq "$BASELINE_REFS"
+
+          # Re-scan the rewritten history; any grep match below is a failure.
+          git filter-repo --analyze --force
+          if grep -E '\.(mp4|m4a|mp3|wav|ogg)$' filter-repo/analysis/extensions-all-sizes.txt; then
+            echo 'media extensions remain after filter' >&2
+            exit 1
+          fi
+          if grep -E '<present> (docs/video|docs/audio|site/public/video|site/public/audio|docs/images/.*\.(mp4|m4a)|site/public/images/.*\.(mp4|m4a))' filter-repo/analysis/path-all-sizes.txt; then
+            echo 'targeted media paths remain after filter' >&2
+            exit 1
+          fi
+
+          # git filter-repo removes the "origin" remote after a rewrite, so
+          # set-url would fail here; recreate the remote instead.
+          git remote remove origin 2>/dev/null || true
+          git remote add origin "$REMOTE_URL"
+          git push --force --prune origin '+refs/heads/*:refs/heads/*' '+refs/tags/*:refs/tags/*'
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,12 @@
+# Afterwords TTS voice override
+.afterwords
+
+# OS files
+.DS_Store
+
+# IDE
+.vscode/
+
+# Superpowers brainstorm artifacts
+.superpowers/
+.wrangler/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,62 +1,69 @@
-# Contributing to Failure-First Embodied AI
+# Contributing to Failure-First
 
-Thank you for your interest in contributing to Failure-First Embodied AI!
+Thank you for your interest in Failure-First. This is a **research project**, not a typical open-source codebase. Contributions are welcome, but the ways to contribute differ from a standard software project.
 
-## Important: Public Repository Context
+## How to Contribute
 
-This is the **public-facing** repository for the Failure-First research project. Contributions must adhere to strict safety guidelines to ensure all content remains:
-- Pattern-level only (never operational)
-- Defensively purposed
-- Appropriate for public academic discourse
+### Report Issues
 
-## What to Contribute
+If you find errors in our published findings, methodology gaps, broken links on [failurefirst.org](https://failurefirst.org), or inconsistencies in the public documentation, please open a GitHub issue.
 
-**✅ Welcome Contributions:**
-- Documentation improvements
-- Research methodology clarifications
-- Failure taxonomy additions (pattern-level)
-- Website improvements
-- Typo fixes and clarity improvements
+### Cite Our Work
 
-**❌ Not Accepted:**
-- Operational exploit code
-- Working jailbreak prompts
-- Model-specific bypass techniques
-- Raw test results or adversarial datasets
+The most impactful contribution for a research project is citation. If our findings, datasets, or methodology inform your work, please cite us:
 
-## Contribution Process
+```bibtex
+@software{failure_first_2026,
+  title   = {Failure-First: Adversarial Evaluation Framework for Embodied AI},
+  author  = {Wedd, Adrian},
+  year    = {2026},
+  url     = {https://failurefirst.org},
+  note    = {258 models, 142{,}307 prompts, 346 attack techniques}
+}
+```
 
-1. **Fork** the repository
-2. **Create a branch** for your changes
-3. **Make your changes** following our guidelines
-4. **Submit a pull request** with a clear description
+### Red-Team Collaboration
 
-## Safety Review
+We welcome collaboration with AI safety researchers, red-team practitioners, and frontier lab security teams. If you have adversarial evaluation results, novel attack technique taxonomies, or defense effectiveness data you would like to contribute or cross-validate, open a GitHub issue describing your institutional affiliation and research focus.
+
+### Dataset Contributions
+
+If you have adversarial evaluation datasets that could strengthen the corpus, we accept contributions subject to:
+
+- **Pattern-level only**: no operational exploits or copy-paste attack templates
+- **Provenance documented**: source, collection methodology, and intended use
+- **Schema compliance**: data must conform to our versioned JSON Schemas (documented in the private repository; we will assist with formatting)
+- **Safety review**: all contributed data undergoes review before inclusion
+
+### Documentation Improvements
+
+Corrections, clarifications, and improvements to public-facing documentation (this repository, the design charter, security policy) are welcome via pull request.
 
-All contributions undergo safety review to ensure:
-- No operational exploit instructions
-- Pattern-level descriptions only
-- Appropriate for public repository
-- Aligned with defensive research mission
+## What We Do Not Accept
 
-## Code of Conduct
+- Operational exploit code or working jailbreak prompts
+- Model-specific bypass techniques intended for attack
+- Raw adversarial datasets without provenance
+- Content whose purpose is to facilitate real-world harm rather than to support AI safety research
 
-- Be respectful and professional
-- Focus on defensive AI safety research
-- No weaponization of research findings
-- Maintain academic integrity
+## Vulnerability Reporting
 
-## Questions?
+If you discover vulnerabilities in AI systems -- whether through this framework or independent research -- please follow responsible disclosure practices. See [SECURITY.md](SECURITY.md) for our coordinated disclosure process.
 
-- **Issues**: Open a GitHub issue for questions or suggestions
-- **Discussions**: Use GitHub Discussions for research-related conversations
+## Process
+
+1. Open a GitHub issue describing the proposed contribution (documentation fixes can skip straight to step 2)
+2. For documentation changes, submit a pull request directly
+3. For research collaborations and dataset contributions, we will coordinate via issue discussion
+
+## Safety Review
+
+All contributions undergo safety review to ensure content remains pattern-level, defensively purposed, and appropriate for a public repository. This review is not optional and applies equally to maintainers and external contributors.
 
 ## License
 
-By contributing, you agree that your contributions will be licensed under the MIT License, the same license as this project.
+By contributing, you agree that your contributions will be licensed under the MIT License.
 
 ---
 
-**Remember:** This is defensive AI safety research. All contributions should strengthen defenses, not enable attacks.
-
-**Last updated:** 2026-02-01
+**Last updated:** 2026-05-06
diff --git a/DESIGN_CHARTER.md b/DESIGN_CHARTER.md
@@ -22,7 +22,14 @@ This is a **research methodology for studying AI safety through systematic failu
 
 At its center is a principle: **failure is signal, not noise**.
 
-The framework exists to support *rigorous failure analysis, defensive research, and safety boundary mapping*.
+The framework exists to support *rigorous failure analysis, defensive research, and safety boundary mapping* across the full landscape of adversarial AI evaluation:
+
+- **Jailbreak archaeology**: systematic study of how adversarial techniques evolve across eras, from early DAN-style prompts through crescendo attacks, format-lock exploitation, and reasoning-chain manipulation
+- **VLA safety evaluation**: 42 attack families targeting vision-language-action models, covering affordance manipulation, kinematic injection, safety instruction dilution, and dual-layer attacks
+- **Multi-turn escalation**: stateful attack sequences that exploit context accumulation in reasoning and tool-using models
+- **Format-lock attacks**: structured output compliance (JSON, YAML, code) as an attack vector against safety training
+- **Defense effectiveness measurement**: quantifying how safety mechanisms perform under adversarial pressure, including positional bias, iatrogenic effects, and cross-model vulnerability inheritance
+- **Classifier reliability**: documenting the gap between heuristic and LLM-graded attack success rates
 
 This is not a demonstration.
 It is not an attack platform.
@@ -59,7 +66,7 @@ The tooling must not pretend otherwise.
 These are not preferences.
 They are constraints.
 
-If an implementation violates any of the following, it is wrong—even if it "works".
+If an implementation violates any of the following, it is wrong -- even if it "works".
 
 ### 3.1 Pattern-Level Only, Never Operational
 
@@ -109,6 +116,14 @@ If an implementation violates any of the following, it is wrong—even if it "wo
 - No security through obscurity
 - If the process cannot withstand scrutiny, it should not be published
 
+### 3.7 Statistical Rigor Over Convenience
+
+- All attack success rate (ASR) claims must specify grading methodology (LLM-only, heuristic-only, or coalesced)
+- Confidence intervals are required for sample-based estimates
+- Multiple-comparison corrections are mandatory for pairwise model comparisons
+- Classifier reliability must be documented with inter-rater agreement metrics
+- If a finding cannot survive peer review scrutiny, it should not be published
+
 ---
 
 ## 4. Architectural Decisions (Final)
@@ -121,8 +136,10 @@ They are no longer exploratory.
 - **Validation**: JSON Schema + custom invariant checks
 - **Safety gates**: Heuristic linters + manual review
 - **Benchmark runners**: Model-agnostic (CLI templates + HTTP API adapters)
-- **Scoring**: Refusal quality, recovery mechanisms, attack success rates
+- **Scoring**: Three-tier ASR (strict, broad, functionally dangerous) with Wilson CIs
+- **Classification**: LLM-graded (FLIP protocol) with documented grader reliability
 - **Version control**: Git, with CI enforcement of validation/linting
+- **Database**: SQLite corpus unifying prompts, techniques, and evaluation results
 
 If a future change requires revisiting these decisions, the charter must be updated explicitly.
 
@@ -152,7 +169,7 @@ If adversarial content can be weaponized directly, it has violated this charter.
 
 This document defines intent and limits.
 
-Implementation details—including tool design, algorithms, dataset schema extensions, benchmark metrics, and acceptance criteria—are defined in issues, documentation, and code.
+Implementation details -- including tool design, algorithms, dataset schema extensions, benchmark metrics, and acceptance criteria -- are defined in issues, documentation, and code.
 
 The relationship is explicit:
 
@@ -210,7 +227,7 @@ When using AI systems to generate adversarial scenarios:
 
 ## 9. Research Ethics Boundaries
 
-This project operates within established AI safety research norms:
+This project operates within established AI safety research norms. A full research ethics charter is maintained in the private repository.
 
 ### Acceptable Research Activities
 
@@ -219,6 +236,7 @@ This project operates within established AI safety research norms:
 - Testing robustness of safety mechanisms
 - Improving alignment under adversarial pressure
 - Publishing defensive research findings
+- Coordinated vulnerability disclosure to model providers
 
 ### Unacceptable Activities
 
@@ -230,10 +248,10 @@ This project operates within established AI safety research norms:
 
 ### Disclosure Standards
 
-- Vulnerabilities discovered through this research should be disclosed responsibly
-- Real-world safety issues should be reported to affected parties before public disclosure
-- Research findings should distinguish between controlled evaluation and real-world risk
-- Limitations of evaluation harnesses must be stated explicitly
+- Vulnerabilities discovered through this research are disclosed responsibly
+- Real-world safety issues are reported to affected parties before public disclosure
+- Research findings distinguish between controlled evaluation and real-world risk
+- Limitations of evaluation harnesses are stated explicitly
 
 ---
 
@@ -249,8 +267,8 @@ This charter may evolve as the project grows, but changes must be:
 Minor clarifications (typo fixes, example additions) do not require versioning.
 Substantive changes (adding/removing principles, changing constraints) require charter version increment.
 
-**Current version**: 1.0
-**Last updated**: 2025-01-11
+**Current version**: 2.0
+**Last updated**: 2026-03-29
 
 ---
 

diff --git a/MANIFEST.json b/MANIFEST.json
@@ -3,14 +3,17 @@
   "note": "Full traces available under NDA. Contact via GitHub issue.",
   "generated_from": "failure-first-embodied-ai (private)",
   "totals": {
-    "files": 632,
+    "files": 860,
     "invariant_errors": 0,
     "json_parse_errors": 0,
-    "rows": 51201,
+    "rows": 60847,
     "schema_errors": 0,
-    "failure_classes": 661,
-    "domains": 19,
-    "models_evaluated": 51
+    "prompts": 142307,
+    "results": 140794,
+    "techniques": 346,
+    "harm_classes": 139,
+    "domains": 28,
+    "models_evaluated": 258
   },
   "packs_by_kind": {
     "adversarial_poetry": 3,
@@ -493,7 +496,7 @@
       "validation_ok": true
     },
     {
-      "path": "data/generated_attacks/massive_scale/expanded/Conceptual_Semantic_M\u00f6bius_Strip.jsonl",
+      "path": "data/generated_attacks/massive_scale/expanded/Conceptual_Semantic_Möbius_Strip.jsonl",
       "rows": 81,
       "bytes": 59756,
       "pack_kind": "massive_scale_expanded",
@@ -1354,4 +1357,4 @@
       "validation_ok": true
     }
   ]
-}
+}