From bbcdc236045f04bc1abfe324757eeb9a4c46f7ef Mon Sep 17 00:00:00 2001
From: Ethan Troy <63926014+ethanolivertroy@users.noreply.github.com>
Date: Thu, 14 May 2026 04:37:33 +0000
Subject: [PATCH 1/2] add api validation provenance and schemas

---
 .github/workflows/update-data.yml             |   5 +
 README.md                                     |  40 +-
 api/schemas/algorithms.schema.json            |  50 ++
 api/schemas/certificate-detail.schema.json    | 195 +++++
 api/schemas/historical-modules.schema.json    |  22 +
 api/schemas/index.schema.json                 |  49 ++
 api/schemas/metadata.schema.json              |  70 ++
 api/schemas/module-in-process.schema.json     |  30 +
 api/schemas/module.schema.json                | 155 ++++
 api/schemas/modules-in-process.schema.json    |  22 +
 api/schemas/modules.schema.json               |  22 +
 scraper.py                                    | 733 +++++++++++++++++-
 test_scraper.py                               | 271 +++++++
 .../5152_fips_140_2_algorithms.txt            |  21 +
 .../5260_fips_140_3_algorithms.txt            |  17 +
 validate_api.py                               | 508 ++++++++++++
 16 files changed, 2168 insertions(+), 42 deletions(-)
 create mode 100644 api/schemas/algorithms.schema.json
 create mode 100644 api/schemas/certificate-detail.schema.json
 create mode 100644 api/schemas/historical-modules.schema.json
 create mode 100644 api/schemas/index.schema.json
 create mode 100644 api/schemas/metadata.schema.json
 create mode 100644 api/schemas/module-in-process.schema.json
 create mode 100644 api/schemas/module.schema.json
 create mode 100644 api/schemas/modules-in-process.schema.json
 create mode 100644 api/schemas/modules.schema.json
 create mode 100644 tests/fixtures/nist_security_policies/5152_fips_140_2_algorithms.txt
 create mode 100644 tests/fixtures/nist_security_policies/5260_fips_140_3_algorithms.txt
 create mode 100644 validate_api.py

diff --git a/.github/workflows/update-data.yml b/.github/workflows/update-data.yml
index cfa1a6250..24df0262a 100644
--- a/.github/workflows/update-data.yml
+++ b/.github/workflows/update-data.yml
@@ -16,6 +16,7 @@ on:
       - main
     paths:
       - 'scraper.py'
+      - 'validate_api.py'
       - 'requirements.txt'
       - '.github/workflows/update-data.yml'
 
@@ -49,6 +50,10 @@ jobs:
         run: |
           python scraper.py
 
+      - name: Validate generated API artifacts
+        run: |
+          python validate_api.py --require-current-schema --forbid-firecrawl-run-source
+
       - name: Check for changes
         id: git-check
         run: |
diff --git a/README.md b/README.md
index 676bdbf12..f9029d847 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ Static JSON API for NIST Cryptographic Module Validation Program data. Auto-upda
 - **Historical Modules**: Expired/revoked modules for historical reference
 - **Modules In Process**: Modules currently in validation
 - **Algorithm Extraction**: Approved algorithms extracted from Security Policy PDFs with Crawl4AI, with a local PDF parser fallback
+- **Extraction Provenance**: Per-certificate `algorithm_extraction` metadata records cache/fallback status, source URL, and extracted row counts
 - **Security Policy Links**: Direct URLs to Security Policy PDF documents
 - **Certificate Detail Records**: Per-certificate JSON with vendor, related files, validation history, and security level exceptions
 
@@ -17,6 +18,7 @@ Static JSON API for NIST Cryptographic Module Validation Program data. Auto-upda
 - [`llms-full.txt`](https://hackidle.github.io/nist-cmvp-api/llms-full.txt) - complete single-file reference
 - [`api/docs.md`](https://hackidle.github.io/nist-cmvp-api/api/docs.md) - Markdown endpoint reference with examples
 - [`openapi.json`](https://hackidle.github.io/nist-cmvp-api/openapi.json) - OpenAPI 3.0.3 schema
+- [`api/schemas/index.schema.json`](https://hackidle.github.io/nist-cmvp-api/api/schemas/index.schema.json) - JSON Schema index for API responses
 
 ## Endpoints
 
@@ -30,6 +32,7 @@ Base URL: `https://hackidle.github.io/nist-cmvp-api/api/`
 | `algorithms.json` | Algorithm summary with usage statistics across all certificates |
 | `metadata.json` | Dataset info (last update, counts, feature flags) |
 | `index.json` | API index with all endpoints and feature information |
+| `schemas/*.schema.json` | JSON Schemas for response validation |
 | `certificates/{certificate}.json` | Structured detail record for one CMVP certificate |
 
 ## Data Structure
@@ -54,7 +57,17 @@ Base URL: `https://hackidle.github.io/nist-cmvp-api/api/`
   "embodiment": "Multi-Chip Stand Alone",
   "description": "A software library that contains cryptographic functionality...",
   "lab": "DEKRA Cybersecurity Certification Laboratory",
-  "algorithms": ["AES", "SHA-256", "RSA", "ECDSA", "HMAC", "DRBG"]
+  "algorithms": ["AES", "SHA-256", "RSA", "ECDSA", "HMAC", "DRBG"],
+  "algorithm_extraction": {
+    "status": "parsed",
+    "configured_source": "crawl4ai",
+    "source": "crawl4ai",
+    "source_url": "https://csrc.nist.gov/CSRC/media/projects/.../140sp5104.pdf",
+    "cached": false,
+    "fallback_used": false,
+    "algorithm_count": 6,
+    "detailed_algorithm_count": 42
+  }
 }
 ```
 
@@ -121,7 +134,17 @@ Base URL: `https://hackidle.github.io/nist-cmvp-api/api/`
         "lab": "Lightship Security, Inc."
       }
     ],
-    "algorithms": ["AES", "HMAC"]
+    "algorithms": ["AES", "HMAC"],
+    "algorithm_extraction": {
+      "status": "parsed",
+      "configured_source": "crawl4ai",
+      "source": "security_policy_pdf",
+      "source_url": "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5203.pdf",
+      "cached": false,
+      "fallback_used": true,
+      "algorithm_count": 2,
+      "detailed_algorithm_count": 18
+    }
   }
 }
 ```
@@ -147,8 +170,14 @@ curl -s https://hackidle.github.io/nist-cmvp-api/api/algorithms.json | \
 # Get the full detail page payload for one certificate
 curl -s https://hackidle.github.io/nist-cmvp-api/api/certificates/5203.json | jq '.certificate'
 
-# Check last update
-curl -s https://hackidle.github.io/nist-cmvp-api/api/metadata.json | jq '.generated_at'
+# Check last update and extraction metrics
+curl -s https://hackidle.github.io/nist-cmvp-api/api/metadata.json | \
+  jq '{generated_at, extraction_metrics: .extraction_metrics.combined}'
+
+# Validate a response with a published JSON Schema (requires: pip install jsonschema)
+curl -s https://hackidle.github.io/nist-cmvp-api/api/schemas/modules.schema.json > modules.schema.json
+curl -s https://hackidle.github.io/nist-cmvp-api/api/modules.json > modules.json
+python -m jsonschema modules.schema.json -i modules.json
 ```
 
 ## Local Development
@@ -165,6 +194,9 @@ ALGORITHM_SOURCE=security_policy_pdf python scraper.py
 
 # Run quick scraper (skip algorithm extraction entirely)
 SKIP_ALGORITHMS=1 python scraper.py
+
+# Validate generated artifacts before publishing
+python validate_api.py --require-current-schema --forbid-firecrawl-run-source
 ```
 
 ## Environment Variables
diff --git a/api/schemas/algorithms.schema.json b/api/schemas/algorithms.schema.json
new file mode 100644
index 000000000..72bcdd158
--- /dev/null
+++ b/api/schemas/algorithms.schema.json
@@ -0,0 +1,50 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/algorithms.schema.json",
+  "title": "NIST CMVP Algorithms Summary Response",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "total_unique_algorithms",
+    "total_certificate_algorithm_pairs",
+    "algorithms",
+    "metadata"
+  ],
+  "properties": {
+    "total_unique_algorithms": {
+      "type": "integer",
+      "minimum": 0
+    },
+    "total_certificate_algorithm_pairs": {
+      "type": "integer",
+      "minimum": 0
+    },
+    "algorithms": {
+      "type": "object",
+      "additionalProperties": {
+        "type": "object",
+        "additionalProperties": false,
+        "required": [
+          "count",
+          "certificates"
+        ],
+        "properties": {
+          "count": {
+            "type": "integer",
+            "minimum": 0
+          },
+          "certificates": {
+            "type": "array",
+            "items": {
+              "type": "integer"
+            }
+          }
+        }
+      }
+    },
+    "metadata": {
+      "type": "object",
+      "additionalProperties": true
+    }
+  }
+}
diff --git a/api/schemas/certificate-detail.schema.json b/api/schemas/certificate-detail.schema.json
new file mode 100644
index 000000000..027cf8274
--- /dev/null
+++ b/api/schemas/certificate-detail.schema.json
@@ -0,0 +1,195 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/certificate-detail.schema.json",
+  "title": "NIST CMVP Certificate Detail Response",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "metadata",
+    "certificate"
+  ],
+  "properties": {
+    "metadata": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": [
+        "generated_at",
+        "dataset",
+        "source"
+      ]
+    },
+    "certificate": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": [
+        "certificate_number",
+        "dataset",
+        "generated_at",
+        "nist_page_url",
+        "certificate_detail_url",
+        "security_policy_url",
+        "vendor_name",
+        "module_name",
+        "standard",
+        "status",
+        "related_files",
+        "validation_history",
+        "vendor"
+      ],
+      "properties": {
+        "certificate_number": {
+          "type": "string",
+          "pattern": "^[0-9]+$"
+        },
+        "dataset": {
+          "type": "string",
+          "enum": [
+            "active",
+            "historical"
+          ]
+        },
+        "generated_at": {
+          "type": "string",
+          "format": "date-time"
+        },
+        "nist_page_url": {
+          "type": "string",
+          "format": "uri"
+        },
+        "certificate_detail_url": {
+          "type": "string",
+          "format": "uri"
+        },
+        "security_policy_url": {
+          "type": [
+            "string",
+            "null"
+          ],
+          "format": "uri"
+        },
+        "vendor_name": {
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "module_name": {
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "standard": {
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "status": {
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "related_files": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "additionalProperties": true
+          }
+        },
+        "validation_history": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "additionalProperties": true
+          }
+        },
+        "vendor": {
+          "type": "object",
+          "additionalProperties": true
+        },
+        "algorithms": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "algorithms_detailed": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "algorithm_extraction": {
+          "type": "object",
+          "additionalProperties": true,
+          "required": [
+            "schema_version",
+            "status",
+            "configured_source",
+            "source",
+            "cached",
+            "fallback_used",
+            "cache_version",
+            "algorithm_count",
+            "detailed_algorithm_count"
+          ],
+          "properties": {
+            "schema_version": {
+              "type": "string"
+            },
+            "status": {
+              "type": "string",
+              "enum": [
+                "parsed",
+                "cached",
+                "miss",
+                "skipped"
+              ]
+            },
+            "configured_source": {
+              "type": "string"
+            },
+            "source": {
+              "type": "string"
+            },
+            "source_url": {
+              "type": [
+                "string",
+                "null"
+              ],
+              "format": "uri"
+            },
+            "cached": {
+              "type": "boolean"
+            },
+            "fallback_used": {
+              "type": "boolean"
+            },
+            "cache_version": {
+              "type": "string"
+            },
+            "algorithm_count": {
+              "type": "integer",
+              "minimum": 0
+            },
+            "detailed_algorithm_count": {
+              "type": "integer",
+              "minimum": 0
+            },
+            "attempts": {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "additionalProperties": {
+                  "type": "string"
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/api/schemas/historical-modules.schema.json b/api/schemas/historical-modules.schema.json
new file mode 100644
index 000000000..91c0b8aa4
--- /dev/null
+++ b/api/schemas/historical-modules.schema.json
@@ -0,0 +1,22 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/historical-modules.schema.json",
+  "title": "NIST CMVP Historical Modules Response",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "metadata",
+    "modules"
+  ],
+  "properties": {
+    "metadata": {
+      "$ref": "metadata.schema.json"
+    },
+    "modules": {
+      "type": "array",
+      "items": {
+        "$ref": "module.schema.json"
+      }
+    }
+  }
+}
diff --git a/api/schemas/index.schema.json b/api/schemas/index.schema.json
new file mode 100644
index 000000000..f89101f53
--- /dev/null
+++ b/api/schemas/index.schema.json
@@ -0,0 +1,49 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/index.schema.json",
+  "title": "NIST CMVP API JSON Schema Index",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "name",
+    "schema_version",
+    "base_url",
+    "schemas"
+  ],
+  "properties": {
+    "name": {
+      "type": "string"
+    },
+    "schema_version": {
+      "type": "string"
+    },
+    "base_url": {
+      "type": "string",
+      "format": "uri"
+    },
+    "schemas": {
+      "type": "object",
+      "additionalProperties": {
+        "type": "string"
+      }
+    }
+  },
+  "examples": [
+    {
+      "name": "NIST CMVP API JSON Schemas",
+      "schema_version": "1.0",
+      "base_url": "https://hackidle.github.io/nist-cmvp-api",
+      "schemas": {
+        "index": "/api/schemas/index.schema.json",
+        "metadata": "/api/schemas/metadata.schema.json",
+        "module": "/api/schemas/module.schema.json",
+        "module_in_process": "/api/schemas/module-in-process.schema.json",
+        "modules": "/api/schemas/modules.schema.json",
+        "historical_modules": "/api/schemas/historical-modules.schema.json",
+        "modules_in_process": "/api/schemas/modules-in-process.schema.json",
+        "certificate_detail": "/api/schemas/certificate-detail.schema.json",
+        "algorithms": "/api/schemas/algorithms.schema.json"
+      }
+    }
+  ]
+}
diff --git a/api/schemas/metadata.schema.json b/api/schemas/metadata.schema.json
new file mode 100644
index 000000000..967470dde
--- /dev/null
+++ b/api/schemas/metadata.schema.json
@@ -0,0 +1,70 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/metadata.schema.json",
+  "title": "NIST CMVP API Metadata",
+  "type": "object",
+  "additionalProperties": true,
+  "required": [
+    "generated_at",
+    "total_modules",
+    "total_historical_modules",
+    "total_modules_in_process",
+    "total_certificates_with_algorithms",
+    "total_certificate_details",
+    "source",
+    "modules_in_process_source",
+    "algorithm_source",
+    "algorithm_cache_version",
+    "version"
+  ],
+  "properties": {
+    "generated_at": {
+      "type": "string",
+      "format": "date-time"
+    },
+    "total_modules": {
+      "type": "integer",
+      "minimum": 0
+    },
+    "total_historical_modules": {
+      "type": "integer",
+      "minimum": 0
+    },
+    "total_modules_in_process": {
+      "type": "integer",
+      "minimum": 0
+    },
+    "total_certificates_with_algorithms": {
+      "type": "integer",
+      "minimum": 0
+    },
+    "total_certificate_details": {
+      "type": "integer",
+      "minimum": 0
+    },
+    "source": {
+      "type": "string",
+      "format": "uri"
+    },
+    "modules_in_process_source": {
+      "type": "string",
+      "format": "uri"
+    },
+    "algorithm_source": {
+      "type": "string"
+    },
+    "algorithm_cache_version": {
+      "type": "string"
+    },
+    "algorithm_extraction_schema_version": {
+      "type": "string"
+    },
+    "extraction_metrics": {
+      "type": "object",
+      "additionalProperties": true
+    },
+    "version": {
+      "type": "string"
+    }
+  }
+}
diff --git a/api/schemas/module-in-process.schema.json b/api/schemas/module-in-process.schema.json
new file mode 100644
index 000000000..10f0694c4
--- /dev/null
+++ b/api/schemas/module-in-process.schema.json
@@ -0,0 +1,30 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/module-in-process.schema.json",
+  "title": "NIST CMVP Module In Process Row",
+  "type": "object",
+  "additionalProperties": true,
+  "required": [
+    "Module Name",
+    "Vendor Name",
+    "Standard",
+    "Status"
+  ],
+  "properties": {
+    "Module Name": {
+      "type": "string"
+    },
+    "Vendor Name": {
+      "type": "string"
+    },
+    "Vendor Name_url": {
+      "type": "string"
+    },
+    "Standard": {
+      "type": "string"
+    },
+    "Status": {
+      "type": "string"
+    }
+  }
+}
diff --git a/api/schemas/module.schema.json b/api/schemas/module.schema.json
new file mode 100644
index 000000000..81cea4a5c
--- /dev/null
+++ b/api/schemas/module.schema.json
@@ -0,0 +1,155 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/module.schema.json",
+  "title": "NIST CMVP Module Row",
+  "type": "object",
+  "additionalProperties": true,
+  "required": [
+    "Certificate Number",
+    "Vendor Name",
+    "Module Name",
+    "security_policy_url",
+    "certificate_detail_url",
+    "detail_available"
+  ],
+  "properties": {
+    "Certificate Number": {
+      "type": "string",
+      "pattern": "^[0-9]+$"
+    },
+    "Certificate Number_url": {
+      "type": "string"
+    },
+    "Vendor Name": {
+      "type": "string"
+    },
+    "Module Name": {
+      "type": "string"
+    },
+    "Module Type": {
+      "type": "string"
+    },
+    "Validation Date": {
+      "type": "string"
+    },
+    "Status": {
+      "type": "string"
+    },
+    "security_policy_url": {
+      "type": "string",
+      "format": "uri"
+    },
+    "certificate_detail_url": {
+      "type": "string",
+      "format": "uri"
+    },
+    "standard": {
+      "type": [
+        "string",
+        "null"
+      ]
+    },
+    "status": {
+      "type": [
+        "string",
+        "null"
+      ]
+    },
+    "overall_level": {
+      "type": [
+        "integer",
+        "string",
+        "null"
+      ]
+    },
+    "sunset_date": {
+      "type": [
+        "string",
+        "null"
+      ]
+    },
+    "detail_available": {
+      "type": "boolean"
+    },
+    "algorithms": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "algorithms_detailed": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "algorithm_extraction": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": [
+        "schema_version",
+        "status",
+        "configured_source",
+        "source",
+        "cached",
+        "fallback_used",
+        "cache_version",
+        "algorithm_count",
+        "detailed_algorithm_count"
+      ],
+      "properties": {
+        "schema_version": {
+          "type": "string"
+        },
+        "status": {
+          "type": "string",
+          "enum": [
+            "parsed",
+            "cached",
+            "miss",
+            "skipped"
+          ]
+        },
+        "configured_source": {
+          "type": "string"
+        },
+        "source": {
+          "type": "string"
+        },
+        "source_url": {
+          "type": [
+            "string",
+            "null"
+          ],
+          "format": "uri"
+        },
+        "cached": {
+          "type": "boolean"
+        },
+        "fallback_used": {
+          "type": "boolean"
+        },
+        "cache_version": {
+          "type": "string"
+        },
+        "algorithm_count": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "detailed_algorithm_count": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "attempts": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "additionalProperties": {
+              "type": "string"
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/api/schemas/modules-in-process.schema.json b/api/schemas/modules-in-process.schema.json
new file mode 100644
index 000000000..08c87f6af
--- /dev/null
+++ b/api/schemas/modules-in-process.schema.json
@@ -0,0 +1,22 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/modules-in-process.schema.json",
+  "title": "NIST CMVP Modules In Process Response",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "metadata",
+    "modules_in_process"
+  ],
+  "properties": {
+    "metadata": {
+      "$ref": "metadata.schema.json"
+    },
+    "modules_in_process": {
+      "type": "array",
+      "items": {
+        "$ref": "module-in-process.schema.json"
+      }
+    }
+  }
+}
diff --git a/api/schemas/modules.schema.json b/api/schemas/modules.schema.json
new file mode 100644
index 000000000..0595e05c2
--- /dev/null
+++ b/api/schemas/modules.schema.json
@@ -0,0 +1,22 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://hackidle.github.io/nist-cmvp-api/api/schemas/modules.schema.json",
+  "title": "NIST CMVP Active Modules Response",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "metadata",
+    "modules"
+  ],
+  "properties": {
+    "metadata": {
+      "$ref": "metadata.schema.json"
+    },
+    "modules": {
+      "type": "array",
+      "items": {
+        "$ref": "module.schema.json"
+      }
+    }
+  }
+}
diff --git a/scraper.py b/scraper.py
index 9877f78f6..07fe2cdab 100644
--- a/scraper.py
+++ b/scraper.py
@@ -32,6 +32,7 @@
 import sqlite3
 import sys
 import time
+from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List, Optional, Set, Tuple
@@ -75,6 +76,7 @@
 CRAWL4AI_ALGORITHM_SOURCE = "crawl4ai"
 SECURITY_POLICY_ALGORITHM_SOURCE = "security_policy_pdf"
 ALGORITHM_CACHE_VERSION = "2026-04-15-legacy-v1"
+ALGORITHM_EXTRACTION_SCHEMA_VERSION = "1.0"
 CACHEABLE_ALGORITHM_SOURCES = {
     CRAWL4AI_ALGORITHM_SOURCE,
     SECURITY_POLICY_ALGORITHM_SOURCE,
@@ -236,7 +238,7 @@
     ("RSA", re.compile(r"\bRSA\b", re.IGNORECASE)),
     ("ECDSA", re.compile(r"\bECDSA\b", re.IGNORECASE)),
     ("ECDH", re.compile(r"\bECDH\b", re.IGNORECASE)),
-    ("DRBG", re.compile(r"\bDRBG\b", re.IGNORECASE)),
+    ("DRBG", re.compile(r"(?:\b|_)DRBG\b", re.IGNORECASE)),
     ("KDF", re.compile(r"\b(KDF|KDA|KBKDF|HKDF|PBKDF)\b", re.IGNORECASE)),
     ("KAS", re.compile(r"\bKAS\b", re.IGNORECASE)),
     ("KTS", re.compile(r"\bKTS\b", re.IGNORECASE)),
@@ -249,6 +251,38 @@
     ("CVL", re.compile(r"\bCVL\b", re.IGNORECASE)),
 ]
 
+PROCESSING_STAT_KEYS = (  # counter names zeroed by new_processing_stats() and summed by add_processing_stats()
+    "html_reused",
+    "html_refreshed",
+    "html_failed",
+    "pdf_reused",
+    "pdf_refreshed",
+    "pdf_failed",
+    "pdf_cache_hits",
+    "algorithm_misses",
+    "algorithm_cache_hits",
+    "algorithm_successes",
+    "algorithm_fallbacks",
+    "algorithm_source_crawl4ai",
+    "algorithm_source_security_policy_pdf",
+    "algorithm_source_database",
+    "algorithm_source_none",
+)
+
+
+@dataclass
+class AlgorithmExtractionResult:
+    """Result of attempting to extract algorithms for one Security Policy."""
+
+    detailed: List[str]  # detailed algorithm entries as returned by the policy-text parser
+    categories: List[str]  # category names returned by the parser alongside the detailed entries
+    parsed: bool  # set True on the Crawl4AI success path when detailed entries or categories were found
+    source: str  # extraction source that produced the result, e.g. CRAWL4AI_ALGORITHM_SOURCE
+    source_url: Optional[str] = None  # candidate Security Policy URL the result was parsed from
+    fallback_used: bool = False  # presumably True when the local PDF fallback ran -- confirm at call sites
+    pdf_cache_hits: int = 0  # presumably the number of in-run PDF task-cache hits -- confirm at call sites
+    attempts: List[Dict[str, str]] = field(default_factory=list)  # per-candidate {"source", "url", "status"} log
+
 
 def fetch_page(url: str, timeout: int = 30, retries: int = 3) -> Optional[str]:
     """
@@ -321,6 +355,8 @@ def normalize_string_list(values: Optional[List[str]]) -> List[str]:
     normalized: List[str] = []
     seen: Set[str] = set()
     for value in values or []:
+        if value is None:
+            continue
         text = normalize_whitespace(str(value))
         if not text or text in seen:
             continue
@@ -329,6 +365,97 @@ def normalize_string_list(values: Optional[List[str]]) -> List[str]:
     return normalized
 
 
+def new_processing_stats() -> Dict[str, int]:
+    """Return zeroed scrape/extraction counters for one dataset or certificate."""
+    return {key: 0 for key in PROCESSING_STAT_KEYS}
+
+
+def add_processing_stats(target: Dict[str, int], increment: Dict[str, int]) -> None:
+    """Add processing counters from one stats dictionary into another."""
+    for key in PROCESSING_STAT_KEYS:
+        target[key] = target.get(key, 0) + increment.get(key, 0)
+
+
+def combine_processing_stats(*stats_dicts: Dict[str, int]) -> Dict[str, int]:
+    """Combine multiple processing stats dictionaries into one."""
+    combined = new_processing_stats()
+    for stats in stats_dicts:
+        add_processing_stats(combined, stats)
+    return combined
+
+
+def build_extraction_metrics(active_stats: Dict[str, int], historical_stats: Dict[str, int]) -> Dict[str, object]:
+    """Build metadata-safe scrape and algorithm extraction metrics."""
+    return {
+        "active": dict(active_stats),
+        "historical": dict(historical_stats),
+        "combined": combine_processing_stats(active_stats, historical_stats),
+        "concurrency": {
+            "certificate_fetch": CERT_FETCH_CONCURRENCY,
+            "security_policy_fetch": PDF_FETCH_CONCURRENCY,
+        },
+    }
+
+
+def build_algorithm_extraction_provenance(
+    configured_source: str,
+    status: str,
+    source: str,
+    source_url: Optional[str],
+    categories: Optional[List[str]],
+    detailed: Optional[List[str]],
+    cached: bool = False,
+    fallback_used: bool = False,
+    attempts: Optional[List[Dict[str, str]]] = None,
+) -> Dict[str, object]:
+    """Build the per-certificate provenance object for algorithm extraction."""
+    provenance = {
+        "schema_version": ALGORITHM_EXTRACTION_SCHEMA_VERSION,
+        "status": status,
+        "configured_source": configured_source,
+        "source": source,
+        "source_url": source_url,
+        "cached": cached,
+        "fallback_used": fallback_used,
+        "cache_version": ALGORITHM_CACHE_VERSION,
+        "algorithm_count": len(normalize_string_list(categories or [])),
+        "detailed_algorithm_count": len(normalize_string_list(detailed or [])),
+    }
+    if attempts is not None:
+        provenance["attempts"] = attempts
+    return provenance
+
+
+def apply_algorithm_extraction_provenance(
+    record: Optional[Dict],
+    provenance: Dict[str, object],
+    include_attempts: bool = False,
+) -> None:
+    """Attach algorithm extraction provenance to a module or detail payload."""
+    if record is None:
+        return
+    payload = dict(provenance)
+    if not include_attempts:
+        payload.pop("attempts", None)
+    record["algorithm_extraction"] = payload
+
+
+def cached_algorithm_extraction_source(
+    previous_module: Optional[Dict],
+    previous_detail: Optional[Dict],
+    previous_metadata: Dict,
+) -> Tuple[str, Optional[str]]:
+    """Return the best available extraction source metadata for cached algorithms."""
+    previous_extraction = (
+        (previous_detail or {}).get("algorithm_extraction")
+        or (previous_module or {}).get("algorithm_extraction")
+        or {}
+    )
+    source = previous_extraction.get("source") or previous_metadata.get("algorithm_source") or "cache"
+    source_url = previous_extraction.get("source_url")
+    return str(source), source_url if isinstance(source_url, str) else None
+
+
 def parse_certificate_number(record: Optional[Dict]) -> Optional[int]:
     """Extract an integer certificate number from a module row or detail payload."""
     if not record:
@@ -1199,6 +1326,24 @@ async def fetch_with_retry(
     return None
 
 
+async def fetch_policy_pdf_bytes(
+    client: httpx.AsyncClient,
+    url: str,
+    pdf_cache: Dict[str, asyncio.Task],
+    pdf_cache_lock: asyncio.Lock,
+) -> Tuple[Optional[bytes], bool]:
+    """Return ``(pdf_bytes_or_None, cache_hit)`` for ``url``, sharing one fetch task per URL via ``pdf_cache``."""
+    async with pdf_cache_lock:  # lookup and insert happen under the lock so each URL gets exactly one task
+        task = pdf_cache.get(url)
+        cache_hit = task is not None  # True when an earlier caller already registered a task for this URL
+        if task is None:
+            task = asyncio.create_task(fetch_with_retry(client, url, response_type="bytes"))
+            pdf_cache[url] = task
+
+    result = await task  # awaited after the lock is released, so a slow download does not hold the lock
+    return result if isinstance(result, bytes) else None, cache_hit  # any non-bytes result is reported as None
+
+
 async def fetch_crawl4ai_policy_text(
     url: str,
     retries: int = 1,
@@ -1280,38 +1425,110 @@ async def fetch_certificate_algorithms(
     fallback_url: Optional[str],
     pdf_semaphore: asyncio.Semaphore,
     algorithm_source: str,
-) -> Tuple[List[str], List[str], bool]:
+    pdf_cache: Dict[str, asyncio.Task],
+    pdf_cache_lock: asyncio.Lock,
+) -> AlgorithmExtractionResult:
     """Fetch and parse a certificate's Security Policy using the configured source."""
+    attempts: List[Dict[str, str]] = []
+    pdf_cache_hits = 0
+
     for candidate in normalize_string_list([security_policy_url, fallback_url]):
         if algorithm_source == CRAWL4AI_ALGORITHM_SOURCE and CRAWL4AI_AVAILABLE:
+            attempt = {
+                "source": CRAWL4AI_ALGORITHM_SOURCE,
+                "url": candidate,
+                "status": "started",
+            }
             async with pdf_semaphore:
                 policy_text = await fetch_crawl4ai_policy_text(candidate)
             if policy_text:
                 try:
                     detailed, categories = parse_algorithms_from_policy_text(policy_text)
                     if detailed or categories:
-                        return detailed, categories, True
+                        attempt["status"] = "parsed"
+                        attempts.append(attempt)
+                        return AlgorithmExtractionResult(
+                            detailed=detailed,
+                            categories=categories,
+                            parsed=True,
+                            source=CRAWL4AI_ALGORITHM_SOURCE,
+                            source_url=candidate,
+                            attempts=attempts,
+                        )
+                    attempt["status"] = "no_algorithms"
+                    attempts.append(attempt)
                     print(
                         f"Warning: Crawl4AI returned policy text for {candidate} but no algorithm rows were found; "
                         "falling back to local PDF parsing.",
                         file=sys.stderr,
                     )
                 except Exception as exc:
+                    attempt["status"] = "parse_error"
+                    attempt["error"] = str(exc)[:200]
+                    attempts.append(attempt)
                     print(f"Warning: Failed to parse Crawl4AI policy text for {candidate}: {exc}", file=sys.stderr)
+            else:
+                attempt["status"] = "no_text"
+                attempts.append(attempt)
 
+        local_attempt = {
+            "source": SECURITY_POLICY_ALGORITHM_SOURCE,
+            "url": candidate,
+            "status": "started",
+        }
         async with pdf_semaphore:
-            pdf_bytes = await fetch_with_retry(client, candidate, response_type="bytes")
+            pdf_bytes, cache_hit = await fetch_policy_pdf_bytes(
+                client,
+                candidate,
+                pdf_cache,
+                pdf_cache_lock,
+            )
+        if cache_hit:
+            pdf_cache_hits += 1
+            local_attempt["cache_hit"] = "true"
         if not pdf_bytes:
+            local_attempt["status"] = "fetch_failed"
+            attempts.append(local_attempt)
             continue
 
         try:
             detailed, categories = parse_algorithms_from_policy_pdf_bytes(pdf_bytes)
             if detailed or categories:
-                return detailed, categories, True
+                local_attempt["status"] = "parsed"
+                attempts.append(local_attempt)
+                return AlgorithmExtractionResult(
+                    detailed=detailed,
+                    categories=categories,
+                    parsed=True,
+                    source=SECURITY_POLICY_ALGORITHM_SOURCE,
+                    source_url=candidate,
+                    fallback_used=any(
+                        attempt.get("source") == CRAWL4AI_ALGORITHM_SOURCE
+                        for attempt in attempts
+                    ),
+                    pdf_cache_hits=pdf_cache_hits,
+                    attempts=attempts,
+                )
+            local_attempt["status"] = "no_algorithms"
+            attempts.append(local_attempt)
         except Exception as exc:
+            local_attempt["status"] = "parse_error"
+            local_attempt["error"] = str(exc)[:200]
+            attempts.append(local_attempt)
             print(f"Warning: Failed to parse Security Policy PDF {candidate}: {exc}", file=sys.stderr)
 
-    return [], [], False
+    return AlgorithmExtractionResult(
+        detailed=[],
+        categories=[],
+        parsed=False,
+        source="none",
+        fallback_used=any(
+            attempt.get("source") == CRAWL4AI_ALGORITHM_SOURCE
+            for attempt in attempts
+        ),
+        pdf_cache_hits=pdf_cache_hits,
+        attempts=attempts,
+    )
 
 
 async def process_certificate_record(
@@ -1325,18 +1542,12 @@ async def process_certificate_record(
     client: httpx.AsyncClient,
     cert_semaphore: asyncio.Semaphore,
     pdf_semaphore: asyncio.Semaphore,
+    pdf_cache: Dict[str, asyncio.Task],
+    pdf_cache_lock: asyncio.Lock,
     database_algorithms_map: Dict[int, List[str]],
 ) -> Tuple[Dict, Optional[Dict], List[str], Dict[str, int]]:
     """Process one module row into an enriched module row and optional detail payload."""
-    stats = {
-        "html_reused": 0,
-        "html_refreshed": 0,
-        "html_failed": 0,
-        "pdf_reused": 0,
-        "pdf_refreshed": 0,
-        "pdf_failed": 0,
-        "algorithm_misses": 0,
-    }
+    stats = new_processing_stats()
 
     cert_number = parse_certificate_number(module)
     module_out = dict(previous_module or {})
@@ -1344,6 +1555,17 @@ async def process_certificate_record(
 
     if cert_number is None:
         strip_algorithm_fields(module_out)
+        apply_algorithm_extraction_provenance(
+            module_out,
+            build_algorithm_extraction_provenance(
+                algorithm_source,
+                "skipped",
+                "none",
+                None,
+                [],
+                [],
+            ),
+        )
         module_out["detail_available"] = False
         return module_out, None, [], stats
 
@@ -1407,38 +1629,106 @@ async def process_certificate_record(
     if algorithm_source == "database":
         categories = normalize_string_list(database_algorithms_map.get(cert_number, []))
         detailed: List[str] = []
+        extraction_status = "parsed" if categories else "miss"
+        extraction_provenance = build_algorithm_extraction_provenance(
+            algorithm_source,
+            extraction_status,
+            "database",
+            None,
+            categories,
+            detailed,
+        )
+        stats["algorithm_source_database"] += 1
+        if categories:
+            stats["algorithm_successes"] += 1
+        else:
+            stats["algorithm_misses"] += 1
         if detail_payload:
             apply_algorithm_fields(detail_payload, categories, detailed)
+            apply_algorithm_extraction_provenance(detail_payload, extraction_provenance, include_attempts=True)
         apply_algorithm_fields(module_out, categories, detailed)
+        apply_algorithm_extraction_provenance(module_out, extraction_provenance)
     elif algorithm_source in CACHEABLE_ALGORITHM_SOURCES:
         detailed, categories = ([], [])
         if trusted_algorithm_reuse:
             categories, detailed = cached_algorithm_fields(previous_module, previous_detail)
             stats["pdf_reused"] += 1
+            stats["algorithm_cache_hits"] += 1
+            cached_source, cached_source_url = cached_algorithm_extraction_source(
+                previous_module,
+                previous_detail,
+                previous_metadata,
+            )
+            extraction_provenance = build_algorithm_extraction_provenance(
+                algorithm_source,
+                "cached",
+                cached_source,
+                cached_source_url,
+                categories,
+                detailed,
+                cached=True,
+            )
+            if categories or detailed:
+                stats["algorithm_successes"] += 1
         else:
             if detail_payload:
                 strip_algorithm_fields(detail_payload)
             strip_algorithm_fields(module_out)
-            detailed, categories, parsed = await fetch_certificate_algorithms(
+            extraction_result = await fetch_certificate_algorithms(
                 client,
                 (detail_payload or {}).get("security_policy_url") or module.get("security_policy_url"),
                 get_security_policy_url(cert_number),
                 pdf_semaphore,
                 algorithm_source,
+                pdf_cache,
+                pdf_cache_lock,
             )
-            if parsed:
+            detailed = extraction_result.detailed
+            categories = extraction_result.categories
+            stats["pdf_cache_hits"] += extraction_result.pdf_cache_hits
+            extraction_provenance = build_algorithm_extraction_provenance(
+                algorithm_source,
+                "parsed" if extraction_result.parsed else "miss",
+                extraction_result.source,
+                extraction_result.source_url,
+                categories,
+                detailed,
+                fallback_used=extraction_result.fallback_used,
+                attempts=extraction_result.attempts,
+            )
+            if extraction_result.parsed:
                 stats["pdf_refreshed"] += 1
+                stats["algorithm_successes"] += 1
+                if extraction_result.source == CRAWL4AI_ALGORITHM_SOURCE:
+                    stats["algorithm_source_crawl4ai"] += 1
+                elif extraction_result.source == SECURITY_POLICY_ALGORITHM_SOURCE:
+                    stats["algorithm_source_security_policy_pdf"] += 1
+                if extraction_result.fallback_used:
+                    stats["algorithm_fallbacks"] += 1
             else:
                 stats["pdf_failed"] += 1
                 stats["algorithm_misses"] += 1
 
         if detail_payload:
             apply_algorithm_fields(detail_payload, categories, detailed)
+            apply_algorithm_extraction_provenance(detail_payload, extraction_provenance, include_attempts=True)
         apply_algorithm_fields(module_out, categories, detailed)
+        apply_algorithm_extraction_provenance(module_out, extraction_provenance)
     else:
+        extraction_provenance = build_algorithm_extraction_provenance(
+            algorithm_source,
+            "skipped",
+            "none",
+            None,
+            [],
+            [],
+        )
+        stats["algorithm_source_none"] += 1
         if detail_payload:
             strip_algorithm_fields(detail_payload)
+            apply_algorithm_extraction_provenance(detail_payload, extraction_provenance, include_attempts=True)
         strip_algorithm_fields(module_out)
+        apply_algorithm_extraction_provenance(module_out, extraction_provenance)
 
     module_out["detail_available"] = detail_payload is not None
     module_categories = normalize_string_list(module_out.get("algorithms", []))
@@ -1462,19 +1752,13 @@ async def build_certificate_artifacts(
     results: List[Optional[Dict]] = [None] * len(modules)
     payloads: Dict[int, Dict] = {}
     algorithms_map: Dict[int, List[str]] = {}
-    stats = {
-        "html_reused": 0,
-        "html_refreshed": 0,
-        "html_failed": 0,
-        "pdf_reused": 0,
-        "pdf_refreshed": 0,
-        "pdf_failed": 0,
-        "algorithm_misses": 0,
-    }
+    stats = new_processing_stats()
 
     timeout = httpx.Timeout(30.0)
     cert_semaphore = asyncio.Semaphore(CERT_FETCH_CONCURRENCY)
     pdf_semaphore = asyncio.Semaphore(PDF_FETCH_CONCURRENCY)
+    pdf_cache: Dict[str, asyncio.Task] = {}
+    pdf_cache_lock = asyncio.Lock()
 
     async with httpx.AsyncClient(
         headers={"User-Agent": USER_AGENT},
@@ -1497,6 +1781,8 @@ async def build_certificate_artifacts(
                         client,
                         cert_semaphore,
                         pdf_semaphore,
+                        pdf_cache,
+                        pdf_cache_lock,
                         database_algorithms_map,
                     )
                 )
@@ -1513,8 +1799,7 @@ async def build_certificate_artifacts(
                 payloads[cert_number] = detail_payload
             if cert_number is not None and categories:
                 algorithms_map[cert_number] = categories
-            for key, value in task_stats.items():
-                stats[key] += value
+            add_processing_stats(stats, task_stats)
             if completed % 100 == 0 or completed == total:
                 print(
                     f"  Progress: {completed}/{total} "
@@ -1859,9 +2144,27 @@ def documentation_paths() -> Dict[str, str]:
         "llms_full_txt": "/llms-full.txt",
         "api_docs": "/api/docs.md",
         "openapi": "/openapi.json",
+        "json_schemas": "/api/schemas/index.schema.json",
     }
 
 
+def schema_paths(algorithms_summary: Optional[Dict] = None) -> Dict[str, str]:
+    """Return a mapping of schema key to published path; the ``algorithms`` entry is added only when ``algorithms_summary`` is truthy."""
+    paths = {
+        "index": "/api/schemas/index.schema.json",
+        "metadata": "/api/schemas/metadata.schema.json",
+        "module": "/api/schemas/module.schema.json",
+        "module_in_process": "/api/schemas/module-in-process.schema.json",
+        "modules": "/api/schemas/modules.schema.json",
+        "historical_modules": "/api/schemas/historical-modules.schema.json",
+        "modules_in_process": "/api/schemas/modules-in-process.schema.json",
+        "certificate_detail": "/api/schemas/certificate-detail.schema.json",
+    }
+    if algorithms_summary:  # None and an empty summary both leave the "algorithms" key out
+        paths["algorithms"] = "/api/schemas/algorithms.schema.json"
+    return paths
+
+
 def sample_module_example(module: Optional[Dict]) -> Dict:
     """Build a compact module example for generated docs."""
     if not module:
@@ -1881,6 +2184,7 @@ def sample_module_example(module: Optional[Dict]) -> Dict:
         "security_policy_url",
         "certificate_detail_url",
         "detail_available",
+        "algorithm_extraction",
     ]
     example = {}
     for key in keys:
@@ -1889,6 +2193,9 @@ def sample_module_example(module: Optional[Dict]) -> Dict:
         value = module[key]
         if key in {"Module Name"}:
             value = truncate_text(value, 100)
+        if key == "algorithm_extraction" and isinstance(value, dict):
+            value = dict(value)
+            value.pop("attempts", None)
         example[key] = value
     if "description" in module:
         example["description"] = truncate_text(module["description"])
@@ -1925,6 +2232,10 @@ def sample_certificate_example(detail: Optional[Dict]) -> Dict:
         "validation_history": (detail.get("validation_history") or [])[:2],
         "algorithms": (detail.get("algorithms") or [])[:5],
     }
+    if isinstance(detail.get("algorithm_extraction"), dict):
+        algorithm_extraction = dict(detail["algorithm_extraction"])
+        algorithm_extraction.pop("attempts", None)
+        example["algorithm_extraction"] = algorithm_extraction
     return {key: value for key, value in example.items() if value not in (None, [], {})}
 
 
@@ -1982,7 +2293,10 @@ def build_api_reference_body(
         "`GET api/index.json` — API discovery endpoint with resource paths, documentation links, feature flags, and current counts.",
         "",
         "### Metadata",
-        "`GET api/metadata.json` — Generation timestamp, source URLs, dataset counts, and algorithm extraction status.",
+        "`GET api/metadata.json` — Generation timestamp, source URLs, dataset counts, extraction metrics, and algorithm extraction status.",
+        "",
+        "### JSON Schemas",
+        "`GET api/schemas/index.schema.json` — JSON Schema discovery document for the static API response files.",
         "",
         "### Active Modules",
         f"`GET api/modules.json` — All {format_count(total_modules)} active validated modules.",
@@ -1999,7 +2313,7 @@ def build_api_reference_body(
             }
         ),
         "",
-        "Each active module includes certificate identifiers, vendor/module names, validation metadata, direct Security Policy links, NIST detail URLs, and detail availability flags.",
+        "Each active module includes certificate identifiers, vendor/module names, validation metadata, direct Security Policy links, NIST detail URLs, detail availability flags, and algorithm extraction provenance when algorithms were evaluated.",
         "",
         "### Historical Modules",
         f"`GET api/historical-modules.json` — All {format_count(total_historical)} expired or revoked modules for historical lookups.",
@@ -2015,6 +2329,8 @@ def build_api_reference_body(
                 "### Algorithms",
                 f"`GET api/algorithms.json` — Algorithm usage summary across {format_count(total_algorithms)} certificates in the current build.",
                 "",
+                "`algorithm_extraction` records the configured source, actual source, cache/fallback status, source URL, and extracted row counts for each evaluated certificate.",
+                "",
                 "Example response (truncated):",
                 "",
                 render_json_block(sample_algorithms_example(algorithms_summary)),
@@ -2050,7 +2366,7 @@ def build_api_reference_body(
             "### Discover the API surface",
             "```",
             "GET api/index.json → endpoints, docs links, feature flags, counts",
-            "GET api/metadata.json → freshness and scrape provenance",
+            "GET api/metadata.json → freshness, scrape provenance, and extraction metrics",
             "```",
             "",
             "### Find a module and pull the full certificate record",
@@ -2073,7 +2389,7 @@ def build_api_reference_body(
                 "### Explore algorithm coverage",
                 "```",
                 "GET api/algorithms.json → counts and certificate lists per algorithm",
-                "GET api/modules.json → filter module rows by algorithms[] entries",
+                "GET api/modules.json → filter module rows by algorithms[] entries and inspect algorithm_extraction",
                 "```",
                 "",
             ]
@@ -2092,7 +2408,7 @@ def build_api_reference_body(
 
     if algorithms_summary:
         lines.append(
-            f"- **Algorithms coverage:** `api/algorithms.json` summarizes {format_count(total_algorithms)} certificates that had algorithm data in this build."
+            f"- **Algorithms coverage:** `api/algorithms.json` summarizes {format_count(total_algorithms)} certificates that had algorithm data in this build. `api/metadata.json` reports extraction cache hits, refreshes, failures, misses, and fallback counts."
         )
     else:
         lines.append(
@@ -2109,7 +2425,7 @@ def build_llms_txt(metadata: Dict, algorithms_summary: Optional[Dict]) -> str:
         f"- `api/modules.json` — {format_count(metadata.get('total_modules', 0))} active validated modules.",
         f"- `api/historical-modules.json` — {format_count(metadata.get('total_historical_modules', 0))} historical modules.",
         f"- `api/modules-in-process.json` — {format_count(metadata.get('total_modules_in_process', 0))} modules currently in process.",
-        "- `api/metadata.json` — generation timestamp, counts, and source URLs.",
+        "- `api/metadata.json` — generation timestamp, counts, source URLs, and extraction metrics.",
         f"- `api/certificates/{{certificate}}.json` — full detail record for a single CMVP certificate.",
     ]
     if algorithms_summary:
@@ -2139,6 +2455,7 @@ def build_llms_txt(metadata: Dict, algorithms_summary: Optional[Dict]) -> str:
         "- [API Reference](api/docs.md): endpoint reference with examples and workflows.",
         "- [Complete Documentation](llms-full.txt): fuller single-file agent reference.",
         "- [OpenAPI](openapi.json): OpenAPI 3.0.3 schema for the JSON endpoints.",
+        "- [JSON Schemas](api/schemas/index.schema.json): JSON Schema index for static API responses.",
         "",
         "## Caveats",
         "",
@@ -2242,6 +2559,7 @@ def build_index_html(metadata: Dict, algorithms_summary: Optional[Dict]) -> str:
         '    <li><a href="llms-full.txt">llms-full.txt</a></li>',
         '    <li><a href="api/docs.md">api/docs.md</a></li>',
         '    <li><a href="openapi.json">openapi.json</a></li>',
+        '    <li><a href="api/schemas/index.schema.json">JSON Schemas</a></li>',
     ]
 
     endpoint_links = [
@@ -2333,6 +2651,310 @@ def generate_text_artifacts(
     }
 
 
+def json_schema_document(title: str, schema_id: str, schema: Dict) -> Dict:
+    """Return a new dict with ``$schema``, ``$id`` and ``title`` set, then updated with ``schema`` (its keys win on collision)."""
+    document = {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "$id": f"{PUBLIC_BASE_URL}{schema_id}",  # plain concatenation; no separator is inserted between the two parts
+        "title": title,
+    }
+    document.update(schema)  # the caller's ``schema`` dict itself is not mutated
+    return document
+
+
+def algorithm_extraction_schema() -> Dict:
+    """Return the JSON Schema dict for an ``algorithm_extraction`` object (extra keys allowed; ``source_url`` and ``attempts`` optional)."""
+    return {
+        "type": "object",
+        "additionalProperties": True,  # keys not listed under "properties" are accepted
+        "required": [
+            "schema_version",
+            "status",
+            "configured_source",
+            "source",
+            "cached",
+            "fallback_used",
+            "cache_version",
+            "algorithm_count",
+            "detailed_algorithm_count",
+        ],
+        "properties": {
+            "schema_version": {"type": "string"},
+            "status": {"type": "string", "enum": ["parsed", "cached", "miss", "skipped"]},
+            "configured_source": {"type": "string"},
+            "source": {"type": "string"},
+            "source_url": {"type": ["string", "null"], "format": "uri"},  # may be null; not listed in "required"
+            "cached": {"type": "boolean"},
+            "fallback_used": {"type": "boolean"},
+            "cache_version": {"type": "string"},
+            "algorithm_count": {"type": "integer", "minimum": 0},
+            "detailed_algorithm_count": {"type": "integer", "minimum": 0},
+            "attempts": {
+                "type": "array",
+                "items": {"type": "object", "additionalProperties": {"type": "string"}},  # every value in an attempt must be a string
+            },
+        },
+    }
+
+
+def module_schema() -> Dict:
+    """Return a backwards-compatible JSON Schema dict for active and historical module rows (extra keys allowed; algorithm fields optional)."""
+    return {
+        "type": "object",
+        "additionalProperties": True,  # keys not listed under "properties" are accepted
+        "required": [
+            "Certificate Number",
+            "Vendor Name",
+            "Module Name",
+            "security_policy_url",
+            "certificate_detail_url",
+            "detail_available",
+        ],
+        "properties": {
+            "Certificate Number": {"type": "string", "pattern": "^[0-9]+$"},  # a string of digits, not a JSON integer
+            "Certificate Number_url": {"type": "string"},
+            "Vendor Name": {"type": "string"},
+            "Module Name": {"type": "string"},
+            "Module Type": {"type": "string"},
+            "Validation Date": {"type": "string"},
+            "Status": {"type": "string"},
+            "security_policy_url": {"type": "string", "format": "uri"},
+            "certificate_detail_url": {"type": "string", "format": "uri"},
+            "standard": {"type": ["string", "null"]},
+            "status": {"type": ["string", "null"]},  # lowercase key; distinct from the capitalised "Status" above
+            "overall_level": {"type": ["integer", "string", "null"]},  # integer, string, or null are all accepted
+            "sunset_date": {"type": ["string", "null"]},
+            "detail_available": {"type": "boolean"},
+            "algorithms": {"type": "array", "items": {"type": "string"}},
+            "algorithms_detailed": {"type": "array", "items": {"type": "string"}},
+            "algorithm_extraction": algorithm_extraction_schema(),  # provenance schema is inlined, not referenced via $ref
+        },
+    }
+
+
+def module_in_process_schema() -> Dict:
+    """Return the JSON Schema dict for a modules-in-process row (four required string fields; extra keys allowed)."""
+    return {
+        "type": "object",
+        "additionalProperties": True,  # keys not listed under "properties" are accepted
+        "required": ["Module Name", "Vendor Name", "Standard", "Status"],
+        "properties": {
+            "Module Name": {"type": "string"},
+            "Vendor Name": {"type": "string"},
+            "Vendor Name_url": {"type": "string"},  # optional: not listed in "required"
+            "Standard": {"type": "string"},
+            "Status": {"type": "string"},
+        },
+    }
+
+
+def metadata_schema() -> Dict:
+    """Return the JSON Schema dict for dataset metadata; ``algorithm_extraction_schema_version`` and ``extraction_metrics`` are optional."""
+    return {
+        "type": "object",
+        "additionalProperties": True,  # keys not listed under "properties" are accepted
+        "required": [
+            "generated_at",
+            "total_modules",
+            "total_historical_modules",
+            "total_modules_in_process",
+            "total_certificates_with_algorithms",
+            "total_certificate_details",
+            "source",
+            "modules_in_process_source",
+            "algorithm_source",
+            "algorithm_cache_version",
+            "version",
+        ],
+        "properties": {
+            "generated_at": {"type": "string", "format": "date-time"},
+            "total_modules": {"type": "integer", "minimum": 0},
+            "total_historical_modules": {"type": "integer", "minimum": 0},
+            "total_modules_in_process": {"type": "integer", "minimum": 0},
+            "total_certificates_with_algorithms": {"type": "integer", "minimum": 0},
+            "total_certificate_details": {"type": "integer", "minimum": 0},
+            "source": {"type": "string", "format": "uri"},
+            "modules_in_process_source": {"type": "string", "format": "uri"},
+            "algorithm_source": {"type": "string"},
+            "algorithm_cache_version": {"type": "string"},
+            "algorithm_extraction_schema_version": {"type": "string"},  # typed here but absent from "required"
+            "extraction_metrics": {"type": "object", "additionalProperties": True},  # typed here but absent from "required"; inner keys are unconstrained
+            "version": {"type": "string"},
+        },
+    }
+
+
+def response_schema(metadata_ref: str, array_name: str, item_ref: str) -> Dict:
+    """Return a closed object schema requiring ``metadata`` (a ``$ref`` to ``metadata_ref``) and an ``array_name`` array of ``item_ref`` items."""
+    return {
+        "type": "object",
+        "additionalProperties": False,  # no top-level keys other than the two below are allowed
+        "required": ["metadata", array_name],
+        "properties": {
+            "metadata": {"$ref": metadata_ref},  # NOTE(review): the ref string is used verbatim; confirm it resolves as intended against the enclosing document's $id
+            array_name: {"type": "array", "items": {"$ref": item_ref}},
+        },
+    }
+
+
+def certificate_detail_schema() -> Dict:
+    """Return the JSON Schema dict for a per-certificate response: a closed ``metadata``/``certificate`` envelope around an open certificate object."""
+    certificate_schema = {
+        "type": "object",
+        "additionalProperties": True,  # certificate objects may carry keys not listed under "properties"
+        "required": [
+            "certificate_number",
+            "dataset",
+            "generated_at",
+            "nist_page_url",
+            "certificate_detail_url",
+            "security_policy_url",
+            "vendor_name",
+            "module_name",
+            "standard",
+            "status",
+            "related_files",
+            "validation_history",
+            "vendor",
+        ],
+        "properties": {
+            "certificate_number": {"type": "string", "pattern": "^[0-9]+$"},  # a string of digits, not a JSON integer
+            "dataset": {"type": "string", "enum": ["active", "historical"]},
+            "generated_at": {"type": "string", "format": "date-time"},
+            "nist_page_url": {"type": "string", "format": "uri"},
+            "certificate_detail_url": {"type": "string", "format": "uri"},
+            "security_policy_url": {"type": ["string", "null"], "format": "uri"},  # required to be present, but may be null
+            "vendor_name": {"type": ["string", "null"]},
+            "module_name": {"type": ["string", "null"]},
+            "standard": {"type": ["string", "null"]},
+            "status": {"type": ["string", "null"]},
+            "related_files": {"type": "array", "items": {"type": "object", "additionalProperties": True}},
+            "validation_history": {"type": "array", "items": {"type": "object", "additionalProperties": True}},
+            "vendor": {"type": "object", "additionalProperties": True},
+            "algorithms": {"type": "array", "items": {"type": "string"}},  # optional: not listed in "required"
+            "algorithms_detailed": {"type": "array", "items": {"type": "string"}},  # optional: not listed in "required"
+            "algorithm_extraction": algorithm_extraction_schema(),  # optional; provenance schema is inlined, not referenced via $ref
+        },
+    }
+    return {
+        "type": "object",
+        "additionalProperties": False,  # the envelope allows only "metadata" and "certificate"
+        "required": ["metadata", "certificate"],
+        "properties": {
+            "metadata": {
+                "type": "object",
+                "additionalProperties": True,
+                "required": ["generated_at", "dataset", "source"],  # only key presence is checked; no value types are declared here
+            },
+            "certificate": certificate_schema,
+        },
+    }
+
+
+def algorithms_schema() -> Dict:
+    """Return the JSON Schema dict for the algorithms summary: two totals, ``metadata``, and an ``algorithms`` object of closed ``count``/``certificates`` entries."""
+    return {
+        "type": "object",
+        "additionalProperties": False,  # only the four keys listed in "required" are allowed at the top level
+        "required": ["total_unique_algorithms", "total_certificate_algorithm_pairs", "algorithms", "metadata"],
+        "properties": {
+            "total_unique_algorithms": {"type": "integer", "minimum": 0},
+            "total_certificate_algorithm_pairs": {"type": "integer", "minimum": 0},
+            "algorithms": {
+                "type": "object",
+                "additionalProperties": {  # applies to every key of "algorithms"; key names themselves are unconstrained
+                    "type": "object",
+                    "additionalProperties": False,
+                    "required": ["count", "certificates"],
+                    "properties": {
+                        "count": {"type": "integer", "minimum": 0},
+                        "certificates": {"type": "array", "items": {"type": "integer"}},  # integers here, unlike the digit-string certificate numbers in other schemas
+                    },
+                },
+            },
+            "metadata": {"type": "object", "additionalProperties": True},
+        },
+    }
+
+
+def build_schema_index_payload(algorithms_summary: Optional[Dict]) -> Dict:
+    """Return the schema discovery payload: fixed ``name`` and ``schema_version``, ``PUBLIC_BASE_URL``, and the ``schema_paths`` mapping."""
+    return {
+        "name": "NIST CMVP API JSON Schemas",
+        "schema_version": "1.0",
+        "base_url": PUBLIC_BASE_URL,
+        "schemas": schema_paths(algorithms_summary),  # includes the "algorithms" entry only when the summary is truthy
+    }
+
+
+def generate_json_schema_artifacts(algorithms_summary: Optional[Dict]) -> Dict[str, Dict]:
+    """Return a dict mapping relative file path to schema document; the algorithms schema is included only when ``algorithms_summary`` is truthy."""
+    metadata_path = "/api/schemas/metadata.schema.json"  # same literal as schema_paths()["metadata"]
+    module_path = "/api/schemas/module.schema.json"  # same literal as schema_paths()["module"]
+    module_in_process_path = "/api/schemas/module-in-process.schema.json"  # same literal as schema_paths()["module_in_process"]
+    paths = schema_paths(algorithms_summary)
+    artifacts = {
+        "api/schemas/index.schema.json": json_schema_document(
+            "NIST CMVP API JSON Schema Index",
+            paths["index"],
+            {
+                "type": "object",
+                "additionalProperties": False,
+                "required": ["name", "schema_version", "base_url", "schemas"],
+                "properties": {
+                    "name": {"type": "string"},
+                    "schema_version": {"type": "string"},
+                    "base_url": {"type": "string", "format": "uri"},
+                    "schemas": {"type": "object", "additionalProperties": {"type": "string"}},
+                },
+                "examples": [build_schema_index_payload(algorithms_summary)],  # the discovery payload is embedded as an example of this schema
+            },
+        ),
+        "api/schemas/metadata.schema.json": json_schema_document(
+            "NIST CMVP API Metadata",
+            paths["metadata"],
+            metadata_schema(),
+        ),
+        "api/schemas/module.schema.json": json_schema_document(
+            "NIST CMVP Module Row",
+            module_path,
+            module_schema(),
+        ),
+        "api/schemas/module-in-process.schema.json": json_schema_document(
+            "NIST CMVP Module In Process Row",
+            module_in_process_path,
+            module_in_process_schema(),
+        ),
+        "api/schemas/modules.schema.json": json_schema_document(
+            "NIST CMVP Active Modules Response",
+            paths["modules"],
+            response_schema(metadata_path, "modules", module_path),
+        ),
+        "api/schemas/historical-modules.schema.json": json_schema_document(
+            "NIST CMVP Historical Modules Response",
+            paths["historical_modules"],
+            response_schema(metadata_path, "modules", module_path),  # same array key and item schema as the active modules response
+        ),
+        "api/schemas/modules-in-process.schema.json": json_schema_document(
+            "NIST CMVP Modules In Process Response",
+            paths["modules_in_process"],
+            response_schema(metadata_path, "modules_in_process", module_in_process_path),
+        ),
+        "api/schemas/certificate-detail.schema.json": json_schema_document(
+            "NIST CMVP Certificate Detail Response",
+            paths["certificate_detail"],
+            certificate_detail_schema(),
+        ),
+    }
+    if algorithms_summary:  # mirrors the condition under which schema_paths() adds the "algorithms" key
+        artifacts["api/schemas/algorithms.schema.json"] = json_schema_document(
+            "NIST CMVP Algorithms Summary Response",
+            paths["algorithms"],
+            algorithms_schema(),
+        )
+    return artifacts
+
+
 def build_index_payload(metadata: Dict, algorithms_summary: Optional[Dict]) -> Dict:
     """Build the API index payload published at api/index.json."""
     endpoints = {
@@ -2352,6 +2974,7 @@ def build_index_payload(metadata: Dict, algorithms_summary: Optional[Dict]) -> D
         "base_url": PUBLIC_BASE_URL,
         "endpoints": endpoints,
         "documentation": documentation_paths(),
+        "schemas": schema_paths(algorithms_summary),
         "last_updated": metadata.get("generated_at"),
         "total_modules": metadata.get("total_modules", 0),
         "total_historical_modules": metadata.get("total_historical_modules", 0),
@@ -2362,11 +2985,14 @@ def build_index_payload(metadata: Dict, algorithms_summary: Optional[Dict]) -> D
             "security_policy_urls": True,
             "certificate_detail_urls": True,
             "algorithm_extraction": bool(algorithms_summary),
+            "algorithm_extraction_provenance": True,
+            "extraction_metrics": True,
             "certificate_detail_records": True,
             "llms_txt": True,
             "llms_full_txt": True,
             "markdown_api_docs": True,
             "openapi_spec": True,
+            "json_schemas": True,
         },
     }
 
@@ -2609,8 +3235,11 @@ def generate_openapi_spec(
                         "total_certificates_with_algorithms": {"type": "integer", "example": metadata.get("total_certificates_with_algorithms", 0)},
                         "total_certificate_details": {"type": "integer", "example": metadata.get("total_certificate_details", 0)},
                         "source": {"type": "string", "example": metadata.get("source", "")},
+                        "modules_in_process_source": {"type": "string", "example": metadata.get("modules_in_process_source", "")},
                         "algorithm_source": {"type": "string", "example": metadata.get("algorithm_source", "")},
                         "algorithm_cache_version": {"type": "string", "example": metadata.get("algorithm_cache_version", "")},
+                        "algorithm_extraction_schema_version": {"type": "string", "example": metadata.get("algorithm_extraction_schema_version", "")},
+                        "extraction_metrics": {"type": "object", "additionalProperties": True},
                         "version": {"type": "string", "example": metadata.get("version", "")}
                     }
                 },
@@ -2656,7 +3285,8 @@ def generate_openapi_spec(
                                     }
                                 }
                             }
-                        }
+                        },
+                        "metadata": {"type": "object", "additionalProperties": True}
                     }
                 },
                 "CertificateDetail": {
@@ -2843,6 +3473,8 @@ def main():
     certificate_detail_payloads.update(historical_payloads)
     algorithms_map.update(historical_algorithms)
 
+    extraction_metrics = build_extraction_metrics(active_stats, historical_stats)
+
     # Prepare output directory
     output_dir = "api"
 
@@ -2858,6 +3490,8 @@ def main():
         "modules_in_process_source": MODULES_IN_PROCESS_URL,
         "algorithm_source": algorithm_source,
         "algorithm_cache_version": ALGORITHM_CACHE_VERSION,
+        "algorithm_extraction_schema_version": ALGORITHM_EXTRACTION_SCHEMA_VERSION,
+        "extraction_metrics": extraction_metrics,
         "version": "3.0"
     }
 
@@ -2910,9 +3544,18 @@ def main():
         algorithms_summary["metadata"] = {
             "generated_at": metadata["generated_at"],
             "total_certificates_processed": len(algorithms_map),
-            "source": algorithm_source
+            "source": algorithm_source,
+            "algorithm_source": algorithm_source,
+            "algorithm_cache_version": ALGORITHM_CACHE_VERSION,
+            "algorithm_extraction_schema_version": ALGORITHM_EXTRACTION_SCHEMA_VERSION,
+            "extraction_metrics": extraction_metrics["combined"],
         }
         save_json(algorithms_summary, f"{output_dir}/algorithms.json")
+    else:
+        algorithms_path = Path(output_dir) / "algorithms.json"
+        if algorithms_path.exists():
+            algorithms_path.unlink()
+            print(f"Removed stale: {algorithms_path}")
 
     # Save metadata separately for quick access
     save_json(metadata, f"{output_dir}/metadata.json")
@@ -2942,6 +3585,18 @@ def main():
     ).items():
         save_text(content, path)
 
+    print("Generating JSON Schema artifacts...")
+    schema_artifacts = generate_json_schema_artifacts(algorithms_summary)
+    for path, schema in schema_artifacts.items():
+        save_json(schema, path)
+    schema_dir = Path(output_dir) / "schemas"
+    if schema_dir.exists():
+        expected_schema_paths = {Path(path) for path in schema_artifacts}
+        for stale_schema in schema_dir.glob("*.schema.json"):
+            if stale_schema not in expected_schema_paths:
+                stale_schema.unlink()
+                print(f"Removed stale: {stale_schema}")
+
     print("\n" + "=" * 60)
     print("Scraping completed successfully!")
     print("=" * 60)
@@ -2967,12 +3622,14 @@ def main():
         print(
             "  - Active algorithm reuse: "
             f"{active_stats['pdf_reused']} reused, {active_stats['pdf_refreshed']} refreshed, "
-            f"{active_stats['pdf_failed']} failed, {active_stats['algorithm_misses']} misses"
+            f"{active_stats['pdf_failed']} failed, {active_stats['pdf_cache_hits']} PDF cache hits, "
+            f"{active_stats['algorithm_misses']} misses"
         )
         print(
             "  - Historical algorithm reuse: "
             f"{historical_stats['pdf_reused']} reused, {historical_stats['pdf_refreshed']} refreshed, "
-            f"{historical_stats['pdf_failed']} failed, {historical_stats['algorithm_misses']} misses"
+            f"{historical_stats['pdf_failed']} failed, {historical_stats['pdf_cache_hits']} PDF cache hits, "
+            f"{historical_stats['algorithm_misses']} misses"
         )
     print(f"  - OpenAPI spec: openapi.json")
     print(f"\nOutput files saved to: {output_dir}/")
diff --git a/test_scraper.py b/test_scraper.py
index db8fc9089..71593e048 100644
--- a/test_scraper.py
+++ b/test_scraper.py
@@ -4,6 +4,7 @@
 Tests the parsing logic with sample HTML.
 """
 
+import asyncio
 import json
 import sys
 import tempfile
@@ -11,22 +12,37 @@
 from types import SimpleNamespace
 from scraper import (
     ALGORITHM_CACHE_VERSION,
+    ALGORITHM_EXTRACTION_SCHEMA_VERSION,
+    build_algorithm_extraction_provenance,
     build_certificate_fingerprint,
+    build_extraction_metrics,
     build_index_payload,
     extract_legacy_algorithm_section,
     extract_text_from_crawl4ai_process_result,
     extract_text_from_crawl4ai_html,
+    fetch_policy_pdf_bytes,
+    generate_json_schema_artifacts,
     generate_openapi_spec,
     generate_text_artifacts,
     parse_algorithms_from_policy_markdown,
     parse_algorithms_from_policy_text,
     parse_certificate_detail_page,
     parse_modules_table,
+    process_certificate_record,
     prune_orphan_certificate_details,
     select_algorithm_source,
     should_reuse_certificate_detail,
     should_reuse_cached_algorithms,
 )
+from validate_api import validate_api
+
+
+FIXTURE_DIR = Path(__file__).parent / "tests" / "fixtures" / "nist_security_policies"
+
+
+def load_policy_fixture(name: str) -> str:
+    """Return the UTF-8 decoded contents of the fixture file ``name`` located under FIXTURE_DIR."""
+    return FIXTURE_DIR.joinpath(name).read_text(encoding="utf-8")
 
 
 def test_parse_simple_table():
@@ -487,6 +503,44 @@ def test_extract_legacy_algorithm_section_prefers_body_over_toc():
     print("✓ Legacy algorithm section TOC preference test passed")
 
 
+def test_parse_real_world_fips_140_3_policy_fixture():
+    """Parse the checked-in FIPS 140-3 fixture (certificate 5260) and pin the detailed entries and categories it yields."""
+    policy_text = load_policy_fixture("5260_fips_140_3_algorithms.txt")
+
+    detailed, categories = parse_algorithms_from_policy_text(policy_text)
+
+    assert any("AES-CBC" in entry for entry in detailed), "Expected AES-CBC from FIPS 140-3 fixture"  # substring match: the exact entry format is not pinned
+    assert any("HMAC SHA2-256" in entry for entry in detailed), "Expected HMAC from FIPS 140-3 fixture"
+    assert any("CTR_DRBG" in entry for entry in detailed), "Expected DRBG from FIPS 140-3 fixture"
+    assert categories == ["AES", "DRBG", "HMAC", "SHA"], "Expected normalized FIPS 140-3 categories"  # exact list: order and duplicates both matter
+
+    print("✓ Real-world FIPS 140-3 fixture parsing test passed")
+
+
+def test_parse_real_world_fips_140_2_policy_fixture():
+    """Parse the checked-in FIPS 140-2 fixture (certificate 5152): expect coarse categories only, with no detailed entries."""
+    policy_text = load_policy_fixture("5152_fips_140_2_algorithms.txt")
+
+    detailed, categories = parse_algorithms_from_policy_text(policy_text)
+
+    assert detailed == [], "Legacy FIPS 140-2 fixture should use coarse categories"
+    assert categories == [  # exact list: order and duplicates both matter
+        "AES",
+        "DRBG",
+        "ECDSA",
+        "HMAC",
+        "KAS",
+        "KDF",
+        "RSA",
+        "SHS",
+        "SSH",
+        "TLS",
+    ], "Expected normalized FIPS 140-2 categories"
+    assert "DES" not in categories, "Allowed/non-approved section must not leak into approved categories"  # NOTE(review): already implied by the exact-list assert above; the fixture's allowed section lists "Triple-DES"
+
+    print("✓ Real-world FIPS 140-2 fixture parsing test passed")
+
+
 def test_parse_algorithms_from_policy_markdown():
     """Test parsing algorithm tables from policy markdown output."""
     markdown = """
@@ -664,6 +718,150 @@ def test_should_reuse_cached_algorithms():
     print("✓ Algorithm cache reuse test passed")
 
 
+def test_algorithm_extraction_provenance_and_metrics():
+    """Check the provenance record built for a fallback extraction, then the combined counters from build_extraction_metrics."""
+    provenance = build_algorithm_extraction_provenance(
+        "crawl4ai",  # configured source (asserted below as provenance["configured_source"])
+        "parsed",  # presumably the extraction status; not asserted in this test
+        "security_policy_pdf",  # source that produced the result (asserted below as provenance["source"])
+        "https://csrc.nist.gov/example.pdf",
+        ["AES", "HMAC"],  # two categories -> algorithm_count == 2 below
+        ["AES-CBC A1", "HMAC SHA2-256 A1"],  # two detailed entries -> detailed_algorithm_count == 2 below
+        cached=False,
+        fallback_used=True,
+        attempts=[
+            {"source": "crawl4ai", "url": "https://csrc.nist.gov/example.pdf", "status": "no_algorithms"},
+            {"source": "security_policy_pdf", "url": "https://csrc.nist.gov/example.pdf", "status": "parsed"},
+        ],
+    )
+
+    assert provenance["schema_version"] == ALGORITHM_EXTRACTION_SCHEMA_VERSION, "Provenance schema version mismatch"
+    assert provenance["configured_source"] == "crawl4ai", "Configured source should be recorded"
+    assert provenance["source"] == "security_policy_pdf", "Actual extraction source should be recorded"
+    assert provenance["fallback_used"] is True, "Fallback usage should be recorded"
+    assert provenance["algorithm_count"] == 2, "Algorithm category count mismatch"
+    assert provenance["detailed_algorithm_count"] == 2, "Detailed algorithm count mismatch"
+    assert len(provenance["attempts"]) == 2, "Attempt provenance should be retained for detail records"
+
+    active_stats = {"html_reused": 3, "algorithm_successes": 2, "algorithm_fallbacks": 1}  # sparse: omits the counters historical_stats carries
+    historical_stats = {"html_refreshed": 4, "algorithm_misses": 1}  # sparse: omits the counters active_stats carries
+    metrics = build_extraction_metrics(active_stats, historical_stats)
+    assert metrics["combined"]["html_reused"] == 3, "Combined metrics should include active counters"
+    assert metrics["combined"]["html_refreshed"] == 4, "Combined metrics should include historical counters"
+    assert metrics["combined"]["algorithm_successes"] == 2, "Combined metrics should include successes"
+    assert metrics["combined"]["algorithm_misses"] == 1, "Combined metrics should include misses"
+    assert "concurrency" in metrics, "Extraction metrics should record concurrency settings"
+
+    print("✓ Algorithm provenance and metrics test passed")
+
+
+def test_fetch_policy_pdf_bytes_reuses_in_run_cache():
+    """Fetch the same policy URL twice through one shared cache and expect a single client call plus a cache hit on the second fetch."""
+    class FakeResponse:
+        status_code = 200  # NOTE(review): attribute set presumably mirrors what fetch_policy_pdf_bytes reads from a response; confirm
+        headers = {}
+        text = ""
+        content = b"%PDF-1.7 fixture"
+
+        def raise_for_status(self):
+            return None
+
+    class FakeClient:
+        def __init__(self):
+            self.calls = 0  # number of get() invocations, asserted at the end
+
+        async def get(self, url):
+            self.calls += 1
+            await asyncio.sleep(0)  # suspend once so get() behaves like a real awaitable request
+            return FakeResponse()
+
+    async def scenario():
+        client = FakeClient()
+        pdf_cache = {}  # shared by both calls below; starts empty
+        pdf_cache_lock = asyncio.Lock()
+        first_bytes, first_hit = await fetch_policy_pdf_bytes(
+            client,
+            "https://csrc.nist.gov/example.pdf",
+            pdf_cache,
+            pdf_cache_lock,
+        )
+        second_bytes, second_hit = await fetch_policy_pdf_bytes(
+            client,
+            "https://csrc.nist.gov/example.pdf",
+            pdf_cache,
+            pdf_cache_lock,
+        )
+        return client.calls, first_bytes, first_hit, second_bytes, second_hit
+
+    calls, first_bytes, first_hit, second_bytes, second_hit = asyncio.run(scenario())
+
+    assert calls == 1, "Expected one network fetch for repeated policy URL"
+    assert first_bytes == b"%PDF-1.7 fixture", "First PDF fetch returned unexpected bytes"
+    assert second_bytes == first_bytes, "Second PDF fetch should reuse cached bytes"
+    assert first_hit is False, "First PDF fetch should not be a cache hit"
+    assert second_hit is True, "Second PDF fetch should be a cache hit"
+
+    print("✓ Policy PDF cache reuse test passed")
+
+
+def test_process_certificate_record_applies_cached_algorithm_provenance():
+    """Run process_certificate_record with prior-run data and expect cached algorithms plus provenance on both outputs."""
+    module = {
+        "Certificate Number": "5238",
+        "Vendor Name": "SUSE LLC",
+        "Module Name": "SUSE Linux Enterprise OpenSSL 1 Cryptographic Module",
+        "Module Type": "Software",
+        "Validation Date": "04/10/2026",
+        "security_policy_url": "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5238.pdf",
+        "certificate_detail_url": "https://csrc.nist.gov/projects/cryptographic-module-validation-program/certificate/5238",
+    }
+    previous_detail = {
+        "certificate_number": "5238",
+        "software_versions": "3.0.9",
+        "hardware_versions": None,
+        "firmware_versions": None,
+        "security_policy_url": module["security_policy_url"],
+        "algorithms": ["AES", "HMAC"],
+        "algorithms_detailed": ["AES-CBC A1", "HMAC SHA2-256 A1"],
+        "algorithm_extraction": {  # partial on purpose: the asserts below expect status/cached/algorithm_count on the outputs
+            "source": "crawl4ai",
+            "source_url": module["security_policy_url"],
+        },
+    }
+    previous_metadata = {
+        "algorithm_source": "crawl4ai",
+        "algorithm_cache_version": ALGORITHM_CACHE_VERSION,
+    }
+
+    module_out, detail_payload, categories, stats = asyncio.run(
+        process_certificate_record(
+            module,
+            "active",  # presumably the dataset label; confirm against the signature
+            "2026-04-12T03:10:00.961597Z",  # presumably the run's generated_at timestamp; confirm
+            "crawl4ai",  # same value as previous_metadata["algorithm_source"]
+            module,  # NOTE(review): same dict passed a second time, presumably as the previous-run module row; confirm
+            previous_detail,
+            previous_metadata,
+            None,  # NOTE(review): presumably the HTTP client, unused on the cached path; confirm
+            asyncio.Semaphore(1),
+            asyncio.Semaphore(1),
+            {},
+            asyncio.Lock(),
+            {},
+        )
+    )
+
+    assert categories == ["AES", "HMAC"], "Cached categories should be reused"
+    assert module_out["algorithm_extraction"]["status"] == "cached", "Module should record cached extraction status"
+    assert module_out["algorithm_extraction"]["source"] == "crawl4ai", "Cached source should be preserved"
+    assert detail_payload["algorithm_extraction"]["cached"] is True, "Detail should record cache provenance"
+    assert detail_payload["algorithm_extraction"]["algorithm_count"] == 2, "Detail algorithm count mismatch"
+    assert stats["pdf_reused"] == 1, "Cached algorithm reuse should increment pdf_reused"
+    assert stats["algorithm_cache_hits"] == 1, "Cached algorithm reuse should increment cache hits"
+
+    print("✓ Cached algorithm provenance application test passed")
+
+
 def test_prune_orphan_certificate_details():
     """Test that stale certificate detail files are removed only for missing certs."""
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -682,6 +880,15 @@ def test_prune_orphan_certificate_details():
     print("✓ Orphan certificate cleanup test passed")
 
 
+def test_validate_generated_api_artifacts():
+    """Validate the checked-in API artifacts, rooted at this file's directory (like FIXTURE_DIR) instead of the CWD."""
+    errors = validate_api(Path(__file__).parent)
+
+    assert errors == [], "Generated API artifact validation failed:\n" + "\n".join(errors[:20])
+
+    print("✓ Generated API artifact validation test passed")
+
+
 def test_generate_agent_docs():
     """Test the generated agent-friendly documentation artifacts."""
     metadata = {
@@ -694,6 +901,12 @@ def test_generate_agent_docs():
         "source": "https://csrc.nist.gov/projects/cryptographic-module-validation-program/validated-modules/search",
         "modules_in_process_source": "https://csrc.nist.gov/Projects/cryptographic-module-validation-program/modules-in-process/modules-in-process-list",
         "algorithm_source": "crawl4ai",
+        "algorithm_cache_version": ALGORITHM_CACHE_VERSION,
+        "algorithm_extraction_schema_version": ALGORITHM_EXTRACTION_SCHEMA_VERSION,
+        "extraction_metrics": build_extraction_metrics(
+            {"html_reused": 1, "pdf_reused": 1, "algorithm_cache_hits": 1},
+            {"html_refreshed": 1, "pdf_refreshed": 1, "algorithm_successes": 1},
+        ),
         "version": "3.0",
     }
     sample_module = {
@@ -711,6 +924,18 @@ def test_generate_agent_docs():
         "certificate_detail_url": "https://csrc.nist.gov/projects/cryptographic-module-validation-program/certificate/5238",
         "detail_available": True,
         "description": "OpenSSL is an open-source library of various cryptographic algorithms written mainly in C.",
+        "algorithm_extraction": {
+            "schema_version": ALGORITHM_EXTRACTION_SCHEMA_VERSION,
+            "status": "cached",
+            "configured_source": "crawl4ai",
+            "source": "crawl4ai",
+            "source_url": "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5238.pdf",
+            "cached": True,
+            "fallback_used": False,
+            "cache_version": ALGORITHM_CACHE_VERSION,
+            "algorithm_count": 3,
+            "detailed_algorithm_count": 0,
+        },
     }
     sample_detail = {
         "certificate_number": "5238",
@@ -744,6 +969,25 @@ def test_generate_agent_docs():
             {"date": "4/10/2026", "type": "Initial", "lab": "Example Lab"}
         ],
         "algorithms": ["AES", "HMAC", "RSA"],
+        "algorithm_extraction": {
+            "schema_version": ALGORITHM_EXTRACTION_SCHEMA_VERSION,
+            "status": "parsed",
+            "configured_source": "crawl4ai",
+            "source": "crawl4ai",
+            "source_url": "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5238.pdf",
+            "cached": False,
+            "fallback_used": False,
+            "cache_version": ALGORITHM_CACHE_VERSION,
+            "algorithm_count": 3,
+            "detailed_algorithm_count": 12,
+            "attempts": [
+                {
+                    "source": "crawl4ai",
+                    "url": "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5238.pdf",
+                    "status": "parsed",
+                }
+            ],
+        },
     }
     algorithms_summary = {
         "total_unique_algorithms": 45,
@@ -766,11 +1010,28 @@ def test_generate_agent_docs():
     assert "api/docs.md" in artifacts, "Missing Markdown API docs artifact"
     assert "api/algorithms.json" in artifacts["llms.txt"], "llms.txt should reference algorithms endpoint when available"
     assert 'href="api/docs.md"' in artifacts["index.html"], "Homepage should link to api/docs.md"
+    assert 'href="api/schemas/index.schema.json"' in artifacts["index.html"], "Homepage should link to JSON schemas"
     assert "GET api/certificates/{certificate}.json" in artifacts["api/docs.md"], "API docs should include certificate detail endpoint"
+    assert "GET api/schemas/index.schema.json" in artifacts["api/docs.md"], "API docs should include JSON schema endpoint"
+    assert "algorithm_extraction" in artifacts["api/docs.md"], "API docs should describe extraction provenance"
 
     index_payload = build_index_payload(metadata, algorithms_summary)
     assert index_payload["documentation"]["llms_full_txt"] == "/llms-full.txt", "Index payload should advertise llms-full.txt"
+    assert index_payload["documentation"]["json_schemas"] == "/api/schemas/index.schema.json", "Index payload should advertise JSON schemas"
+    assert index_payload["schemas"]["certificate_detail"] == "/api/schemas/certificate-detail.schema.json", "Index payload should advertise certificate detail schema"
     assert index_payload["features"]["markdown_api_docs"] is True, "Index payload should advertise Markdown docs support"
+    assert index_payload["features"]["algorithm_extraction_provenance"] is True, "Index payload should advertise extraction provenance"
+    assert index_payload["features"]["extraction_metrics"] is True, "Index payload should advertise extraction metrics"
+    assert index_payload["features"]["json_schemas"] is True, "Index payload should advertise JSON schema support"
+
+    schema_artifacts = generate_json_schema_artifacts(algorithms_summary)
+    assert "api/schemas/modules.schema.json" in schema_artifacts, "Missing modules JSON schema"
+    assert "api/schemas/module-in-process.schema.json" in schema_artifacts, "Missing module-in-process JSON schema"
+    assert "api/schemas/certificate-detail.schema.json" in schema_artifacts, "Missing certificate detail JSON schema"
+    assert "api/schemas/algorithms.schema.json" in schema_artifacts, "Missing algorithms JSON schema"
+    assert schema_artifacts["api/schemas/modules-in-process.schema.json"]["properties"]["modules_in_process"]["items"]["$ref"] == "/api/schemas/module-in-process.schema.json", "Modules-in-process response should use its own row schema"
+    assert schema_artifacts["api/schemas/module.schema.json"]["properties"]["algorithm_extraction"]["type"] == "object", "Module schema should include extraction provenance"
+    assert schema_artifacts["api/schemas/certificate-detail.schema.json"]["properties"]["certificate"]["properties"]["algorithm_extraction"]["type"] == "object", "Certificate detail schema should include extraction provenance"
 
     openapi = generate_openapi_spec(
         [sample_module],
@@ -782,10 +1043,14 @@ def test_generate_agent_docs():
     assert openapi["components"]["schemas"]["Module"]["properties"]["detail_available"]["type"] == "boolean", "detail_available should be typed as boolean"
     module_properties = openapi["components"]["schemas"]["Module"]["properties"]
     certificate_properties = openapi["components"]["schemas"]["CertificateDetail"]["properties"]
+    metadata_properties = openapi["components"]["schemas"]["Metadata"]["properties"]
     for key in ("software_versions", "hardware_versions", "firmware_versions"):
         assert key in module_properties, f"OpenAPI module schema should include {key}"
         assert key in certificate_properties, f"OpenAPI certificate detail schema should include {key}"
         assert module_properties[key]["nullable"] is True, f"OpenAPI module schema should mark {key} nullable"
+    assert "algorithm_extraction" in module_properties, "OpenAPI module schema should include algorithm_extraction"
+    assert "algorithm_extraction" in certificate_properties, "OpenAPI certificate schema should include algorithm_extraction"
+    assert "extraction_metrics" in metadata_properties, "OpenAPI metadata schema should include extraction metrics"
 
     print("✓ Agent-friendly docs generation test passed")
 
@@ -808,13 +1073,19 @@ def main():
         test_parse_algorithms_from_policy_text()
         test_parse_algorithms_from_legacy_policy_text()
         test_extract_legacy_algorithm_section_prefers_body_over_toc()
+        test_parse_real_world_fips_140_3_policy_fixture()
+        test_parse_real_world_fips_140_2_policy_fixture()
         test_parse_algorithms_from_policy_markdown()
         test_extract_text_from_crawl4ai_html()
         test_extract_text_from_crawl4ai_process_result()
         test_select_algorithm_source()
         test_build_certificate_fingerprint()
         test_should_reuse_cached_algorithms()
+        test_algorithm_extraction_provenance_and_metrics()
+        test_fetch_policy_pdf_bytes_reuses_in_run_cache()
+        test_process_certificate_record_applies_cached_algorithm_provenance()
         test_prune_orphan_certificate_details()
+        test_validate_generated_api_artifacts()
         test_generate_agent_docs()
         
         print()
diff --git a/tests/fixtures/nist_security_policies/5152_fips_140_2_algorithms.txt b/tests/fixtures/nist_security_policies/5152_fips_140_2_algorithms.txt
new file mode 100644
index 000000000..e56801a1e
--- /dev/null
+++ b/tests/fixtures/nist_security_policies/5152_fips_140_2_algorithms.txt
@@ -0,0 +1,21 @@
+Source: https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5152.pdf
+Certificate: 5152
+Standard: FIPS 140-2
+
+3.4 Algorithms
+Table 10 lists the FIPS Approved cryptographic algorithms used by the module.
+Algorithm
+AES Cert. #A3424
+DRBG Cert. #A3424
+ECDSA Cert. #A3424
+HMAC Cert. #A3424
+KAS Cert. #A3424
+KDF TLS
+RSA Cert. #A3424
+SHS Cert. #A3424
+SSH KDF
+
+3.5 Allowed Algorithms
+Table 11 describes the non-approved but allowed algorithms in FIPS mode.
+Algorithm
+Triple-DES
diff --git a/tests/fixtures/nist_security_policies/5260_fips_140_3_algorithms.txt b/tests/fixtures/nist_security_policies/5260_fips_140_3_algorithms.txt
new file mode 100644
index 000000000..4aaa824ce
--- /dev/null
+++ b/tests/fixtures/nist_security_policies/5260_fips_140_3_algorithms.txt
@@ -0,0 +1,17 @@
+Source: https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5260.pdf
+Certificate: 5260
+Standard: FIPS 140-3
+
+2.5 Algorithms
+Approved Algorithms:
+Cipher
+Algorithm CAVP Cert Properties Reference
+AES-CBC A4593 Direction - Decrypt, Encrypt Key Length - 128, 192, 256 SP 800-38A
+AES-GCM A4593 Direction - Decrypt, Encrypt Key Length - 128, 192, 256 IV Generation - Internal SP 800-38D
+Message Authentication
+HMAC SHA2-256 A4593 Message Authentication FIPS 198-1
+Message Digest
+SHA2-256 A4593 Message Digest FIPS 180-4
+Random Bit Generation
+CTR_DRBG A4593 Deterministic Random Bit Generation SP 800-90A
+2.6 Security Function Implementations
diff --git a/validate_api.py b/validate_api.py
new file mode 100644
index 000000000..489dda664
--- /dev/null
+++ b/validate_api.py
@@ -0,0 +1,508 @@
+#!/usr/bin/env python3
+"""Validate generated static API artifacts for internal consistency."""
+
+import argparse
+import json
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
+
+
+REQUIRED_TOP_LEVEL_FILES = (  # root-relative artifacts whose existence validate_api() checks first
+    "api/modules.json",
+    "api/historical-modules.json",
+    "api/modules-in-process.json",
+    "api/metadata.json",
+    "api/index.json",
+    "openapi.json",
+    "llms.txt",
+    "llms-full.txt",
+    "api/docs.md",
+    "index.html",
+)
+
+DETAIL_REQUIRED_FIELDS = (  # keys every detail file's "certificate" object must contain
+    "certificate_number",
+    "dataset",
+    "generated_at",
+    "nist_page_url",
+    "certificate_detail_url",
+    "security_policy_url",
+    "vendor_name",
+    "module_name",
+    "standard",
+    "status",
+    "related_files",
+    "validation_history",
+    "vendor",
+)
+
+CURRENT_SCHEMA_DETAIL_FIELDS = (  # extra "certificate" keys required only with require_current_schema
+    "software_versions",
+    "hardware_versions",
+    "firmware_versions",
+    "algorithm_extraction",
+)
+
+ALGORITHM_EXTRACTION_REQUIRED_FIELDS = (  # keys every algorithm_extraction object must contain
+    "schema_version",
+    "status",
+    "configured_source",
+    "source",
+    "source_url",
+    "cached",
+    "fallback_used",
+    "cache_version",
+    "algorithm_count",
+    "detailed_algorithm_count",
+)
+
+ALGORITHM_EXTRACTION_STATUSES = {"parsed", "cached", "miss", "skipped"}  # allowed algorithm_extraction.status values
+
+JSON_SCHEMA_FILES = (  # schema files checked with require_current_schema; algorithms.schema.json is handled separately
+    "api/schemas/index.schema.json",
+    "api/schemas/metadata.schema.json",
+    "api/schemas/module.schema.json",
+    "api/schemas/module-in-process.schema.json",
+    "api/schemas/modules.schema.json",
+    "api/schemas/historical-modules.schema.json",
+    "api/schemas/modules-in-process.schema.json",
+    "api/schemas/certificate-detail.schema.json",
+)
+
+
+def load_json(path: Path, errors: List[str]) -> Optional[Dict]:
+    """Return the JSON object stored at path, or None after recording why it is unusable."""
+    try:
+        raw_text = path.read_text(encoding="utf-8")
+        document = json.loads(raw_text)
+    except Exception as exc:
+        errors.append(f"{path}: failed to load JSON: {exc}")
+        return None
+
+    if isinstance(document, dict):
+        return document
+    errors.append(f"{path}: top-level JSON value must be an object")
+    return None
+
+
+def parse_certificate_number(record: Dict) -> Optional[int]:
+    """Return the first all-decimal certificate number found in record, else None."""
+    for key in ("Certificate Number", "certificate_number"):
+        value = str(record.get(key, "")).strip()
+        if value.isdecimal():  # isdigit() also accepts characters such as "²" that int() rejects
+            return int(value)
+    return None
+
+
+def add_error(errors: List[str], condition: bool, message: str) -> None:
+    """Record message in errors unless condition is truthy."""
+    failures = [] if condition else [message]
+    errors.extend(failures)
+
+
+def count_certificate_algorithm_pairs(cert_algorithms: Dict[int, List[str]]) -> int:
+    """Return the total number of (certificate, algorithm) pairs across all module rows."""
+    return sum(map(len, cert_algorithms.values()))
+
+
+def build_expected_algorithm_index(cert_algorithms: Dict[int, List[str]]) -> Dict[str, Set[int]]:
+    """Invert certificate -> algorithms into algorithm -> set of certificate numbers."""
+    index: Dict[str, Set[int]] = {}
+    for cert_number, names in cert_algorithms.items():
+        for name in names:
+            index[name] = index.get(name, set()) | {cert_number}
+    return index
+
+
+def validate_algorithm_extraction(
+    record: Dict,
+    label: str,
+    require_current_schema: bool,
+    errors: List[str],
+) -> None:
+    """Check the optional algorithm_extraction provenance object carried by record.
+
+    A missing object is reported only when require_current_schema is set. For a present
+    object, the required keys and the status value are checked, and each integer count is
+    compared with the length of the list it describes; non-integer counts are not compared.
+    """
+    provenance = record.get("algorithm_extraction")
+    if provenance is None and require_current_schema:
+        errors.append(f"{label}: missing algorithm_extraction")
+    if provenance is None:
+        return
+
+    if not isinstance(provenance, dict):
+        errors.append(f"{label}: algorithm_extraction must be an object")
+        return
+
+    for required in ALGORITHM_EXTRACTION_REQUIRED_FIELDS:
+        add_error(errors, required in provenance, f"{label}: algorithm_extraction missing {required}")
+
+    state = provenance.get("status")
+    state_message = f"{label}: invalid algorithm_extraction.status {state!r}"
+    add_error(errors, state in ALGORITHM_EXTRACTION_STATUSES, state_message)
+
+    # (count key, list key, message reported when the declared count and the list length differ)
+    count_checks = (
+        ("algorithm_count", "algorithms", f"{label}: algorithm_extraction.algorithm_count does not match algorithms length"),
+        ("detailed_algorithm_count", "algorithms_detailed", f"{label}: algorithm_extraction.detailed_algorithm_count does not match algorithms_detailed length"),
+    )
+
+    for count_key, list_key, mismatch_message in count_checks:
+        declared = provenance.get(count_key)
+        if not isinstance(declared, int):
+            continue
+        actual = len(record.get(list_key) or [])
+        add_error(errors, declared == actual, mismatch_message)
+
+
+def validate_module_rows(
+    rows: Iterable[Dict],
+    dataset: str,
+    errors: List[str],
+    require_current_schema: bool,
+) -> Tuple[Dict[int, str], Dict[int, List[str]]]:
+    """Validate module rows; return (certificate -> dataset, certificate -> algorithms) maps."""
+    cert_datasets: Dict[int, str] = {}
+    cert_algorithms: Dict[int, List[str]] = {}
+
+    for index, row in enumerate(rows):
+        label = f"{dataset} modules[{index}]"
+        # A row that is not a JSON object cannot carry a certificate number; report it rather than crash.
+        cert_number = parse_certificate_number(row) if isinstance(row, dict) else None
+        if cert_number is None:
+            errors.append(f"{label}: missing numeric Certificate Number")
+            continue
+        if cert_number in cert_datasets:
+            errors.append(f"{label}: duplicate certificate {cert_number}")
+        cert_datasets[cert_number] = dataset
+        for field in ("Vendor Name", "Module Name"):
+            add_error(errors, field in row, f"{label}: missing {field}")
+        for field in ("security_policy_url", "certificate_detail_url"):
+            add_error(errors, bool(row.get(field)), f"{label}: missing {field}")
+        add_error(errors, row.get("detail_available") is True, f"{label}: detail_available is not true")
+
+        algorithms = row.get("algorithms") or []
+        if not isinstance(algorithms, list):
+            errors.append(f"{label}: algorithms must be a list")
+        elif algorithms:  # only non-empty, well-formed lists feed the cross-file algorithm checks
+            cert_algorithms[cert_number] = algorithms
+        validate_algorithm_extraction(row, label, require_current_schema, errors)
+
+    return cert_datasets, cert_algorithms
+
+
+def validate_certificate_details(
+    detail_dir: Path,
+    expected_datasets: Dict[int, str],
+    expected_algorithms: Dict[int, List[str]],
+    errors: List[str],
+    require_current_schema: bool,
+) -> None:
+    """Validate every *.json detail file in detail_dir against the module lists.
+
+    Also reports certificates lacking a detail file and detail files lacking a module row.
+    """
+    detail_files = sorted(detail_dir.glob("*.json"))
+    found_certificates: Set[int] = set()
+
+    for filepath in detail_files:
+        label = str(filepath)
+        # isdecimal() rather than isdigit(): the latter accepts stems such as "²" that int() rejects.
+        if not filepath.stem.isdecimal():
+            errors.append(f"{label}: certificate detail filename must be numeric")
+            continue
+
+        file_cert_number = int(filepath.stem)
+        payload = load_json(filepath, errors)
+        if payload is None:
+            continue
+
+        metadata = payload.get("metadata")
+        certificate = payload.get("certificate")
+        add_error(errors, isinstance(metadata, dict), f"{label}: metadata must be an object")
+        add_error(errors, isinstance(certificate, dict), f"{label}: certificate must be an object")
+        if not isinstance(certificate, dict):
+            continue
+
+        cert_number = parse_certificate_number(certificate)
+        add_error(errors, cert_number == file_cert_number, f"{label}: certificate_number does not match filename")
+        if cert_number is None:
+            continue
+
+        found_certificates.add(cert_number)
+        expected_dataset = expected_datasets.get(cert_number)
+        add_error(errors, expected_dataset is not None, f"{label}: certificate is not in active or historical modules")
+        add_error(errors, certificate.get("dataset") == expected_dataset, f"{label}: dataset does not match module list")
+
+        for field in DETAIL_REQUIRED_FIELDS:
+            add_error(errors, field in certificate, f"{label}: certificate missing {field}")
+        if require_current_schema:
+            for field in CURRENT_SCHEMA_DETAIL_FIELDS:
+                add_error(errors, field in certificate, f"{label}: certificate missing current schema field {field}")
+
+        add_error(errors, isinstance(certificate.get("related_files"), list), f"{label}: related_files must be a list")
+        add_error(errors, isinstance(certificate.get("validation_history"), list), f"{label}: validation_history must be a list")
+        add_error(errors, isinstance(certificate.get("vendor"), dict), f"{label}: vendor must be an object")
+
+        expected_detail_algorithms = expected_algorithms.get(cert_number, [])
+        actual_detail_algorithms = certificate.get("algorithms") or []
+        add_error(errors, actual_detail_algorithms == expected_detail_algorithms, f"{label}: detail algorithms do not match module row algorithms")
+        validate_algorithm_extraction(certificate, label, require_current_schema, errors)
+
+    missing_details = sorted(set(expected_datasets) - found_certificates)
+    orphan_details = sorted(found_certificates - set(expected_datasets))
+    if missing_details:
+        errors.append(f"api/certificates: missing detail files for {len(missing_details)} certificates; first={missing_details[:5]}")
+    if orphan_details:
+        errors.append(f"api/certificates: found {len(orphan_details)} orphan detail files; first={orphan_details[:5]}")
+
+
+def validate_algorithms_summary(
+    root: Path,
+    metadata: Dict,
+    expected_cert_algorithms: Dict[int, List[str]],
+    errors: List[str],
+) -> None:
+    """Validate api/algorithms.json against module row algorithm fields.
+
+    The metadata coverage count is checked before any early return so a mismatch is never skipped.
+    """
+    algorithms_path = root / "api" / "algorithms.json"
+    expected_total = metadata.get("total_certificates_with_algorithms", 0)
+    add_error(errors, expected_total == len(expected_cert_algorithms), "metadata: total_certificates_with_algorithms mismatch")
+
+    if expected_total == 0:
+        add_error(errors, not algorithms_path.exists(), "api/algorithms.json exists despite zero algorithm coverage")
+        return
+
+    summary = load_json(algorithms_path, errors)
+    if summary is None:
+        return
+
+    algorithms = summary.get("algorithms")
+    add_error(errors, isinstance(algorithms, dict), "api/algorithms.json: algorithms must be an object")
+    if not isinstance(algorithms, dict):
+        return
+
+    expected_index = build_expected_algorithm_index(expected_cert_algorithms)
+    expected_pairs = count_certificate_algorithm_pairs(expected_cert_algorithms)
+    add_error(errors, summary.get("total_unique_algorithms") == len(expected_index), "api/algorithms.json: total_unique_algorithms mismatch")
+    add_error(errors, summary.get("total_certificate_algorithm_pairs") == expected_pairs, "api/algorithms.json: total_certificate_algorithm_pairs mismatch")
+
+    for algorithm, expected_certs in expected_index.items():
+        entry = algorithms.get(algorithm)
+        if not isinstance(entry, dict):
+            errors.append(f"api/algorithms.json: missing algorithm {algorithm}")
+            continue
+        certs = entry.get("certificates")
+        add_error(errors, isinstance(certs, list), f"api/algorithms.json: {algorithm}.certificates must be a list")
+        if not isinstance(certs, list):
+            continue
+        add_error(errors, entry.get("count") == len(certs), f"api/algorithms.json: {algorithm}.count mismatch")
+        add_error(errors, len(certs) == len(set(certs)), f"api/algorithms.json: {algorithm}.certificates has duplicates")
+        add_error(errors, set(certs) == expected_certs, f"api/algorithms.json: {algorithm}.certificates mismatch")
+
+    extra_algorithms = sorted(set(algorithms) - set(expected_index))
+    if extra_algorithms:
+        errors.append(f"api/algorithms.json: unexpected algorithms present: {extra_algorithms[:5]}")
+
+
+def validate_docs_and_index(
+    root: Path,
+    metadata: Dict,
+    has_algorithms: bool,
+    errors: List[str],
+    require_current_schema: bool,
+) -> None:
+    """Structurally validate api/index.json, openapi.json, the docs files and (optionally) the JSON Schemas."""
+    index = load_json(root / "api" / "index.json", errors)
+    if index is not None:  # an empty object loaded fine and must still fail the checks below
+        for key in (
+            "total_modules",
+            "total_historical_modules",
+            "total_modules_in_process",
+            "total_certificates_with_algorithms",
+            "total_certificate_details",
+        ):
+            add_error(errors, index.get(key) == metadata.get(key), f"api/index.json: {key} mismatch")
+        endpoints = index.get("endpoints") or {}
+        add_error(errors, isinstance(endpoints, dict), "api/index.json: endpoints must be an object")
+        if isinstance(endpoints, dict):
+            add_error(errors, ("algorithms" in endpoints) == has_algorithms, "api/index.json: algorithms endpoint presence mismatch")
+        features = index.get("features") or {}
+        if require_current_schema and isinstance(features, dict):
+            add_error(errors, features.get("algorithm_extraction_provenance") is True, "api/index.json: missing algorithm_extraction_provenance feature")
+            add_error(errors, features.get("extraction_metrics") is True, "api/index.json: missing extraction_metrics feature")
+            add_error(errors, features.get("json_schemas") is True, "api/index.json: missing json_schemas feature")
+            schemas = index.get("schemas")
+            add_error(errors, isinstance(schemas, dict), "api/index.json: schemas must be an object")
+
+    openapi = load_json(root / "openapi.json", errors)
+    if openapi is not None:  # an empty object must be reported as missing every path
+        paths = openapi.get("paths") or {}
+        for path in (
+            "/api/index.json",
+            "/api/metadata.json",
+            "/api/modules.json",
+            "/api/historical-modules.json",
+            "/api/modules-in-process.json",
+            "/api/certificates/{certificate}.json",
+        ):
+            add_error(errors, path in paths, f"openapi.json: missing path {path}")
+        add_error(errors, ("/api/algorithms.json" in paths) == has_algorithms, "openapi.json: algorithms path presence mismatch")
+
+    for doc_path, required_text in (
+        ("README.md", "certificates/{certificate}.json"),
+        ("llms.txt", "api/metadata.json"),
+        ("llms-full.txt", "GET api/certificates/{certificate}.json"),
+        ("api/docs.md", "GET api/certificates/{certificate}.json"),
+        ("index.html", "api/metadata.json"),
+    ):
+        path = root / doc_path
+        try:
+            content = path.read_text(encoding="utf-8")
+        except Exception as exc:
+            errors.append(f"{doc_path}: failed to read: {exc}")
+            continue
+        add_error(errors, bool(content.strip()), f"{doc_path}: empty documentation file")
+        add_error(errors, required_text in content, f"{doc_path}: missing expected text {required_text!r}")
+        if require_current_schema and doc_path in {"llms.txt", "api/docs.md", "index.html"}:
+            add_error(errors, "api/schemas/index.schema.json" in content, f"{doc_path}: missing JSON Schema link")
+
+    if require_current_schema:
+        expected_schema_files = list(JSON_SCHEMA_FILES)
+        if has_algorithms:
+            expected_schema_files.append("api/schemas/algorithms.schema.json")
+        for relative_path in expected_schema_files:
+            schema = load_json(root / relative_path, errors)
+            if schema is not None:  # an empty schema object lacks $schema/$id/title and must be flagged
+                add_error(errors, schema.get("$schema") == "https://json-schema.org/draft/2020-12/schema", f"{relative_path}: missing JSON Schema draft marker")
+                add_error(errors, bool(schema.get("$id")), f"{relative_path}: missing $id")
+                add_error(errors, bool(schema.get("title")), f"{relative_path}: missing title")
+        if not has_algorithms:
+            add_error(errors, not (root / "api/schemas/algorithms.schema.json").exists(), "api/schemas/algorithms.schema.json exists despite zero algorithm coverage")
+
+
+def validate_api(
+    root: Path = Path("."),
+    require_current_schema: bool = False,
+    forbid_firecrawl_run_source: bool = False,
+) -> List[str]:
+    """Run every artifact check under root and return the collected error messages.
+
+    Stops early, returning the errors gathered so far, when a core payload cannot be loaded
+    or a module collection is not a list.
+    """
+    errors: List[str] = []
+    root = root.resolve()
+    api_dir = root / "api"
+
+    for relative_path in REQUIRED_TOP_LEVEL_FILES:
+        artifact_present = (root / relative_path).exists()
+        add_error(errors, artifact_present, f"{relative_path}: missing required artifact")
+
+    metadata = load_json(api_dir / "metadata.json", errors)
+    modules_payload = load_json(api_dir / "modules.json", errors)
+    historical_payload = load_json(api_dir / "historical-modules.json", errors)
+    in_process_payload = load_json(api_dir / "modules-in-process.json", errors)
+    core_payloads = (metadata, modules_payload, historical_payload, in_process_payload)
+    if any(not isinstance(payload, dict) for payload in core_payloads):
+        return errors
+    assert metadata is not None and modules_payload is not None and historical_payload is not None and in_process_payload is not None
+
+    embedded_checks = (
+        ("api/modules.json", modules_payload),
+        ("api/historical-modules.json", historical_payload),
+        ("api/modules-in-process.json", in_process_payload),
+    )
+    for label, payload in embedded_checks:
+        add_error(errors, payload.get("metadata") == metadata, f"{label}: embedded metadata does not match api/metadata.json")
+
+    modules = modules_payload.get("modules")
+    historical_modules = historical_payload.get("modules")
+    modules_in_process = in_process_payload.get("modules_in_process")
+    add_error(errors, isinstance(modules, list), "api/modules.json: modules must be a list")
+    add_error(errors, isinstance(historical_modules, list), "api/historical-modules.json: modules must be a list")
+    add_error(errors, isinstance(modules_in_process, list), "api/modules-in-process.json: modules_in_process must be a list")
+    if not (isinstance(modules, list) and isinstance(historical_modules, list) and isinstance(modules_in_process, list)):
+        return errors
+
+    add_error(errors, metadata.get("total_modules") == len(modules), "metadata: total_modules mismatch")
+    add_error(errors, metadata.get("total_historical_modules") == len(historical_modules), "metadata: total_historical_modules mismatch")
+    add_error(errors, metadata.get("total_modules_in_process") == len(modules_in_process), "metadata: total_modules_in_process mismatch")
+
+    active_datasets, active_algorithms = validate_module_rows(modules, "active", errors, require_current_schema)
+    historical_datasets, historical_algorithms = validate_module_rows(historical_modules, "historical", errors, require_current_schema)
+    overlapping_certs = sorted(cert for cert in active_datasets if cert in historical_datasets)
+    if overlapping_certs:
+        errors.append(f"active/historical modules: duplicate certificate numbers across datasets: {overlapping_certs[:5]}")
+
+    # Historical rows win on a certificate-number collision because they are merged last.
+    expected_datasets = dict(active_datasets)
+    expected_datasets.update(historical_datasets)
+    expected_algorithms = dict(active_algorithms)
+    expected_algorithms.update(historical_algorithms)
+    add_error(errors, metadata.get("total_certificate_details") == len(expected_datasets), "metadata: total_certificate_details mismatch")
+
+    if require_current_schema:
+        add_error(errors, "algorithm_extraction_schema_version" in metadata, "metadata: missing algorithm_extraction_schema_version")
+        add_error(errors, "extraction_metrics" in metadata, "metadata: missing extraction_metrics")
+    if forbid_firecrawl_run_source:
+        add_error(errors, metadata.get("algorithm_source") != "firecrawl", "metadata: algorithm_source must not be firecrawl")
+
+    detail_dir = api_dir / "certificates"
+    validate_certificate_details(detail_dir, expected_datasets, expected_algorithms, errors, require_current_schema)
+    validate_algorithms_summary(root, metadata, expected_algorithms, errors)
+    has_algorithms = bool(expected_algorithms)
+    validate_docs_and_index(root, metadata, has_algorithms, errors, require_current_schema)
+
+    if not forbid_firecrawl_run_source or not metadata.get("total_certificates_with_algorithms", 0):
+        return errors
+
+    algorithms_metadata = load_json(api_dir / "algorithms.json", errors)
+    if algorithms_metadata:
+        nested_metadata = algorithms_metadata.get("metadata") or {}
+        add_error(errors, nested_metadata.get("source") != "firecrawl", "api/algorithms.json: metadata.source must not be firecrawl")
+        add_error(errors, nested_metadata.get("algorithm_source") != "firecrawl", "api/algorithms.json: metadata.algorithm_source must not be firecrawl")
+    return errors
+
+
+def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
+    """Parse command-line arguments for the validator.
+
+    argv is handed to argparse unchanged; None makes argparse read sys.argv[1:].
+    """
+    boolean_flags = (
+        ("--require-current-schema", "Require fields generated by the current scraper schema, including extraction provenance"),
+        ("--forbid-firecrawl-run-source", "Fail if the current run metadata says algorithm extraction used Firecrawl"),
+    )
+
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--root", default=".", help="Repository root containing generated API artifacts")
+    for flag, help_text in boolean_flags:
+        cli.add_argument(flag, action="store_true", help=help_text)
+    return cli.parse_args(argv)
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    """Validate the artifacts selected by argv, print the outcome, and return the exit status (0 or 1)."""
+    options = parse_args(argv)
+    problems = validate_api(
+        Path(options.root),
+        require_current_schema=options.require_current_schema,
+        forbid_firecrawl_run_source=options.forbid_firecrawl_run_source,
+    )
+    if not problems:
+        print("API artifact validation passed.")
+        return 0
+
+    print("API artifact validation failed:")
+    for problem in problems:
+        print(f"- {problem}")
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status

From 6ee83a15e4810a1c56c28c067c4e0183f6309093 Mon Sep 17 00:00:00 2001
From: Ethan Troy <63926014+ethanolivertroy@users.noreply.github.com>
Date: Thu, 14 May 2026 04:38:37 +0000
Subject: [PATCH 2/2] add pr validation workflow

---
 .github/workflows/validate.yml | 47 ++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 .github/workflows/validate.yml

diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml
new file mode 100644
index 000000000..7b3c02896
--- /dev/null
+++ b/.github/workflows/validate.yml
@@ -0,0 +1,47 @@
+name: Validate
+
+on:  # runs for every pull request and for pushes to main or codex/** branches
+  pull_request:
+  push:
+    branches:
+      - main
+      - 'codex/**'
+
+permissions:  # the job only needs read access to repository contents
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Compile Python files
+        run: |
+          python -m py_compile scraper.py test_scraper.py validate_api.py
+
+      - name: Run scraper tests
+        run: |
+          python test_scraper.py
+
+      - name: Validate checked-in API artifacts  # invoked without flags, so the --require-current-schema checks are not enforced here
+        run: |
+          python validate_api.py
+
+      - name: Check whitespace errors  # NOTE(review): with no revision given this diffs the working tree against the index, so on a fresh checkout it presumably only flags whitespace introduced by earlier steps - confirm intent
+        run: |
+          git diff --check