diff --git a/.github/ETHICS_QUESTIONNAIRE.MD b/.github/ETHICS_QUESTIONNAIRE.MD new file mode 100644 index 0000000..57223a3 --- /dev/null +++ b/.github/ETHICS_QUESTIONNAIRE.MD @@ -0,0 +1,36 @@ +**Ethics & Regulatory Questionnaire** +*This PR cannot be merged until this form is completed.* + +Please reply to this comment and answer all questions below (you can copy-paste and fill it). + +1. Does this change involve any of the following? (check all that apply) + - [ ] Training or fine-tuning of AI/ML models + - [ ] Inference/serving of AI/ML models in production + - [ ] Processing of personal data (PII, health, biometric, financial, children’s data, etc.) + - [ ] Dual-use or military-applicable technology + - [ ] Safety-critical systems (medical device, aviation, automotive, etc.) + - [ ] High-impact algorithmic decision-making (credit, hiring, criminal justice, etc.) + - [ ] None of the above (pure docs, tests, CI, formatting, etc.) + +2. Estimated risk level (your honest assessment) + - [ ] Low – no ethical or regulatory impact + - [ ] Medium – possible fairness/privacy concerns + - [ ] High – potential for serious harm or legal non-compliance + +3. Brief description of any ethical/regulatory impact (or write “None”) + + > + +4. Relevant regulations / standards considered (e.g., EU AI Act, GDPR, HIPAA, NIST AI RMF, export controls, etc.) + List them or write “N/A” + + > + +5. Have mitigation measures been implemented (bias testing, data minimization, consent flows, etc.)? + - [ ] Yes → describe below + - [ ] No + - [ ] Not applicable + + > + +Thank you! The ethics gate will evaluate your answers automatically. 
\ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/questionnaire.yaml b/.github/ISSUE_TEMPLATE/questionnaire.yaml new file mode 100644 index 0000000..b770075 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/questionnaire.yaml @@ -0,0 +1,23 @@ +name: Ethics & Regulatory Questionnaire +description: Required for all PRs with potential ethical/regulatory impact +title: "[Ethics Review] " +body: + - type: checkboxes + attributes: + label: Scope of Change + options: + - label: Involves training or inference of AI/ML models + - label: Processes personal data (PII, health, financial, etc.) + - label: Dual-use potential (could be used in weapons/autonomous systems) + - label: Affects safety-critical systems + - label: Purely documentation / tests / CI changes (safe) + + - type: textarea + attributes: + label: Description of ethical/regulatory impact (if any) + placeholder: Explain who might be harmed, fairness implications, compliance requirements, etc. + + - type: dropdown + attributes: + label: Have you consulted the relevant regulatory framework? 
+ options: ["Yes", "No", "Not applicable"] \ No newline at end of file diff --git a/.github/workflows/docker-build-deploy.yaml b/.github/workflows/docker-build-deploy.yaml index 177a365..98c8e53 100644 --- a/.github/workflows/docker-build-deploy.yaml +++ b/.github/workflows/docker-build-deploy.yaml @@ -12,6 +12,9 @@ jobs: scan: runs-on: ubuntu-latest steps: + - name: Install presidio-analyzer + run: pip install presidio-analyzer + - name: Checkout repository uses: actions/checkout@v2 @@ -24,15 +27,16 @@ jobs: run: | python -m venv venv source venv/bin/activate + pip install --upgrade pip pip install -r requirements.txt - + - name: Run scan run: | source venv/bin/activate python main.py - - name: Upload scan report - uses: actions/upload-artifact@v2 - with: - name: scan_report - path: reports/scan_report.json \ No newline at end of file + ##- name: Upload scan report + ## uses: actions/upload-artifact@v4 + ## with: + ## name: scan_report + ## path: reports/scan_report.json \ No newline at end of file diff --git a/.github/workflows/ethics-gate.yaml b/.github/workflows/ethics-gate.yaml new file mode 100644 index 0000000..28aee42 --- /dev/null +++ b/.github/workflows/ethics-gate.yaml @@ -0,0 +1,148 @@ +on: + pull_request_target: + types: [opened, reopened, synchronize] + issue_comment: + types: [created] + +permissions: + contents: read # needed for checkout + pull-requests: write # needed for commenting & reviews (gh) when running in pull_request_target + checks: write # needed to create check runs + +jobs: + # Job that posts the questionnaire (runs in the trusted pull_request_target context). 
+ post-questionnaire: + if: github.event_name == 'pull_request_target' && github.event.pull_request.draft == false + runs-on: ubuntu-latest + steps: + - name: Checkout base repo (safe; do NOT checkout PR head here) + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.base.sha }} + fetch-depth: 0 + + - name: Authenticate gh CLI with GITHUB_TOKEN + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token + + - name: Check if questionnaire already answered + id: check + run: | + PR_NUMBER=${{ github.event.pull_request.number }} + # Collect PR comments (robust to empty output) + RESPONSES=$(gh pr view "$PR_NUMBER" --json comments --jq '.comments[].body' 2>/dev/null | grep -i "Ethics & Regulatory Questionnaire" -A 20 || true) + if [[ -z "$RESPONSES" ]]; then + echo "status=missing" >> $GITHUB_OUTPUT + else + echo "status=answered" >> $GITHUB_OUTPUT + fi + + - name: Post questionnaire if missing + if: steps.check.outputs.status == 'missing' + run: | + # ensure file exists in the base repo checkout (case-sensitive) + if [[ ! -f .github/ETHICS_QUESTIONNAIRE.MD ]]; then + echo ".github/ETHICS_QUESTIONNAIRE.MD not found in base repo; aborting." >&2 + exit 1 + fi + gh pr comment ${{ github.event.pull_request.number }} --body-file .github/ETHICS_QUESTIONNAIRE.MD + echo "Posted ethics questionnaire to PR #${{ github.event.pull_request.number }}." + + # Ethics engine: collects comments, runs evaluation, posts a check, and requests changes for HIGH risk. + # This job runs in the trusted context for pull_request_target and also on issue_comment (untrusted). + # For untrusted issue_comment runs, write actions (requesting changes) may be skipped if permissions are restricted. 
+ ethics-engine: + runs-on: ubuntu-latest + needs: post-questionnaire + steps: + - name: Checkout base repo (we run parser from base repo) + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.base.sha || github.ref }} + fetch-depth: 0 + + - name: Authenticate gh CLI with GITHUB_TOKEN + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token + + - name: Determine PR number + id: prnumber + run: | + # Determine PR number whether triggered by pull_request_target or issue_comment + PR_NUMBER=$(jq -r 'if .pull_request then .pull_request.number elif .issue then .issue.number else empty end' "$GITHUB_EVENT_PATH") + if [[ -z "$PR_NUMBER" ]]; then + echo "No PR number found in event payload; exiting." + echo "risk=UNKNOWN" >> $GITHUB_OUTPUT + exit 0 + fi + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + + - name: Collect comments + id: collect + run: | + PR=${{ steps.prnumber.outputs.pr_number }} + # Gather all PR comments into a single string (robust to empty) + ANSWERS=$(gh pr view "$PR" --json comments --jq '[.comments[].body] | join("\n\n")' 2>/dev/null || true) + echo "$ANSWERS" > answers.txt + # Expose the answers (trim to avoid huge output) + echo "answers=$(echo "$ANSWERS" | head -c 32768 | sed -e 's/"/'"'"'"/g')" >> $GITHUB_OUTPUT + + - name: Run ethics parser & evaluator (safe runs code from base repo) + id: run_engine + env: + PR_NUMBER: ${{ steps.prnumber.outputs.pr_number }} + run: | + # Ensure parser exists + if [[ ! -f .github/workflows/redeengine.py ]]; then + echo "Parser .github/workflows/redeengine.py not found in base repo; aborting." 
+ echo "RISK_LEVEL=UNKNOWN" > result.txt + else + python3 .github/workflows/redeengine.py "$(cat answers.txt)" > result.txt || true + fi + cat result.txt + # Extract RISK_LEVEL=XYZ from result.txt if present + RISK=$(grep -m1 '^RISK_LEVEL=' result.txt | cut -d= -f2); RISK="${RISK:-LOW}" + echo "risk=$RISK" >> $GITHUB_OUTPUT + + - name: Create/update "Ethics Review" check run + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const risk = "${{ steps.run_engine.outputs.risk }}".trim(); + const conclusions = { + "LOW": "success", + "MEDIUM": "action_required", + "HIGH": "failure" + }; + const conclusion = conclusions[risk] || "failure"; + const head_sha = context.payload.pull_request?.head?.sha; + await github.rest.checks.create({ + owner: context.repo.owner, + repo: context.repo.repo, + name: "Ethics Review", + head_sha: head_sha || context.sha, + status: "completed", + conclusion, + output: { + title: risk === "LOW" ? "Ethics cleared" : `Ethics review: ${risk}`, + summary: risk === "LOW" ? "Low risk – automatically approved" : `Risk level ${risk} – review required` + } + }); + + - name: Request changes on HIGH risk (trusted-only; skip on untrusted events) + if: steps.run_engine.outputs.risk == 'HIGH' + run: | + PR=${{ steps.prnumber.outputs.pr_number }} + # Only attempt to request changes when running in pull_request_target context (trusted). + if [[ "${GITHUB_EVENT_NAME}" != "pull_request_target" ]]; then + echo "Not in pull_request_target context; skipping request-changes (insufficient permissions for fork PRs)." 
+ exit 0 + fi + # Request changes using gh (GITHUB_TOKEN from pull_request_target has write rights) + gh pr review "$PR" --request-changes -b "@ethics-team Required manual review for high-risk change" + echo "Requested changes on PR #$PR due to HIGH risk." + + - name: Final status message + run: | + echo "Ethics engine completed. Risk level: ${{ steps.run_engine.outputs.risk }}" \ No newline at end of file diff --git a/.github/workflows/redeengine.py b/.github/workflows/redeengine.py new file mode 100644 index 0000000..c9f5be2 --- /dev/null +++ b/.github/workflows/redeengine.py @@ -0,0 +1,38 @@ +import os +import json +import sys + +def evaluate_risk(answers): + risk_score = 0 + flags = [] + + if answers.get("involves_ai", False): + risk_score += 3 + flags.append("AI/ML component") + if answers.get("processes_pii", False): + risk_score += 5 + flags.append("Personal data") + if answers.get("dual_use", False): + risk_score += 10 + flags.append("🚨 Dual-use technology") + if answers.get("safety_critical", False): + risk_score += 8 + flags.append("Safety-critical") + + if risk_score == 0 and "purely documentation" in answers.get("safe_changes", []): + return "LOW", "No ethical concerns detected." 
+ + if risk_score >= 10: + return "HIGH", " | ".join(flags) + elif risk_score >= 5: + return "MEDIUM", " | ".join(flags) + else: + return "LOW", "Minor changes" + +# Parse comment or form submission here (simplified) +# In real use, you'd parse the actual comment body +answers = json.loads(sys.argv[1]) # passed from workflow +level, reason = evaluate_risk(answers) + +print(f"RISK_LEVEL={level}") +print(f"REASON={reason}") \ No newline at end of file diff --git a/README.md b/README.md index 9cf83ba..21540fc 100644 Binary files a/README.md and b/README.md differ diff --git a/asset-scanner/.gitignore b/asset-scanner/.gitignore new file mode 100644 index 0000000..2bbed19 --- /dev/null +++ b/asset-scanner/.gitignore @@ -0,0 +1,20 @@ +# PII & Secrets Scanner - do not commit files +scan_report.json +scan_report.local.json +scan_report.shareable.json +*.local.json +local_scan_*.json +temp_report_*.json + +# ignore any backup or temp reports +*.json.bak +*.json.tmp + +# Optional: ignore the raw findings before enrichment (if you ever dump them) +raw_findings.json +debug_scan.json + +# OS / editor +.DS_Store +Thumbs.db +*.log \ No newline at end of file diff --git a/asset-scanner/patterns.json b/asset-scanner/patterns.json index 3c80dc3..9efd109 100644 --- a/asset-scanner/patterns.json +++ b/asset-scanner/patterns.json @@ -1,102 +1,42 @@ { - "email": { - "pattern": "[a-zA-Z0-9+._%-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,63}", - "risk": "Medium", - "description": "Email address" - }, "aws_access_key": { "pattern": "\\bAKIA[0-9A-Z]{16}\\b", "risk": "High", - "description": "AWS Access Key" + "description": "AWS Access Key ID" }, - "aws_secret_access_key": { + "aws_secret_key": { "pattern": "(?, "file": , "line": , "match": , "description": } +#!/usr/bin/env python3 +""" +scanner.py — Redback Ethics PII & Secrets Scanner (Presidio-powered) + +Features: + • Hybrid detection: Microsoft Presidio (NLP) + custom regex fallback + • High-accuracy detection of names, emails, phones, credit 
cards, addresses + • Keeps full compatibility with: + - patterns.json → only secrets not covered by Presidio needed + - reporter.py → identical findings schema and exit code behavior + +Findings schema (unchanged): + { + "pattern": , + "file": , + "line": , + "match": , + "description": + } Exit code: - - 1 if any High-risk finding (per risk_rules.json via reporter.write_report) - - 0 otherwise + • 1 → if any High-risk finding (via reporter.write_report + risk_rules.json) + • 0 → otherwise + +Now requires: pip install presidio-analyzer """ from __future__ import annotations @@ -18,176 +33,178 @@ import re import sys from bisect import bisect -from typing import Dict, Any, Iterable, List, Tuple +from typing import Dict, Any, Iterable, List import os -# v1/v3 utilities (project-provided) -from file_handler import find_files, read_file +# Presidio +try: + from presidio_analyzer import AnalyzerEngine + from presidio_analyzer import PatternRecognizer, Pattern +except ImportError: + print("[!] 
ERROR: presidio-analyzer not installed") + print(" Run: pip install presidio-analyzer") + sys.exit(1) -# Belle's reporter (Stream 4) +from file_handler import find_files, read_file from reporter import write_report, generate_console_report -# ---- Defaults (align with your repo) ---- +# Defaults DEFAULT_PATTERNS_FILE = "patterns.json" DEFAULT_TARGET_EXTS = [".py", ".txt", ".md", ".cfg", ".json", ".docx", ".csv", ".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"] -DEFAULT_OUT = "scan_report.json" +DEFAULT_OUT = "scan_report.local.json" -# ---- Patterns ---- +# Presidio Engine (auto-download once) +def get_analyzer() -> AnalyzerEngine: + print("[i] Initializing Presidio analyzer (first run downloads ~120 MB model)...") + return AnalyzerEngine() +# Load patterns.json def load_patterns(path: str) -> Dict[str, Dict[str, Any]]: - """ - Load pattern definitions from patterns.json - Expected shape: - { - "email": { "pattern": "...", "risk": "Low|High|...", "description": "..." }, - ... 
- } - """ with open(path, "r", encoding="utf-8") as f: data = json.load(f) - if not isinstance(data, dict): - raise ValueError("patterns.json must be a JSON object mapping ids to rules.") - for pid, rule in data.items(): - if "pattern" not in rule: - raise ValueError(f"Pattern '{pid}' is missing the 'pattern' field.") + print(f"[i] Loaded {len(data)} patterns from {path}") return data -def compile_patterns(patterns: Dict[str, Dict[str, Any]]) -> Dict[str, re.Pattern]: - """Compile all regexes once with DOTALL (to match across lines where needed).""" - compiled: Dict[str, re.Pattern] = {} - for pid, rule in patterns.items(): - pat = rule["pattern"] - try: - compiled[pid] = re.compile(pat, re.DOTALL) - except re.error as e: - raise ValueError(f"Invalid regex for pattern '{pid}': {e}") - return compiled - -# ---- Scanning helpers ---- - -def _newline_indices(text: str) -> List[int]: - return [i for i, ch in enumerate(text) if ch == "\n"] - -def _line_number(newlines: List[int], idx: int) -> int: - # 1-based line numbers: count of newlines before idx + 1 - return bisect(newlines, idx) + 1 - -def scan_text(text: str, file_path: str, - compiled: Dict[str, re.Pattern], - meta: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]: - """ - Run all compiled patterns over a text blob, recording file and line per match. - Returns a list of finding dicts for reporter.py. 
- """ +# Core scanning +def scan_text(text: str, file_path: str, analyzer: AnalyzerEngine, patterns_meta: Dict) -> List[Dict[str, Any]]: findings: List[Dict[str, Any]] = [] - if not text: + if not text.strip(): return findings - newlines = _newline_indices(text) - for pid, regex in compiled.items(): - desc = meta.get(pid, {}).get("description", pid) - for m in regex.finditer(text): - start = m.start() - line = _line_number(newlines, start) - raw = m.group(0) + newlines = [i for i, c in enumerate(text) if c == "\n"] + + # Presidio scan + try: + results = analyzer.analyze(text=text, language="en", score_threshold=0.01) + print(f"[i] Presidio found {len(results)} potential entities in {os.path.basename(file_path)}") + + for r in results: + if r.score < 0.3: + continue + + # Map Presidio entity to pattern ID + entity = r.entity_type.upper() + pattern_id = None + + # Direct match via "presidio_entity" field in patterns.json + for pid, rule in patterns_meta.items(): + if rule.get("presidio_entity", "").upper() == entity: + pattern_id = pid + break + # Fallback: common built-in names + if not pattern_id: + fallback_map = { + "EMAIL_ADDRESS": "email", + "PHONE_NUMBER": "phone", + "CREDIT_CARD": "credit_card", + "US_SSN": "ssn", + "PERSON": "full_name", + "LOCATION": "location", + "IP_ADDRESS": "ip_address" + } + pattern_id = fallback_map.get(entity, entity.lower()) + + line = bisect(newlines, r.start) + 1 + match_text = text[r.start:r.end] + findings.append({ - "pattern": pid, + "pattern": pattern_id, "file": file_path, "line": line, - "match": raw, - "description": desc + "match": match_text, + "description": patterns_meta.get(pattern_id, {}).get("description", f"Detected {entity}") }) + print(f" → Found: {pattern_id} | {match_text} | Line {line}") + + except Exception as e: + print(f"[!] 
Presidio crashed: {e}") + + # regex fallback + for pid, rule in patterns_meta.items(): + pat = rule.get("pattern") + if not pat or pat == "NOT_NEEDED": + continue + try: + for m in re.finditer(pat, text, re.DOTALL): + line = bisect(newlines, m.start()) + 1 + findings.append({ + "pattern": pid, + "file": file_path, + "line": line, + "match": m.group(0), + "description": rule.get("description", pid) + }) + print(f" → Regex hit: {pid} | {m.group(0)} | Line {line}") + except re.error: + pass + return findings -def scan_paths(paths: Iterable[str], - compiled: Dict[str, re.Pattern], - meta: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]: - all_findings: List[Dict[str, Any]] = [] +# file scanner +def scan_paths(paths: Iterable[str], analyzer: AnalyzerEngine, patterns_meta: Dict) -> List[Dict[str, Any]]: + all_findings = [] for path in paths: + print(f"\n[i] Reading: {path}") content = read_file(path) - # Ensure we have text (read_file should return str; if bytes, decode) if isinstance(content, bytes): try: content = content.decode("utf-8") - except UnicodeDecodeError: + except: content = content.decode("latin-1", errors="ignore") - if not isinstance(content, str): - continue - all_findings.extend(scan_text(content, path, compiled, meta)) + if isinstance(content, str) and content.strip(): + print(f" → Extracted {len(content):,} characters") + all_findings.extend(scan_text(content, path, analyzer, patterns_meta)) + else: + print(" → No text extracted (image-only PDF?)") return all_findings -# ---- CLI ---- - -def parse_args(argv: List[str]) -> argparse.Namespace: - ap = argparse.ArgumentParser(description="Unified sensitive-data scanner") - ap.add_argument("--file", help="Single file to scan (overrides --root and --ext)") - ap.add_argument("--root", default=".", help="Root directory to scan (default: current dir)") - ap.add_argument("--patterns", default=DEFAULT_PATTERNS_FILE, help="Path to patterns.json") - ap.add_argument("--out", default=DEFAULT_OUT, help="Path to 
JSON report output") - ap.add_argument("--ext", nargs="*", default=DEFAULT_TARGET_EXTS, - help="File extensions to include (e.g., .py .txt .md .cfg .json)") - ap.add_argument("--no-console", action="store_true", help="Skip console summary output") - return ap.parse_args(argv) +# CLI & main +def parse_args(argv=None): + # parse_args function + ap = argparse.ArgumentParser(description="Sensitive data scanner") + ap.add_argument("--file", help="Single file to scan") + ap.add_argument("--root", default=".", help="Root directory") + ap.add_argument("--patterns", default=DEFAULT_PATTERNS_FILE) + ap.add_argument("--out", default=DEFAULT_OUT) + ap.add_argument("--ext", nargs="*", default=DEFAULT_TARGET_EXTS) + ap.add_argument("--no-console", action="store_true") + return ap.parse_args(argv or sys.argv[1:]) -# Function to get a valid directory path from the user def get_valid_path(): while True: - path = input("Enter the directory path containing the files to scan (press Enter to use the project folder): ").strip() - path = path.strip('"').strip("'") # Remove surrounding quotes if present - if not path: # If no input is provided, use the current directory - print("No path provided. Files will be scanned in the project folder.") - print("-" * 63) + path = input("Enter directory (or Enter for current): ").strip().strip('"\'') + if not path: return os.getcwd() - elif os.path.isdir(path): # Validate the provided path - print("-" * 63) + if os.path.isdir(path): return path - - else: - print("We cannot find that path. 
Please enter a valid directory or press Enter to use the project folder.") - -# ---- Main ---- - -def main(argv: List[str] | None = None) -> int: - ns = parse_args(argv or sys.argv[1:]) + print("Invalid path, try again.") - patterns = load_patterns(ns.patterns) - compiled = compile_patterns(patterns) +def main(): + ns = parse_args() + patterns_meta = load_patterns(ns.patterns) + analyzer = get_analyzer() - # Check if a specific file is provided if ns.file: - # Validate the file path - if not os.path.isfile(ns.file): - print(f"[!] The specified file does not exist: {ns.file}") - return 1 - - # Scan only the specified file - print(f"[i] Scanning the specified file: {ns.file}") - findings = scan_paths([ns.file], compiled, patterns) + paths = [ns.file] + print(f"[i] Scanning single file: {ns.file}") else: - # Identify valid directory to scan directory = get_valid_path() + paths = list(find_files(directory, ns.ext)) + print(f"[i] Found {len(paths)} files to scan in {directory}") - # Use project helper to expand files under root with extension filter - file_list = list(find_files(directory, ns.ext)) - findings = scan_paths(file_list, compiled, patterns) + findings = scan_paths(paths, analyzer, patterns_meta) - # JSON report (enriched with risk/tip/laws by reporter.write_report) enriched = write_report(findings, out_path=ns.out) + print(f"\n[i] Full report (with paths & raw PII) saved locally → {ns.out}") + print(" This file is git-ignored and must NEVER be committed.") - # Console summary (masked) - if not ns.no_console: - generate_console_report(findings) - - # Exit code policy: fail if any High risk present - has_high = any(f.get("risk") == "High" for f in enriched) - if has_high: - print("[!] High-risk data found. Failing scan.") + if any(f.get("risk") == "High" for f in enriched): + print("\n[!] HIGH-RISK PII DETECTED → SCAN FAILED") return 1 - - if enriched: - print("[i] Findings present. 
Review the report.") + elif findings: + print(f"\n[i] {len(findings)} findings → check {ns.out}") else: - print("[✓] No sensitive data detected.") + print("\n[Success] NO PII FOUND!") return 0 if __name__ == "__main__": diff --git a/main.py b/main.py new file mode 100644 index 0000000..3ca2e89 --- /dev/null +++ b/main.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import os +print("Redback Ethics scanner – environment is ready!") +print("Installed packages test:") + +try: + import spacy + print(f"spaCy {spacy.__version__} OK") +except Exception as e: + print(f"spaCy failed: {e}") + +try: + from presidio_analyzer import AnalyzerEngine + print("Presidio Analyzer OK") +except Exception as e: + print(f"Presidio failed: {e}") + +try: + import cv2 + print(f"OpenCV {cv2.__version__} OK") +except Exception as e: + print(f"OpenCV failed: {e}") + +print("\nNext step: actual scanner code needs to be added in scanner-bot/ or asset-scanner/") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 15227e1..64e62f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,12 @@ +presidio-analyzer +presidio-anonymizer +spacy==3.8.0 defusedxml==0.7.1 Faker==37.1.0 fonttools==4.57.0 fpdf2==2.8.3 lxml==5.4.0 -numpy==2.0.2 +numpy opencv-python==4.12.0.88 packaging==25.0 pandas==2.2.3 @@ -14,4 +17,4 @@ python-docx==1.1.2 pytz==2025.2 six==1.17.0 typing_extensions==4.13.2 -tzdata==2025.2 \ No newline at end of file +tzdata==2025.2