From 41cbd577bcc3b2c3f4b18787f5f412c567d9e0aa Mon Sep 17 00:00:00 2001 From: RamGcia Date: Wed, 19 Nov 2025 12:26:58 +1100 Subject: [PATCH 01/19] 1. Changed reliance on regex patterns to presidio 2. Found that previous iterations of scanner had hardcoded scanned information including PII and file directory in report.json, have now changed it so that it uploads locally and gitignores report.json file when .commit. --- asset-scanner/patterns.json | 86 ++-------- asset-scanner/requirements.txt | 2 + asset-scanner/scan_report.json | 94 ----------- asset-scanner/scanner.py | 279 +++++++++++++++++---------------- 4 files changed, 163 insertions(+), 298 deletions(-) diff --git a/asset-scanner/patterns.json b/asset-scanner/patterns.json index 3c80dc3..9efd109 100644 --- a/asset-scanner/patterns.json +++ b/asset-scanner/patterns.json @@ -1,102 +1,42 @@ { - "email": { - "pattern": "[a-zA-Z0-9+._%-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,63}", - "risk": "Medium", - "description": "Email address" - }, "aws_access_key": { "pattern": "\\bAKIA[0-9A-Z]{16}\\b", "risk": "High", - "description": "AWS Access Key" + "description": "AWS Access Key ID" }, - "aws_secret_access_key": { + "aws_secret_key": { "pattern": "(?, "file": , "line": , "match": , "description": } +#!/usr/bin/env python3 +""" +scanner.py — Redback Ethics PII & Secrets Scanner (Presidio-powered) + +Features: + • Hybrid detection: Microsoft Presidio (NLP) + custom regex fallback + • High-accuracy detection of names, emails, phones, credit cards, addresses + • Keeps full compatibility with: + - patterns.json → only secrets not covered by Presidio needed + - reporter.py → identical findings schema and exit code behavior + +Findings schema (unchanged): + { + "pattern": , + "file": , + "line": , + "match": , + "description": + } Exit code: - - 1 if any High-risk finding (per risk_rules.json via reporter.write_report) - - 0 otherwise + • 1 → if any High-risk finding (via reporter.write_report + risk_rules.json) + • 0 → otherwise + 
+Now requires: pip install presidio-analyzer """ from __future__ import annotations @@ -18,176 +33,178 @@ import re import sys from bisect import bisect -from typing import Dict, Any, Iterable, List, Tuple +from typing import Dict, Any, Iterable, List import os -# v1/v3 utilities (project-provided) -from file_handler import find_files, read_file +# Presidio +try: + from presidio_analyzer import AnalyzerEngine + from presidio_analyzer import PatternRecognizer, Pattern +except ImportError: + print("[!] ERROR: presidio-analyzer not installed") + print(" Run: pip install presidio-analyzer") + sys.exit(1) -# Belle's reporter (Stream 4) +from file_handler import find_files, read_file from reporter import write_report, generate_console_report -# ---- Defaults (align with your repo) ---- +# Defaults DEFAULT_PATTERNS_FILE = "patterns.json" DEFAULT_TARGET_EXTS = [".py", ".txt", ".md", ".cfg", ".json", ".docx", ".csv", ".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"] -DEFAULT_OUT = "scan_report.json" +DEFAULT_OUT = "scan_report.local.json" -# ---- Patterns ---- +# Presidio Engine (auto-download once) +def get_analyzer() -> AnalyzerEngine: + print("[i] Initializing Presidio analyzer (first run downloads ~120 MB model)...") + return AnalyzerEngine() +# Load patterns.json def load_patterns(path: str) -> Dict[str, Dict[str, Any]]: - """ - Load pattern definitions from patterns.json - Expected shape: - { - "email": { "pattern": "...", "risk": "Low|High|...", "description": "..." }, - ... 
- } - """ with open(path, "r", encoding="utf-8") as f: data = json.load(f) - if not isinstance(data, dict): - raise ValueError("patterns.json must be a JSON object mapping ids to rules.") - for pid, rule in data.items(): - if "pattern" not in rule: - raise ValueError(f"Pattern '{pid}' is missing the 'pattern' field.") + print(f"[i] Loaded {len(data)} patterns from {path}") return data -def compile_patterns(patterns: Dict[str, Dict[str, Any]]) -> Dict[str, re.Pattern]: - """Compile all regexes once with DOTALL (to match across lines where needed).""" - compiled: Dict[str, re.Pattern] = {} - for pid, rule in patterns.items(): - pat = rule["pattern"] - try: - compiled[pid] = re.compile(pat, re.DOTALL) - except re.error as e: - raise ValueError(f"Invalid regex for pattern '{pid}': {e}") - return compiled - -# ---- Scanning helpers ---- - -def _newline_indices(text: str) -> List[int]: - return [i for i, ch in enumerate(text) if ch == "\n"] - -def _line_number(newlines: List[int], idx: int) -> int: - # 1-based line numbers: count of newlines before idx + 1 - return bisect(newlines, idx) + 1 - -def scan_text(text: str, file_path: str, - compiled: Dict[str, re.Pattern], - meta: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]: - """ - Run all compiled patterns over a text blob, recording file and line per match. - Returns a list of finding dicts for reporter.py. 
- """ +# Core scanning +def scan_text(text: str, file_path: str, analyzer: AnalyzerEngine, patterns_meta: Dict) -> List[Dict[str, Any]]: findings: List[Dict[str, Any]] = [] - if not text: + if not text.strip(): return findings - newlines = _newline_indices(text) - for pid, regex in compiled.items(): - desc = meta.get(pid, {}).get("description", pid) - for m in regex.finditer(text): - start = m.start() - line = _line_number(newlines, start) - raw = m.group(0) + newlines = [i for i, c in enumerate(text) if c == "\n"] + + # Presidio scan + try: + results = analyzer.analyze(text=text, language="en", score_threshold=0.01) + print(f"[i] Presidio found {len(results)} potential entities in {os.path.basename(file_path)}") + + for r in results: + if r.score < 0.3: + continue + + # Map Presidio entity to pattern ID + entity = r.entity_type.upper() + pattern_id = None + + # Direct match via "presidio_entity" field in patterns.json + for pid, rule in patterns_meta.items(): + if rule.get("presidio_entity", "").upper() == entity: + pattern_id = pid + break + # Fallback: common built-in names + if not pattern_id: + fallback_map = { + "EMAIL_ADDRESS": "email", + "PHONE_NUMBER": "phone", + "CREDIT_CARD": "credit_card", + "US_SSN": "ssn", + "PERSON": "full_name", + "LOCATION": "location", + "IP_ADDRESS": "ip_address" + } + pattern_id = fallback_map.get(entity, entity.lower()) + + line = bisect(newlines, r.start) + 1 + match_text = text[r.start:r.end] + findings.append({ - "pattern": pid, + "pattern": pattern_id, "file": file_path, "line": line, - "match": raw, - "description": desc + "match": match_text, + "description": patterns_meta.get(pattern_id, {}).get("description", f"Detected {entity}") }) + print(f" → Found: {pattern_id} | {match_text} | Line {line}") + + except Exception as e: + print(f"[!] 
Presidio crashed: {e}") + + # regex fallback + for pid, rule in patterns_meta.items(): + pat = rule.get("pattern") + if not pat or pat == "NOT_NEEDED": + continue + try: + for m in re.finditer(pat, text, re.DOTALL): + line = bisect(newlines, m.start()) + 1 + findings.append({ + "pattern": pid, + "file": file_path, + "line": line, + "match": m.group(0), + "description": rule.get("description", pid) + }) + print(f" → Regex hit: {pid} | {m.group(0)} | Line {line}") + except re.error: + pass + return findings -def scan_paths(paths: Iterable[str], - compiled: Dict[str, re.Pattern], - meta: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]: - all_findings: List[Dict[str, Any]] = [] +# file scanner +def scan_paths(paths: Iterable[str], analyzer: AnalyzerEngine, patterns_meta: Dict) -> List[Dict[str, Any]]: + all_findings = [] for path in paths: + print(f"\n[i] Reading: {path}") content = read_file(path) - # Ensure we have text (read_file should return str; if bytes, decode) if isinstance(content, bytes): try: content = content.decode("utf-8") - except UnicodeDecodeError: + except: content = content.decode("latin-1", errors="ignore") - if not isinstance(content, str): - continue - all_findings.extend(scan_text(content, path, compiled, meta)) + if isinstance(content, str) and content.strip(): + print(f" → Extracted {len(content):,} characters") + all_findings.extend(scan_text(content, path, analyzer, patterns_meta)) + else: + print(" → No text extracted (image-only PDF?)") return all_findings -# ---- CLI ---- - -def parse_args(argv: List[str]) -> argparse.Namespace: - ap = argparse.ArgumentParser(description="Unified sensitive-data scanner") - ap.add_argument("--file", help="Single file to scan (overrides --root and --ext)") - ap.add_argument("--root", default=".", help="Root directory to scan (default: current dir)") - ap.add_argument("--patterns", default=DEFAULT_PATTERNS_FILE, help="Path to patterns.json") - ap.add_argument("--out", default=DEFAULT_OUT, help="Path to 
JSON report output") - ap.add_argument("--ext", nargs="*", default=DEFAULT_TARGET_EXTS, - help="File extensions to include (e.g., .py .txt .md .cfg .json)") - ap.add_argument("--no-console", action="store_true", help="Skip console summary output") - return ap.parse_args(argv) +# CLI & main +def parse_args(argv=None): + # parse_args function + ap = argparse.ArgumentParser(description="Sensitive data scanner") + ap.add_argument("--file", help="Single file to scan") + ap.add_argument("--root", default=".", help="Root directory") + ap.add_argument("--patterns", default=DEFAULT_PATTERNS_FILE) + ap.add_argument("--out", default=DEFAULT_OUT) + ap.add_argument("--ext", nargs="*", default=DEFAULT_TARGET_EXTS) + ap.add_argument("--no-console", action="store_true") + return ap.parse_args(argv or sys.argv[1:]) -# Function to get a valid directory path from the user def get_valid_path(): while True: - path = input("Enter the directory path containing the files to scan (press Enter to use the project folder): ").strip() - path = path.strip('"').strip("'") # Remove surrounding quotes if present - if not path: # If no input is provided, use the current directory - print("No path provided. Files will be scanned in the project folder.") - print("-" * 63) + path = input("Enter directory (or Enter for current): ").strip().strip('"\'') + if not path: return os.getcwd() - elif os.path.isdir(path): # Validate the provided path - print("-" * 63) + if os.path.isdir(path): return path - - else: - print("We cannot find that path. 
Please enter a valid directory or press Enter to use the project folder.") - -# ---- Main ---- - -def main(argv: List[str] | None = None) -> int: - ns = parse_args(argv or sys.argv[1:]) + print("Invalid path, try again.") - patterns = load_patterns(ns.patterns) - compiled = compile_patterns(patterns) +def main(): + ns = parse_args() + patterns_meta = load_patterns(ns.patterns) + analyzer = get_analyzer() - # Check if a specific file is provided if ns.file: - # Validate the file path - if not os.path.isfile(ns.file): - print(f"[!] The specified file does not exist: {ns.file}") - return 1 - - # Scan only the specified file - print(f"[i] Scanning the specified file: {ns.file}") - findings = scan_paths([ns.file], compiled, patterns) + paths = [ns.file] + print(f"[i] Scanning single file: {ns.file}") else: - # Identify valid directory to scan directory = get_valid_path() + paths = list(find_files(directory, ns.ext)) + print(f"[i] Found {len(paths)} files to scan in {directory}") - # Use project helper to expand files under root with extension filter - file_list = list(find_files(directory, ns.ext)) - findings = scan_paths(file_list, compiled, patterns) + findings = scan_paths(paths, analyzer, patterns_meta) - # JSON report (enriched with risk/tip/laws by reporter.write_report) enriched = write_report(findings, out_path=ns.out) + print(f"\n[i] Full report (with paths & raw PII) saved locally → {ns.out}") + print(" This file is git-ignored and must NEVER be committed.") - # Console summary (masked) - if not ns.no_console: - generate_console_report(findings) - - # Exit code policy: fail if any High risk present - has_high = any(f.get("risk") == "High" for f in enriched) - if has_high: - print("[!] High-risk data found. Failing scan.") + if any(f.get("risk") == "High" for f in enriched): + print("\n[!] HIGH-RISK PII DETECTED → SCAN FAILED") return 1 - - if enriched: - print("[i] Findings present. 
Review the report.") + elif findings: + print(f"\n[i] {len(findings)} findings → check {ns.out}") else: - print("[✓] No sensitive data detected.") + print("\n[Success] NO PII FOUND!") return 0 if __name__ == "__main__": From a8612cc3c7da86f9b2ec9e9cecc7d3686247d257 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Wed, 19 Nov 2025 12:27:39 +1100 Subject: [PATCH 02/19] Adding .gitignore file for avoiding hardcoded info --- asset-scanner/.gitignore | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 asset-scanner/.gitignore diff --git a/asset-scanner/.gitignore b/asset-scanner/.gitignore new file mode 100644 index 0000000..2bbed19 --- /dev/null +++ b/asset-scanner/.gitignore @@ -0,0 +1,20 @@ +# PII & Secrets Scanner - do not commit files +scan_report.json +scan_report.local.json +scan_report.shareable.json +*.local.json +local_scan_*.json +temp_report_*.json + +# ignore any backup or temp reports +*.json.bak +*.json.tmp + +# Optional: ignore the raw findings before enrichment (if you ever dump them) +raw_findings.json +debug_scan.json + +# OS / editor +.DS_Store +Thumbs.db +*.log \ No newline at end of file From a3074b8efef0e19a3a77493b451d0f4aaf9b4de6 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Thu, 27 Nov 2025 11:15:13 +1100 Subject: [PATCH 03/19] Removing upload scan report workflow, also rectifying scan fail in github --- .github/workflows/docker-build-deploy.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/docker-build-deploy.yaml b/.github/workflows/docker-build-deploy.yaml index 177a365..67b162f 100644 --- a/.github/workflows/docker-build-deploy.yaml +++ b/.github/workflows/docker-build-deploy.yaml @@ -29,10 +29,4 @@ jobs: - name: Run scan run: | source venv/bin/activate - python main.py - - - name: Upload scan report - uses: actions/upload-artifact@v2 - with: - name: scan_report - path: reports/scan_report.json \ No newline at end of file + python main.py \ No newline at end of file From 
0ee6373b4d57c9b3f34db479081f0be31ebd63c6 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Thu, 27 Nov 2025 11:22:57 +1100 Subject: [PATCH 04/19] rerolling --- .github/workflows/docker-build-deploy.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-build-deploy.yaml b/.github/workflows/docker-build-deploy.yaml index 67b162f..e37b8fb 100644 --- a/.github/workflows/docker-build-deploy.yaml +++ b/.github/workflows/docker-build-deploy.yaml @@ -29,4 +29,10 @@ jobs: - name: Run scan run: | source venv/bin/activate - python main.py \ No newline at end of file + python main.py + + - name: Upload scan report + uses: actions/upload-artifact@v4 + with: + name: scan_report + path: reports/scan_report.json \ No newline at end of file From e1032d83a124c51c1236c7c387f7194587bb3e74 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Thu, 27 Nov 2025 11:41:49 +1100 Subject: [PATCH 05/19] fixing directory pointed to main.py --- .github/workflows/docker-build-deploy.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docker-build-deploy.yaml b/.github/workflows/docker-build-deploy.yaml index e37b8fb..ddcf1f1 100644 --- a/.github/workflows/docker-build-deploy.yaml +++ b/.github/workflows/docker-build-deploy.yaml @@ -25,14 +25,15 @@ jobs: python -m venv venv source venv/bin/activate pip install -r requirements.txt - + - name: Run scan run: | source venv/bin/activate - python main.py + python asset-scanner/scanner.py --root . 
--out $RUNNER_TEMP/scan_report.json --no-console --ext .py .md .txt - - name: Upload scan report - uses: actions/upload-artifact@v4 - with: - name: scan_report - path: reports/scan_report.json \ No newline at end of file + + ##- name: Upload scan report + ## uses: actions/upload-artifact@v4 + ## with: + ## name: scan_report + ## path: reports/scan_report.json \ No newline at end of file From aaabfd695f3475ee61a5feda7a8aef56093f8715 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Thu, 27 Nov 2025 11:44:14 +1100 Subject: [PATCH 06/19] rectifying workflow issue via dependency issue (presidio) --- .github/workflows/docker-build-deploy.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/docker-build-deploy.yaml b/.github/workflows/docker-build-deploy.yaml index ddcf1f1..86094a1 100644 --- a/.github/workflows/docker-build-deploy.yaml +++ b/.github/workflows/docker-build-deploy.yaml @@ -12,6 +12,9 @@ jobs: scan: runs-on: ubuntu-latest steps: + - name: Install presidio-analyzer + run: pip install presidio-analyzer + - name: Checkout repository uses: actions/checkout@v2 From 4d2259a7fd2bc29330d8ad5308a7685e6d8c7cbd Mon Sep 17 00:00:00 2001 From: RamGcia Date: Thu, 27 Nov 2025 11:53:16 +1100 Subject: [PATCH 07/19] for some reason requirements.txt did not save locally? 
--- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 15227e1..c2f1b48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,6 @@ python-docx==1.1.2 pytz==2025.2 six==1.17.0 typing_extensions==4.13.2 -tzdata==2025.2 \ No newline at end of file +tzdata==2025.2 +presidio-analyzer +presidio-anonymizer \ No newline at end of file From 45945ecb6976c0c3043799dd4eff08ecb2e913be Mon Sep 17 00:00:00 2001 From: RamGcia Date: Thu, 27 Nov 2025 12:07:10 +1100 Subject: [PATCH 08/19] test --- .github/workflows/docker-build-deploy.yaml | 3 +-- README.md | Bin 1612 -> 1699 bytes requirements.txt | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker-build-deploy.yaml b/.github/workflows/docker-build-deploy.yaml index 86094a1..c2c3413 100644 --- a/.github/workflows/docker-build-deploy.yaml +++ b/.github/workflows/docker-build-deploy.yaml @@ -32,8 +32,7 @@ jobs: - name: Run scan run: | source venv/bin/activate - python asset-scanner/scanner.py --root . 
--out $RUNNER_TEMP/scan_report.json --no-console --ext .py .md .txt - + python main.py ##- name: Upload scan report ## uses: actions/upload-artifact@v4 diff --git a/README.md b/README.md index 9cf83bae8b65ffbd07b4c6783a5f91e6a191caac..21540fc89e424092031e4cdb9526b5b89a80a794 100644 GIT binary patch delta 349 zcmX@ZvzS*xS67#piPSjbOG+~H(iMVB z5{pVwQ-JOP+85&F0#d;{c`Az*D};5BMN|SP1@axxZ^{auz5)J0A&!0_ii*6GU$gWA z4X$9-7Dp%ndfz8AIW@01739b5tVtXogFpsMmSfX3;N=Qu$Y&^G$Yw}m$N{p-859^w Z7*ZLEfh34k#E{954uq*d7B2%A0|1ExOkn^3 delta 251 zcmZ3?dxl3;S67#dOIcZ2K}*ZOEVZaCGqqexYoml5qYzBkEj6*Ev?#S$OKY+>W8CCb zjML+lCeLKj7vs`Z2uZCdQAo?oNi9~;NYE>(C`r(q{D4VP94Hl@UzCE7O36>I z0Llq5D^E6KR-YWlY{n0>!#zJS2V}uAX7|bOm}4e;v)BPG=wXrI09oMV!o@Xt1B(_D zkh;$zD#`_NG| Date: Thu, 27 Nov 2025 12:24:42 +1100 Subject: [PATCH 09/19] test 2 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 27d7470..3d7eeaa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ presidio-analyzer presidio-anonymizer +spacy==3.6.1 defusedxml==0.7.1 Faker==37.1.0 fonttools==4.57.0 From 2df757e67ffee77d2775a48d2fcbdc5406e6ed91 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Thu, 27 Nov 2025 12:31:04 +1100 Subject: [PATCH 10/19] added main.py --- main.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 0000000..3ca2e89 --- /dev/null +++ b/main.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import os +print("Redback Ethics scanner – environment is ready!") +print("Installed packages test:") + +try: + import spacy + print(f"spaCy {spacy.__version__} OK") +except Exception as e: + print(f"spaCy failed: {e}") + +try: + from presidio_analyzer import AnalyzerEngine + print("Presidio Analyzer OK") +except Exception as e: + print(f"Presidio failed: {e}") + +try: + import cv2 + print(f"OpenCV {cv2.__version__} OK") +except Exception as e: + print(f"OpenCV failed: {e}") + 
+print("\nNext step: actual scanner code needs to be added in scanner-bot/ or asset-scanner/") \ No newline at end of file From 54985af1899bf927fb65a16d9ca6a163c9137edf Mon Sep 17 00:00:00 2001 From: RamGcia Date: Mon, 1 Dec 2025 20:57:42 +1100 Subject: [PATCH 11/19] fixed spacy requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3d7eeaa..5081f01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ presidio-analyzer presidio-anonymizer -spacy==3.6.1 +spacy==3.8.2 defusedxml==0.7.1 Faker==37.1.0 fonttools==4.57.0 From 94ae455350e3821a4fe8726248b03b674a647550 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Mon, 1 Dec 2025 21:03:14 +1100 Subject: [PATCH 12/19] fixed hangup (hopefully) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5081f01..78c88ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ presidio-analyzer presidio-anonymizer -spacy==3.8.2 +spacy==3.8.0 defusedxml==0.7.1 Faker==37.1.0 fonttools==4.57.0 fpdf2==2.8.3 lxml==5.4.0 -numpy==2.0.2 +numpy==2.0.0 opencv-python==4.12.0.88 packaging==25.0 pandas==2.2.3 From c034b97074489ea8b2b1fc874f6b3ea33a9253d9 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Mon, 1 Dec 2025 21:05:47 +1100 Subject: [PATCH 13/19] fixed numpy --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 78c88ef..64e62f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ Faker==37.1.0 fonttools==4.57.0 fpdf2==2.8.3 lxml==5.4.0 -numpy==2.0.0 +numpy opencv-python==4.12.0.88 packaging==25.0 pandas==2.2.3 From 4491d92a552966811ecd7bda6f07e1cdc8eb65e8 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Mon, 1 Dec 2025 21:33:50 +1100 Subject: [PATCH 14/19] fixing dependencies --- .github/workflows/docker-build-deploy.yaml | 1 + requirements.txt | 2 +- 2 files 
changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-build-deploy.yaml b/.github/workflows/docker-build-deploy.yaml index c2c3413..98c8e53 100644 --- a/.github/workflows/docker-build-deploy.yaml +++ b/.github/workflows/docker-build-deploy.yaml @@ -27,6 +27,7 @@ jobs: run: | python -m venv venv source venv/bin/activate + pip install --upgrade pip pip install -r requirements.txt - name: Run scan diff --git a/requirements.txt b/requirements.txt index 64e62f3..03d0fa9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ Faker==37.1.0 fonttools==4.57.0 fpdf2==2.8.3 lxml==5.4.0 -numpy +numpy==1.26.4 opencv-python==4.12.0.88 packaging==25.0 pandas==2.2.3 From 3b9f9fa5bc98c535fbca6c244c3a006e77846662 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Mon, 1 Dec 2025 21:35:38 +1100 Subject: [PATCH 15/19] dependencies --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 03d0fa9..64e62f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ Faker==37.1.0 fonttools==4.57.0 fpdf2==2.8.3 lxml==5.4.0 -numpy==1.26.4 +numpy opencv-python==4.12.0.88 packaging==25.0 pandas==2.2.3 From 63c414bd88b5e13b82aeb2ee07d96a7e8a8002d7 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Tue, 2 Dec 2025 14:21:34 +1100 Subject: [PATCH 16/19] adding new files for REDE --- .github/ETHICS_QUESTIONNAIRE.MD | 36 +++++++++ .github/issue_template/questionnaire.yaml | 23 ++++++ .github/workflows/ethics-gate.yaml | 99 +++++++++++++++++++++++ .github/workflows/redeengine.py | 38 +++++++++ 4 files changed, 196 insertions(+) create mode 100644 .github/ETHICS_QUESTIONNAIRE.MD create mode 100644 .github/issue_template/questionnaire.yaml create mode 100644 .github/workflows/ethics-gate.yaml create mode 100644 .github/workflows/redeengine.py diff --git a/.github/ETHICS_QUESTIONNAIRE.MD b/.github/ETHICS_QUESTIONNAIRE.MD new file mode 100644 index 0000000..57223a3 --- /dev/null +++ 
b/.github/ETHICS_QUESTIONNAIRE.MD @@ -0,0 +1,36 @@ +**Ethics & Regulatory Questionnaire** +*This PR cannot be merged until this form is completed.* + +Please reply to this comment and answer all questions below (you can copy-paste and fill it). + +1. Does this change involve any of the following? (check all that apply) + - [ ] Training or fine-tuning of AI/ML models + - [ ] Inference/serving of AI/ML models in production + - [ ] Processing of personal data (PII, health, biometric, financial, children’s data, etc.) + - [ ] Dual-use or military-applicable technology + - [ ] Safety-critical systems (medical device, aviation, automotive, etc.) + - [ ] High-impact algorithmic decision-making (credit, hiring, criminal justice, etc.) + - [ ] None of the above (pure docs, tests, CI, formatting, etc.) + +2. Estimated risk level (your honest assessment) + - [ ] Low – no ethical or regulatory impact + - [ ] Medium – possible fairness/privacy concerns + - [ ] High – potential for serious harm or legal non-compliance + +3. Brief description of any ethical/regulatory impact (or write “None”) + + > + +4. Relevant regulations / standards considered (e.g., EU AI Act, GDPR, HIPAA, NIST AI RMF, export controls, etc.) + List them or write “N/A” + + > + +5. Have mitigation measures been implemented (bias testing, data minimization, consent flows, etc.)? + - [ ] Yes → describe below + - [ ] No + - [ ] Not applicable + + > + +Thank you! The ethics gate will evaluate your answers automatically. 
\ No newline at end of file diff --git a/.github/issue_template/questionnaire.yaml b/.github/issue_template/questionnaire.yaml new file mode 100644 index 0000000..b770075 --- /dev/null +++ b/.github/issue_template/questionnaire.yaml @@ -0,0 +1,23 @@ +name: Ethics & Regulatory Questionnaire +description: Required for all PRs with potential ethical/regulatory impact +title: "[Ethics Review] " +body: + - type: checkboxes + attributes: + label: Scope of Change + options: + - label: Involves training or inference of AI/ML models + - label: Processes personal data (PII, health, financial, etc.) + - label: Dual-use potential (could be used in weapons/autonomous systems) + - label: Affects safety-critical systems + - label: Purely documentation / tests / CI changes (safe) + + - type: textarea + attributes: + label: Description of ethical/regulatory impact (if any) + placeholder: Explain who might be harmed, fairness implications, compliance requirements, etc. + + - type: dropdown + attributes: + label: Have you consulted the relevant regulatory framework? 
+ options: ["Yes", "No", "Not applicable"] \ No newline at end of file diff --git a/.github/workflows/ethics-gate.yaml b/.github/workflows/ethics-gate.yaml new file mode 100644 index 0000000..640ae43 --- /dev/null +++ b/.github/workflows/ethics-gate.yaml @@ -0,0 +1,99 @@ +on: + pull_request: + types: [opened, reopened, synchronize] + pull_request_review_comment: + types: [created] + +permissions: + contents: read # needed for checkout + pull-requests: write # needed for commenting & reviews (gh) + checks: write # needed for check runs + +jobs: + require-ethics-questionnaire: + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest + outputs: + ethics_status: ${{ steps.check.outputs.status }} + steps: + - name: Checkout PR head (so .github/ files exist) + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Check if questionnaire already answered + id: check + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Look for bot comment or issue form submission (be robust to empty output) + RESPONSES=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[].body' 2>/dev/null | grep -i "Ethics & Regulatory Questionnaire" -A 20 || true) + if [[ -z "$RESPONSES" ]]; then + echo "status=missing" >> $GITHUB_OUTPUT + else + echo "status=answered" >> $GITHUB_OUTPUT + fi + + - name: Post questionnaire if missing + if: steps.check.outputs.status == 'missing' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr comment ${{ github.event.pull_request.number }} --body-file .github/ETHICS_QUESTIONNAIRE.md + echo "Please fill out the ethical/regulatory questionnaire above before this PR can be merged." 
+ + ethics-engine: + needs: require-ethics-questionnaire + if: > + needs.require-ethics-questionnaire.outputs.ethics_status == 'answered' || + github.event_name == 'pull_request_review_comment' + runs-on: ubuntu-latest + steps: + - name: Checkout PR head (engine needs code + workflow files) + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Parse latest ethics comment & run engine + id: engine + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Collect comments (be robust if there are none) + ANSWERS=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '[.comments[].body] | join("\n\n")' 2>/dev/null || true) + python .github/workflows/parse_and_evaluate.py "$ANSWERS" > result.txt || true + cat result.txt + RISK=$(grep RISK_LEVEL result.txt | cut -d= -f2 || echo "LOW") + echo "risk=$RISK" >> $GITHUB_OUTPUT + + - uses: actions/github-script@v7 + with: + script: | + const risk = "${{ steps.engine.outputs.risk }}".trim(); + const conclusions = { + "LOW": "success", + "MEDIUM": "action_required", + "HIGH": "failure" + }; + const conclusion = conclusions[risk] || "failure"; + + await github.rest.checks.create({ + owner: context.repo.owner, + repo: context.repo.repo, + name: "Ethics Review", + head_sha: (context.payload.pull_request && context.payload.pull_request.head && context.payload.pull_request.head.sha) || github.event.pull_request.head.sha, + status: "completed", + conclusion, + output: { + title: risk === "LOW" ? "Ethics cleared" : `Ethics review: ${risk}`, + summary: risk === "LOW" ? 
"Low risk – automatically approved" : `Risk level ${risk} – review required` + } + }); + + - name: Block merge on HIGH risk + if: steps.engine.outputs.risk == 'HIGH' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr review ${{ github.event.pull_request.number }} \ + --request-changes \ + -b "@ethics-team Required manual review for high-risk change" \ No newline at end of file diff --git a/.github/workflows/redeengine.py b/.github/workflows/redeengine.py new file mode 100644 index 0000000..c9f5be2 --- /dev/null +++ b/.github/workflows/redeengine.py @@ -0,0 +1,38 @@ +import os +import json +import sys + +def evaluate_risk(answers): + risk_score = 0 + flags = [] + + if answers.get("involves_ai", False): + risk_score += 3 + flags.append("AI/ML component") + if answers.get("processes_pii", False): + risk_score += 5 + flags.append("Personal data") + if answers.get("dual_use", False): + risk_score += 10 + flags.append("🚨 Dual-use technology") + if answers.get("safety_critical", False): + risk_score += 8 + flags.append("Safety-critical") + + if "purely documentation" in answers.get("safe_changes", []): + return "LOW", "No ethical concerns detected." 
+ + if risk_score >= 10: + return "HIGH", " | ".join(flags) + elif risk_score >= 5: + return "MEDIUM", " | ".join(flags) + else: + return "LOW", "Minor changes" + +# Parse comment or form submission here (simplified) +# In real use, you'd parse the actual comment body +answers = json.loads(sys.argv[1]) # passed from workflow +level, reason = evaluate_risk(answers) + +print(f"RISK_LEVEL={level}") +print(f"REASON={reason}") \ No newline at end of file From 424812d05601f1930de402d14987aa630bb782b1 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Tue, 2 Dec 2025 14:28:06 +1100 Subject: [PATCH 17/19] fixing again --- .github/workflows/ethics-gate.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ethics-gate.yaml b/.github/workflows/ethics-gate.yaml index 640ae43..31c4812 100644 --- a/.github/workflows/ethics-gate.yaml +++ b/.github/workflows/ethics-gate.yaml @@ -38,7 +38,7 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - gh pr comment ${{ github.event.pull_request.number }} --body-file .github/ETHICS_QUESTIONNAIRE.md + gh pr comment ${{ github.event.pull_request.number }} --body-file .github/ETHICS_QUESTIONNAIRE.MD echo "Please fill out the ethical/regulatory questionnaire above before this PR can be merged." 
ethics-engine: From 45a77ec3374e34657c2fc64f6323c490e8d2ed9b Mon Sep 17 00:00:00 2001 From: RamGcia Date: Tue, 2 Dec 2025 14:35:40 +1100 Subject: [PATCH 18/19] ethics-gate: trigger on pull_request_target/issue_comment and run from base checkout --- .github/workflows/ethics-gate.yaml | 131 ++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 41 deletions(-) diff --git a/.github/workflows/ethics-gate.yaml b/.github/workflows/ethics-gate.yaml index 31c4812..b738618 100644 --- a/.github/workflows/ethics-gate.yaml +++ b/.github/workflows/ethics-gate.yaml @@ -1,32 +1,36 @@ on: - pull_request: + pull_request_target: types: [opened, reopened, synchronize] - pull_request_review_comment: + issue_comment: types: [created] permissions: contents: read # needed for checkout - pull-requests: write # needed for commenting & reviews (gh) - checks: write # needed for check runs + pull-requests: write # needed for commenting & reviews (gh) when running in pull_request_target + checks: write # needed to create check runs jobs: - require-ethics-questionnaire: - if: github.event.pull_request.draft == false + # Job that posts the questionnaire (runs in the trusted pull_request_target context).
+ post-questionnaire: + if: github.event_name == 'pull_request_target' && github.event.pull_request.draft == false runs-on: ubuntu-latest - outputs: - ethics_status: ${{ steps.check.outputs.status }} steps: - - name: Checkout PR head (so .github/ files exist) + - name: Checkout base repo (safe; do NOT checkout PR head here) uses: actions/checkout@v4 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ github.event.pull_request.base.sha }} + fetch-depth: 0 + + - name: Authenticate gh CLI with GITHUB_TOKEN + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token + - name: Check if questionnaire already answered id: check - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - # Look for bot comment or issue form submission (be robust to empty output) - RESPONSES=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[].body' 2>/dev/null | grep -i "Ethics & Regulatory Questionnaire" -A 20 || true) + PR_NUMBER=${{ github.event.pull_request.number }} + # Collect PR comments (robust to empty output) + RESPONSES=$(gh pr view "$PR_NUMBER" --json comments --jq '.comments[].body' 2>/dev/null | grep -i "Ethics & Regulatory Questionnaire" -A 20 || true) if [[ -z "$RESPONSES" ]]; then echo "status=missing" >> $GITHUB_OUTPUT else @@ -35,52 +39,89 @@ jobs: - name: Post questionnaire if missing if: steps.check.outputs.status == 'missing' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | + # ensure file exists in the base repo checkout (case-sensitive) + if [[ ! -f .github/ETHICS_QUESTIONNAIRE.MD ]]; then + echo ".github/ETHICS_QUESTIONNAIRE.MD not found in base repo; aborting." >&2 + exit 1 + fi gh pr comment ${{ github.event.pull_request.number }} --body-file .github/ETHICS_QUESTIONNAIRE.MD - echo "Please fill out the ethical/regulatory questionnaire above before this PR can be merged." + echo "Posted ethics questionnaire to PR #${{ github.event.pull_request.number }}." 
+ # Ethics engine: collects comments, runs evaluation, posts a check, and requests changes for HIGH risk. + # Intended to run on both pull_request_target and issue_comment; it checks out the base repository rather than PR code, but the comment text it parses is untrusted input. + # On issue_comment runs the request-changes step is skipped by its event-name guard. NOTE(review): `needs: post-questionnaire` appears to skip this whole job whenever that job is skipped (as on issue_comment) - confirm intent. ethics-engine: - needs: require-ethics-questionnaire - if: > - needs.require-ethics-questionnaire.outputs.ethics_status == 'answered' || - github.event_name == 'pull_request_review_comment' runs-on: ubuntu-latest + needs: post-questionnaire steps: - - name: Checkout PR head (engine needs code + workflow files) + - name: Checkout base repo (we run parser from base repo) uses: actions/checkout@v4 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ github.event.pull_request.base.sha || github.ref }} + fetch-depth: 0 + + - name: Authenticate gh CLI with GITHUB_TOKEN + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token + + - name: Determine PR number + id: prnumber + run: | + # Determine PR number whether triggered by pull_request_target or issue_comment + PR_NUMBER=$(jq -r 'if .pull_request then .pull_request.number elif .issue then .issue.number else empty end' "$GITHUB_EVENT_PATH") + if [[ -z "$PR_NUMBER" ]]; then + echo "No PR number found in event payload; exiting."
+ echo "risk=UNKNOWN" >> $GITHUB_OUTPUT + exit 0 + fi + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + + - name: Collect comments + id: collect + run: | + PR=${{ steps.prnumber.outputs.pr_number }} + # Gather all PR comments into a single string (robust to empty) + ANSWERS=$(gh pr view "$PR" --json comments --jq '[.comments[].body] | join("\n\n")' 2>/dev/null || true) + echo "$ANSWERS" > answers.txt + # Expose the answers (trim to avoid huge output) + echo "answers=$(echo "$ANSWERS" | head -c 32768 | sed -e 's/"/'"'"'"/g')" >> $GITHUB_OUTPUT - - name: Parse latest ethics comment & run engine - id: engine + - name: Run ethics parser & evaluator (safe: runs code from base repo) + id: run_engine env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ steps.prnumber.outputs.pr_number }} run: | - # Collect comments (be robust if there are none) - ANSWERS=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '[.comments[].body] | join("\n\n")' 2>/dev/null || true) - python .github/workflows/parse_and_evaluate.py "$ANSWERS" > result.txt || true + # Ensure parser exists + if [[ ! -f .github/workflows/parse_and_evaluate.py ]]; then + echo "Parser .github/workflows/parse_and_evaluate.py not found in base repo; aborting." 
+ echo "RISK_LEVEL=UNKNOWN" > result.txt + else + python3 .github/workflows/parse_and_evaluate.py "$(cat answers.txt)" > result.txt || true + fi cat result.txt - RISK=$(grep RISK_LEVEL result.txt | cut -d= -f2 || echo "LOW") + # Extract RISK_LEVEL=XYZ from result.txt if present + RISK=$(grep -m1 '^RISK_LEVEL=' result.txt | cut -d= -f2 || echo "LOW") echo "risk=$RISK" >> $GITHUB_OUTPUT - - uses: actions/github-script@v7 + - name: Create/update "Ethics Review" check run + uses: actions/github-script@v7 with: + github-token: ${{ secrets.GITHUB_TOKEN }} script: | - const risk = "${{ steps.engine.outputs.risk }}".trim(); + const risk = "${{ steps.run_engine.outputs.risk }}".trim(); const conclusions = { "LOW": "success", "MEDIUM": "action_required", "HIGH": "failure" }; const conclusion = conclusions[risk] || "failure"; - + const head_sha = (context.payload.pull_request && context.payload.pull_request.head && context.payload.pull_request.head.sha) || (context.payload.issue && context.payload.issue.pull_request && context.payload.issue.number ? undefined : undefined) || github.event.pull_request?.head?.sha; await github.rest.checks.create({ owner: context.repo.owner, repo: context.repo.repo, name: "Ethics Review", - head_sha: (context.payload.pull_request && context.payload.pull_request.head && context.payload.pull_request.head.sha) || github.event.pull_request.head.sha, + head_sha: head_sha || context.sha, status: "completed", conclusion, output: { @@ -89,11 +130,19 @@ jobs: } }); - - name: Block merge on HIGH risk - if: steps.engine.outputs.risk == 'HIGH' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Request changes on HIGH risk (trusted-only; skip on untrusted events) + if: steps.run_engine.outputs.risk == 'HIGH' + run: | + PR=${{ steps.prnumber.outputs.pr_number }} + # Only attempt to request changes when running in pull_request_target context (trusted). 
+ if [[ "${GITHUB_EVENT_NAME}" != "pull_request_target" ]]; then + echo "Not in pull_request_target context; skipping request-changes (insufficient permissions for fork PRs)." + exit 0 + fi + # Request changes using gh (GITHUB_TOKEN from pull_request_target has write rights) + gh pr review "$PR" --request-changes -b "@ethics-team Required manual review for high-risk change" + echo "Requested changes on PR #$PR due to HIGH risk." + + - name: Final status message run: | - gh pr review ${{ github.event.pull_request.number }} \ - --request-changes \ - -b "@ethics-team Required manual review for high-risk change" \ No newline at end of file + echo "Ethics engine completed. Risk level: ${{ steps.run_engine.outputs.risk }}" \ No newline at end of file From d4a2144905bbd8ca0606758d44ff7999d244c754 Mon Sep 17 00:00:00 2001 From: RamGcia Date: Tue, 2 Dec 2025 14:38:13 +1100 Subject: [PATCH 19/19] ethics-gate: remove ": " from step name (invalid in an unquoted YAML scalar) --- .github/workflows/ethics-gate.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ethics-gate.yaml b/.github/workflows/ethics-gate.yaml index b738618..28aee42 100644 --- a/.github/workflows/ethics-gate.yaml +++ b/.github/workflows/ethics-gate.yaml @@ -87,7 +87,7 @@ jobs: # Expose the answers (trim to avoid huge output) echo "answers=$(echo "$ANSWERS" | head -c 32768 | sed -e 's/"/'"'"'"/g')" >> $GITHUB_OUTPUT - - name: Run ethics parser & evaluator (safe: runs code from base repo) + - name: Run ethics parser & evaluator (safe runs code from base repo) id: run_engine env: PR_NUMBER: ${{ steps.prnumber.outputs.pr_number }}