diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
index 152990aa479..f106fcc3098 100644
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -1424,6 +1424,7 @@ name = "codex-memory"
 version = "0.0.0"
 dependencies = [
  "anyhow",
+ "regex",
  "rusqlite",
  "serde",
  "serde_json",
diff --git a/codex-rs/memory/Cargo.toml b/codex-rs/memory/Cargo.toml
index 8d5c26a4bd4..be0026239b4 100644
--- a/codex-rs/memory/Cargo.toml
+++ b/codex-rs/memory/Cargo.toml
@@ -7,6 +7,7 @@ edition = { workspace = true }
 anyhow = "1"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
+regex = "1"
 
 [features]
 default = []
diff --git a/codex-rs/memory/src/redact.rs b/codex-rs/memory/src/redact.rs
index 673aa20c833..ac5674d0111 100644
--- a/codex-rs/memory/src/redact.rs
+++ b/codex-rs/memory/src/redact.rs
@@ -4,6 +4,133 @@ pub struct Redaction {
     pub blocked: bool,
 }
 
-pub fn redact_candidate(_s: &str) -> Redaction {
-    todo!()
+/// Scans `s` for likely secrets and returns a copy with each one masked.
+///
+/// Detectors run in this order: `NAME=VALUE` style credentials, SSH public keys,
+/// PEM private-key blocks, then long high-entropy tokens. A finding that lies
+/// entirely inside an already recorded span is dropped; every other finding adds
+/// one entry to `issues`. Overlapping or touching spans are merged and replaced
+/// by a single `[REDACTED]` marker in `masked`. `blocked` is true when at least
+/// one issue was recorded.
+pub fn redact_candidate(s: &str) -> Redaction {
+    use regex::Regex;
+    use std::sync::LazyLock;
+
+    // The patterns are constant, so compile each one once per process instead of
+    // on every call.
+    static API_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r"(?i)(api[_-]?key|token|secret|password)[\s:=]+([A-Za-z0-9_\-]{16,})")
+            .expect("API key pattern is a valid regex")
+    });
+    static SSH_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r"ssh-(rsa|ed25519) [A-Za-z0-9+/=]{20,}")
+            .expect("SSH key pattern is a valid regex")
+    });
+    static PEM_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]+?-----END [A-Z ]*PRIVATE KEY-----")
+            .expect("PEM pattern is a valid regex")
+    });
+    static ENT_RE: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(r"[A-Za-z0-9+/=_-]{20,}").expect("token pattern is a valid regex")
+    });
+
+    // Minimum Shannon entropy (bits per byte) for a long token to be flagged.
+    const ENTROPY_THRESHOLD: f64 = 4.5;
+
+    // Records `range` and its `issue` unless an existing span already covers it.
+    fn push_span(
+        spans: &mut Vec<(usize, usize)>,
+        issues: &mut Vec<String>,
+        range: (usize, usize),
+        issue: &str,
+    ) {
+        let covered = spans
+            .iter()
+            .any(|&(start, end)| range.0 >= start && range.1 <= end);
+        if !covered {
+            spans.push(range);
+            issues.push(issue.to_string());
+        }
+    }
+
+    // Issues found so far, and the byte ranges of `s` that must be masked.
+    let mut issues: Vec<String> = Vec::new();
+    let mut spans: Vec<(usize, usize)> = Vec::new();
+
+    // Credentials of the form NAME=VALUE (or NAME: VALUE) where VALUE is long.
+    // Only the value (capture group 2) is masked; the name stays readable.
+    for caps in API_RE.captures_iter(s) {
+        if let Some(mat) = caps.get(2) {
+            push_span(&mut spans, &mut issues, (mat.start(), mat.end()), "possible API key");
+        }
+    }
+
+    // SSH public keys.
+    for mat in SSH_RE.find_iter(s) {
+        push_span(&mut spans, &mut issues, (mat.start(), mat.end()), "possible SSH key");
+    }
+
+    // PEM encoded private keys, including the BEGIN/END armor lines.
+    for mat in PEM_RE.find_iter(s) {
+        push_span(&mut spans, &mut issues, (mat.start(), mat.end()), "possible private key");
+    }
+
+    // High entropy strings: long base64/hex-like tokens that do not overlap
+    // anything already flagged by the detectors above.
+    for mat in ENT_RE.find_iter(s) {
+        let overlaps = spans
+            .iter()
+            .any(|&(start, end)| mat.start() < end && mat.end() > start);
+        if !overlaps && shannon_entropy(mat.as_str()) >= ENTROPY_THRESHOLD {
+            push_span(&mut spans, &mut issues, (mat.start(), mat.end()), "high-entropy string");
+        }
+    }
+
+    // Merge overlapping or touching spans so each region is masked only once.
+    spans.sort_by_key(|r| r.0);
+    let mut merged: Vec<(usize, usize)> = Vec::with_capacity(spans.len());
+    for (start, end) in spans {
+        if let Some(last) = merged.last_mut() && start <= last.1 {
+            last.1 = last.1.max(end);
+            continue;
+        }
+        merged.push((start, end));
+    }
+
+    // Build the masked string: copy the text between spans verbatim and emit
+    // one marker per merged span.
+    let mut masked = String::with_capacity(s.len());
+    let mut last = 0usize;
+    for (start, end) in merged {
+        masked.push_str(&s[last..start]);
+        masked.push_str("[REDACTED]");
+        last = end;
+    }
+    masked.push_str(&s[last..]);
+
+    let blocked = !issues.is_empty();
+    Redaction {
+        masked,
+        issues,
+        blocked,
+    }
+}
+
+/// Returns the Shannon entropy of the bytes of `s`, in bits per byte.
+///
+/// An empty string has entropy `0.0`.
+fn shannon_entropy(s: &str) -> f64 {
+    let mut freq = [0u32; 256];
+    for b in s.bytes() {
+        freq[usize::from(b)] += 1;
+    }
+    let len = s.len() as f64;
+    let mut ent = 0f64;
+    for &count in &freq {
+        if count > 0 {
+            let p = f64::from(count) / len;
+            ent -= p * p.log2();
+        }
+    }
+    ent
 }
diff --git a/codex-rs/memory/tests/redact.rs b/codex-rs/memory/tests/redact.rs
index 27a5b49e5e8..fe9b036debf 100644
--- a/codex-rs/memory/tests/redact.rs
+++ b/codex-rs/memory/tests/redact.rs
@@ -1,4 +1,37 @@
+use codex_memory::redact::redact_candidate;
+
 #[test]
-fn placeholder() {
-    // placeholder test
+fn api_key_detection() {
+    let input = "Here is API_KEY=ABCD1234EFGH5678IJKL9012";
+    let result = redact_candidate(input);
+    assert!(result.blocked);
+    assert!(result.issues.iter().any(|i| i.contains("API key")));
+    assert_eq!(result.masked, "Here is API_KEY=[REDACTED]");
+}
+
+#[test]
+fn ssh_key_detection() {
+    let input = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBS8up32jhRz25k4b1qm0Nn1ta1Vx";
+    let result = redact_candidate(input);
+    assert!(result.blocked);
+    assert!(result.issues.iter().any(|i| i.contains("SSH key")));
+    assert_eq!(result.masked, "[REDACTED]");
+}
+
+#[test]
+fn high_entropy_detection() {
+    let input = "token: XyZ0123456789+/ABCdefghIJKLmnoPQRstuVWxyz0123";
+    let result = redact_candidate(input);
+    assert!(result.blocked);
+    assert!(result.issues.iter().any(|i| i.contains("high-entropy")));
+    assert_eq!(result.masked, "token: [REDACTED]");
+}
+
+#[test]
+fn no_detection() {
+    let input = "ordinary text";
+    let result = redact_candidate(input);
+    assert!(!result.blocked);
+    assert!(result.issues.is_empty());
+    assert_eq!(result.masked, input);
 }