diff --git a/sds/src/scanner/regex_rule/config.rs b/sds/src/scanner/regex_rule/config.rs
index 1704c646..1d0d9c5b 100644
--- a/sds/src/scanner/regex_rule/config.rs
+++ b/sds/src/scanner/regex_rule/config.rs
@@ -264,6 +264,7 @@ pub enum SecondaryValidator {
     MoneroAddress,
     NhsCheckDigit,
     NirChecksum,
+    NonHexChecker,
     PolishNationalIdChecksum,
     PolishNipChecksum,
     PortugueseTaxIdChecksum,
diff --git a/sds/src/scanner/test/validators.rs b/sds/src/scanner/test/validators.rs
index 460d9473..1d50fd01 100644
--- a/sds/src/scanner/test/validators.rs
+++ b/sds/src/scanner/test/validators.rs
@@ -1,5 +1,6 @@
 use crate::SecondaryValidator::{
     ChineseIdChecksum, GithubTokenChecksum, IbanChecker, JwtExpirationChecker, NhsCheckDigit,
+    NonHexChecker,
 };
 use crate::scanner::RootRuleConfig;
 use crate::{MatchAction, RegexRuleConfig, ScannerBuilder, SecondaryValidator};
@@ -170,3 +171,32 @@ fn test_nhs_checksum() {
     assert_eq!(matches.len(), 1);
     assert_eq!(content, "[NHS]");
 }
+
+#[test]
+fn test_non_hex_checker_filters_pure_hex() {
+    let rule = RegexRuleConfig::new("[a-zA-Z0-9_]{16,}");
+    let match_action = MatchAction::Redact {
+        replacement: "[token]".to_string(),
+    };
+
+    let rule_with_validator =
+        RootRuleConfig::new(rule.clone().with_validator(Some(NonHexChecker)).build())
+            .match_action(match_action.clone());
+
+    let scanner_without =
+        ScannerBuilder::new(&[RootRuleConfig::new(rule.build()).match_action(match_action)])
+            .build()
+            .unwrap();
+
+    let mut pure_hex = "0123456789abcdef".to_string();
+    assert_eq!(scanner_without.scan(&mut pure_hex).unwrap().len(), 1);
+
+    let scanner_with = ScannerBuilder::new(&[rule_with_validator]).build().unwrap();
+    let mut pure_hex_again = "0123456789abcdef".to_string();
+    assert_eq!(scanner_with.scan(&mut pure_hex_again).unwrap().len(), 0);
+    assert_eq!(pure_hex_again, "0123456789abcdef");
+
+    let mut with_prefix = "sk_live_0123456789abcd".to_string();
+    assert_eq!(scanner_with.scan(&mut with_prefix).unwrap().len(), 1);
+    assert_eq!(with_prefix, "[token]");
+}
diff --git a/sds/src/secondary_validation/mod.rs b/sds/src/secondary_validation/mod.rs
index c0bd0111..8fc23077 100644
--- a/sds/src/secondary_validation/mod.rs
+++ b/sds/src/secondary_validation/mod.rs
@@ -37,6 +37,7 @@ mod luxembourg_individual_nin_checksum;
 mod monero_address;
 mod nhs_check_digit;
 mod nir_checksum;
+mod non_hex_checker;
 mod polish_national_id_checksum;
 mod polish_nip_checksum;
 mod portuguese_tax_id_checksum;
@@ -100,6 +101,7 @@ pub use crate::secondary_validation::luxembourg_individual_nin_checksum::Luxembo
 pub use crate::secondary_validation::monero_address::MoneroAddress;
 pub use crate::secondary_validation::nhs_check_digit::NhsCheckDigit;
 pub use crate::secondary_validation::nir_checksum::NirChecksum;
+pub use crate::secondary_validation::non_hex_checker::NonHexChecker;
 pub use crate::secondary_validation::polish_national_id_checksum::PolishNationalIdChecksum;
 pub use crate::secondary_validation::polish_nip_checksum::PolishNipChecksum;
 pub use crate::secondary_validation::portuguese_tax_id_checksum::PortugueseTaxIdChecksum;
@@ -253,6 +255,7 @@ impl SecondaryValidator {
             SecondaryValidator::MoneroAddress => Arc::new(MoneroAddress),
             SecondaryValidator::NhsCheckDigit => Arc::new(NhsCheckDigit),
             SecondaryValidator::NirChecksum => Arc::new(NirChecksum),
+            SecondaryValidator::NonHexChecker => Arc::new(NonHexChecker),
             SecondaryValidator::PolishNationalIdChecksum => Arc::new(PolishNationalIdChecksum),
             SecondaryValidator::PolishNipChecksum => Arc::new(PolishNipChecksum),
             SecondaryValidator::PortugueseTaxIdChecksum => Arc::new(PortugueseTaxIdChecksum),
diff --git a/sds/src/secondary_validation/non_hex_checker.rs b/sds/src/secondary_validation/non_hex_checker.rs
new file mode 100644
index 00000000..f1e8ae23
--- /dev/null
+++ b/sds/src/secondary_validation/non_hex_checker.rs
@@ -0,0 +1,53 @@
+use crate::secondary_validation::Validator;
+
+/// Accepts matches that contain at least one character outside `[0-9a-fA-F]`.
+///
+/// Useful to drop pure hexadecimal substrings (for example hashes or UUIDs without separators)
+/// while keeping tokens that use a wider alphabet (base64, prefixes, punctuation, etc.).
+pub struct NonHexChecker;
+
+impl Validator for NonHexChecker {
+    fn is_valid_match(&self, regex_match: &str) -> bool {
+        regex_match.chars().any(|c| !c.is_ascii_hexdigit())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::secondary_validation::Validator;
+    use crate::secondary_validation::non_hex_checker::NonHexChecker;
+
+    #[test]
+    fn rejects_pure_hex() {
+        for input in [
+            "",
+            "a",
+            "deadbeef",
+            "DEADBEEF0123456789",
+            "0123456789abcdef",
+            "AbCdEf0123456789",
+        ] {
+            assert!(
+                !NonHexChecker.is_valid_match(input),
+                "expected pure hex or empty to be rejected: {input:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn accepts_when_any_non_hex_present() {
+        for input in [
+            "g",
+            "0g",
+            "sk_live_abc",
+            "abc-def",
+            "ff_FF", // underscore is not hex
+            "日本",
+        ] {
+            assert!(
+                NonHexChecker.is_valid_match(input),
+                "expected non-hex character to accept: {input:?}"
+            );
+        }
+    }
+}