From 32d63992ecb89fe2f54bae84a07b636041d61ac1 Mon Sep 17 00:00:00 2001 From: deemonic Date: Thu, 26 Mar 2026 21:35:39 +0000 Subject: [PATCH 1/3] fix: detect profanity with invisible unicode chars and asterisk censoring Two bypass vectors were reported where profanity went undetected: 1. Invisible Unicode characters between letters (e.g. f\u{2063}uck) 2. Asterisk-censored words (e.g. f*g, s**t) Fixes: strip \p{Cf} format characters before processing, add '*' as a universal letter substitution, and use \x01 for internal masking to prevent re-matching masked text. Co-Authored-By: Claude Opus 4.6 (1M context) --- config/blasp.php | 50 ++++++++--------- src/Drivers/RegexDriver.php | 7 ++- tests/BypassVulnerabilityTest.php | 91 +++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 26 deletions(-) create mode 100644 tests/BypassVulnerabilityTest.php diff --git a/config/blasp.php b/config/blasp.php index 30f6206..be7ad1b 100644 --- a/config/blasp.php +++ b/config/blasp.php @@ -156,32 +156,32 @@ |-------------------------------------------------------------------------- */ 'substitutions' => [ - '/a/' => ['a', '4', '@', 'Á', 'á', 'À', 'Â', 'à', 'Â', 'â', 'Ä', 'ä', 'Ã', 'ã', 'Å', 'å', 'æ', 'Æ', 'α', 'Δ', 'Λ', 'λ'], - '/b/' => ['b', '8', '\\', '3', 'ß', 'Β', 'β'], - '/c/' => ['c', 'Ç', 'ç', 'ć', 'Ć', 'č', 'Č', '¢', '€', '<', '(', '{', '©'], - '/d/' => ['d', '\\', ')', 'Þ', 'þ', 'Ð', 'ð'], - '/e/' => ['e', '3', '€', 'È', 'è', 'É', 'é', 'Ê', 'ê', 'ë', 'Ë', 'ē', 'Ē', 'ė', 'Ė', 'ę', 'Ę', '∑'], - '/f/' => ['f', 'ƒ'], - '/g/' => ['g', '6', '9'], - '/h/' => ['h', 'Η'], - '/i/' => ['i', '!', '|', ']', '[', '1', '∫', 'Ì', 'Í', 'Î', 'Ï', 'ì', 'í', 'î', 'ï', 'ī', 'Ī', 'į', 'Į'], - '/j/' => ['j'], - '/k/' => ['k', 'Κ', 'κ'], - '/l/' => ['l', '!', '|', ']', '[', '£', '∫', 'Ì', 'Í', 'Î', 'Ï', 'ł', 'Ł'], - '/m/' => ['m'], - '/n/' => ['n', 'η', 'Ν', 'Π', 'ñ', 'Ñ', 'ń', 'Ń'], - '/o/' => ['o', '0', 'Ο', 'ο', 'Φ', '¤', '°', 'ø', 'ô', 'Ô', 'ö', 'Ö', 'ò', 'Ò', 'ó', 'Ó', 'œ', 'Œ', 'ø', 'Ø', 'ō', 'Ō', 'õ', 'Õ'], - '/p/' => ['p', 'ρ', 'Ρ', '¶', 'þ'], - '/q/' => ['q'], - '/r/' => ['r', '®'], - '/s/' => ['s', '5', '\$', '§', 'ß', 'Ś', 'ś', 'Š', 'š'], - '/t/' => ['t', 'Τ', 'τ'], + '/a/' => ['a', '4', '@', '*', 'Á', 'á', 'À', 'Â', 'à', 'Â', 'â', 'Ä', 'ä', 'Ã', 'ã', 'Å', 'å', 'æ', 'Æ', 'α', 'Δ', 'Λ', 'λ'], + '/b/' => ['b', '8', '\\', '3', '*', 'ß', 'Β', 'β'], + '/c/' => ['c', '*', 'Ç', 'ç', 'ć', 'Ć', 'č', 'Č', '¢', '€', '<', '(', '{', '©'], + '/d/' => ['d', '*', '\\', ')', 'Þ', 'þ', 'Ð', 'ð'], + '/e/' => ['e', '3', '*', '€', 'È', 'è', 'É', 'é', 'Ê', 'ê', 'ë', 'Ë', 'ē', 'Ē', 'ė', 'Ė', 'ę', 'Ę', '∑'], + '/f/' => ['f', '*', 'ƒ'], + '/g/' => ['g', '6', '9', '*'], + '/h/' => ['h', '*', 'Η'], + '/i/' => ['i', '!', '|', ']', '[', '1', '*', '∫', 'Ì', 'Í', 'Î', 'Ï', 'ì', 'í', 'î', 'ï', 'ī', 'Ī', 'į', 'Į'], + '/j/' => ['j', '*'], + '/k/' => ['k', '*', 'Κ', 'κ'], + '/l/' => ['l', '!', '|', ']', '[', '*', '£', '∫', 'Ì', 'Í', 'Î', 'Ï', 'ł', 'Ł'], + '/m/' => ['m', '*'], + '/n/' => ['n', '*', 'η', 'Ν', 'Π', 'ñ', 'Ñ', 'ń', 'Ń'], + '/o/' => ['o', '0', '*', 'Ο', 'ο', 'Φ', '¤', '°', 'ø', 'ô', 'Ô', 'ö', 'Ö', 'ò', 'Ò', 'ó', 'Ó', 'œ', 'Œ', 'ø', 'Ø', 'ō', 'Ō', 'õ', 'Õ'], + '/p/' => ['p', '*', 'ρ', 'Ρ', '¶', 'þ'], + '/q/' => ['q', '*'], + '/r/' => ['r', '*', '®'], + '/s/' => ['s', '5', '*', '\$', '§', 'ß', 'Ś', 'ś', 'Š', 'š'], + '/t/' => ['t', '*', 'Τ', 'τ'], '/u/' => ['u', 'υ', 'µ', 'û', 'ü', 'ù', 'ú', 'ū', 'Û', 'Ü', 'Ù', 'Ú', 'Ū', '@', '*'], - '/v/' => ['v', 'υ', 'ν'], - '/w/' => ['w', 'ω', 'ψ', 'Ψ'], - '/x/' => ['x', 'Χ', 'χ'], - '/y/' => ['y', '¥', 'γ', 'ÿ', 'ý', 'Ÿ', 'Ý'], - '/z/' => ['z', 'Ζ', 'ž', 'Ž', 'ź', 'Ź', 'ż', 'Ż'], + '/v/' => ['v', '*', 'υ', 'ν'], + '/w/' => ['w', '*', 'ω', 'ψ', 'Ψ'], + '/x/' => ['x', '*', 'Χ', 'χ'], + '/y/' => ['y', '*', '¥', 'γ', 'ÿ', 'ý', 'Ÿ', 'Ý'], + '/z/' => ['z', '*', 'Ζ', 'ž', 'Ž', 'ź', 'Ź', 'ż', 'Ż'], ], /* diff --git a/src/Drivers/RegexDriver.php b/src/Drivers/RegexDriver.php index 810bf17..4a38d39 100644 --- a/src/Drivers/RegexDriver.php +++ b/src/Drivers/RegexDriver.php @@ -27,6 +27,9 @@ public function detect(string $text, Dictionary $dictionary, MaskStrategyInterfa $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8'); } + // Strip invisible Unicode format characters (zero-width spaces, invisible separators, etc.) + $text = preg_replace('/\p{Cf}/u', '', $text); + $this->filter = new FalsePositiveFilter($dictionary->getFalsePositives()); $this->compoundDetector = new CompoundWordDetector(); @@ -105,7 +108,9 @@ public function detect(string $text, Dictionary $dictionary, MaskStrategyInterfa $continue = true; // Mask in normalizedString only (needed for loop termination) - $normalizedString = mb_substr($normalizedString, 0, $start) . str_repeat('*', $length) . + // Use SOH control char internally to avoid re-matching when '*' is + // a valid substitution character in profanity patterns + $normalizedString = mb_substr($normalizedString, 0, $start) . str_repeat("\x01", $length) . mb_substr($normalizedString, $start + $length); // Record masked range using character positions from immutable string diff --git a/tests/BypassVulnerabilityTest.php b/tests/BypassVulnerabilityTest.php new file mode 100644 index 0000000..5fe33db --- /dev/null +++ b/tests/BypassVulnerabilityTest.php @@ -0,0 +1,91 @@ +assertTrue($result->isOffensive()); + $this->assertContains('fuck', $result->uniqueWords()); + } + + public function test_zero_width_space_in_shit() + { + $result = Blasp::check("s\u{200B}hit"); + $this->assertTrue($result->isOffensive()); + $this->assertContains('shit', $result->uniqueWords()); + } + + public function test_multiple_invisible_chars_in_profanity() + { + $result = Blasp::check("f\u{200B}\u{2063}uck"); + $this->assertTrue($result->isOffensive()); + $this->assertContains('fuck', $result->uniqueWords()); + } + + public function test_invisible_chars_in_clean_text_no_false_positive() + { + $result = Blasp::check("he\u{2063}llo"); + $this->assertFalse($result->isOffensive()); + } + + public function test_invisible_separator_clean_output_masks_profanity() + { + $result = Blasp::check("f\u{2063}uck this"); + $this->assertTrue($result->isOffensive()); + $this->assertSame('**** this', $result->clean()); + } + + // ------------------------------------------------------- + // Censored Profanity (asterisk as letter replacement) + // ------------------------------------------------------- + + public function test_asterisk_censored_fag() + { + $result = Blasp::check('f*g'); + $this->assertTrue($result->isOffensive()); + $this->assertContains('fag', $result->uniqueWords()); + } + + public function test_asterisk_censored_fuck() + { + $result = Blasp::check('f**k'); + $this->assertTrue($result->isOffensive()); + } + + public function test_asterisk_censored_shit() + { + $result = Blasp::check('s**t'); + $this->assertTrue($result->isOffensive()); + } + + public function test_asterisk_fully_censored_fuck() + { + $result = Blasp::check('f***'); + $this->assertTrue($result->isOffensive()); + } + + public function test_asterisk_in_non_profane_word_no_false_positive() + { + $result = Blasp::check('b*g'); + $this->assertFalse($result->isOffensive()); + } + + // ------------------------------------------------------- + // Combined: invisible + wildcard + // ------------------------------------------------------- + + public function test_invisible_char_plus_asterisk_censoring() + { + $result = Blasp::check("f\u{2063}*g"); + $this->assertTrue($result->isOffensive()); + } +} From bcc0d895a0afb8b88577ad16c9834deb2fed9131 Mon Sep 17 00:00:00 2001 From: deemonic Date: Thu, 26 Mar 2026 21:56:17 +0000 Subject: [PATCH 2/3] fix: move invisible char stripping to Analyzer to prevent pipeline position drift Moves the \p{Cf} stripping from RegexDriver to Analyzer so all drivers in a pipeline receive already-stripped text. This prevents position misalignment when PipelineDriver applies RegexDriver positions to the original (unstripped) input. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/Core/Analyzer.php | 4 ++++ src/Drivers/RegexDriver.php | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Core/Analyzer.php b/src/Core/Analyzer.php index 13e611b..1e727fb 100644 --- a/src/Core/Analyzer.php +++ b/src/Core/Analyzer.php @@ -17,6 +17,10 @@ public function analyze( ): Result { $mask = $mask ?? new CharacterMask(config('blasp.mask', config('blasp.mask_character', '*'))); + // Strip invisible Unicode format characters (zero-width spaces, invisible separators, etc.) + // before any driver sees the text, ensuring consistent positions across pipeline drivers + $text = preg_replace('/\p{Cf}/u', '', $text); + return $driver->detect($text, $dictionary, $mask, $options); } } diff --git a/src/Drivers/RegexDriver.php b/src/Drivers/RegexDriver.php index 4a38d39..78890e3 100644 --- a/src/Drivers/RegexDriver.php +++ b/src/Drivers/RegexDriver.php @@ -27,9 +27,6 @@ public function detect(string $text, Dictionary $dictionary, MaskStrategyInterfa $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8'); } - // Strip invisible Unicode format characters (zero-width spaces, invisible separators, etc.) - $text = preg_replace('/\p{Cf}/u', '', $text); - $this->filter = new FalsePositiveFilter($dictionary->getFalsePositives()); $this->compoundDetector = new CompoundWordDetector(); From ea1de4989ae32cf7994e79358065c261f252a55e Mon Sep 17 00:00:00 2001 From: deemonic Date: Thu, 26 Mar 2026 22:13:14 +0000 Subject: [PATCH 3/3] fix: guard against preg_replace returning null on malformed UTF-8 preg_replace with /u flag returns null on invalid UTF-8 input. Fall back to the original text to avoid silently losing input. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/Core/Analyzer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Analyzer.php b/src/Core/Analyzer.php index 1e727fb..63c1436 100644 --- a/src/Core/Analyzer.php +++ b/src/Core/Analyzer.php @@ -19,7 +19,7 @@ public function analyze( // Strip invisible Unicode format characters (zero-width spaces, invisible separators, etc.) // before any driver sees the text, ensuring consistent positions across pipeline drivers - $text = preg_replace('/\p{Cf}/u', '', $text); + $text = preg_replace('/\p{Cf}/u', '', $text) ?? $text; return $driver->detect($text, $dictionary, $mask, $options); }