From 7ea219128dd7b36c7284cce7238ebf3a97e743f4 Mon Sep 17 00:00:00 2001 From: Dom Morgan Date: Tue, 26 May 2026 11:14:37 +0100 Subject: [PATCH 1/3] Revert "Removing dependency on voku/portable-utf8 (#33)" This reverts commit d96509294ea843b4b86e4900df27424a6ea0ace8. --- composer.json | 2 +- src/Stemmer/Catalan.php | 25 ++-- src/Stemmer/Danish.php | 33 +++-- src/Stemmer/Dutch.php | 63 ++++----- src/Stemmer/English.php | 79 ++++++------ src/Stemmer/Finnish.php | 115 +++++++++-------- src/Stemmer/French.php | 93 +++++++------- src/Stemmer/German.php | 49 +++---- src/Stemmer/Italian.php | 49 +++---- src/Stemmer/Norwegian.php | 23 ++-- src/Stemmer/Portuguese.php | 49 +++---- src/Stemmer/Romanian.php | 29 +++-- src/Stemmer/Russian.php | 45 ++++--- src/Stemmer/Spanish.php | 72 ++++++----- src/Stemmer/Stem.php | 30 ++--- src/Stemmer/Swedish.php | 25 ++-- src/StemmerFactory.php | 4 +- src/Transliterate.php | 253 ------------------------------------- 18 files changed, 427 insertions(+), 611 deletions(-) delete mode 100644 src/Transliterate.php diff --git a/composer.json b/composer.json index d58a45c..b190dda 100644 --- a/composer.json +++ b/composer.json @@ -11,7 +11,7 @@ ], "require": { "php": ">=7.3", - "joomla/string": ">=2.0.1" + "voku/portable-utf8": "^5.4|^6.0" }, "require-dev":{ "phpunit/phpunit": "^9.0" diff --git a/src/Stemmer/Catalan.php b/src/Stemmer/Catalan.php index 8a5c7d3..d52e4fc 100644 --- a/src/Stemmer/Catalan.php +++ b/src/Stemmer/Catalan.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -86,7 +86,12 @@ class Catalan extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); // Catalan stemmer does not use Rv $this->r1(); @@ -122,7 +127,7 @@ private function step0() { if (($position = $this->search(static::$attached_pronoun)) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -141,7 +146,7 @@ private function step1a() // delete if in R2 if (($position = $this->search(['acions', 'ada', 'ades'])) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -157,11 +162,11 @@ private function step1a() // atius atives ativa ativitat ativitats ible ibles assa asses assos ent ents íssim íssima íssims íssimes // ìssem ìsseu ìssin ims ima imes isme ista ismes istes inia inies íinia ínies ita ites triu trius oses osos // ient otes ots - // + // // delete if in R1 if (($position = $this->search(self::$standard_suffix_1a)) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -236,7 +241,7 @@ private function step1b() // delete if in R1 if (($position = $this->search(static::$verb_suffixes)) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -246,7 +251,7 @@ private function step1b() // delete if in R2 if (($position = $this->search(['ando'])) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -265,7 +270,7 @@ private function step2() // delete if in R1 if (($position = $this->search(static::$residual_suffixes)) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -289,7 +294,7 @@ private function step2() */ private function finish() { - $this->word = str_replace( + $this->word = UTF8::str_replace( ['á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ì', 'ò', 'ï', 'ü', '·'], ['a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'i', 'u', '.'], $this->word diff --git a/src/Stemmer/Danish.php b/src/Stemmer/Danish.php index 5fc7507..c539fdb 100644 --- a/src/Stemmer/Danish.php +++ b/src/Stemmer/Danish.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,7 +22,12 @@ class Danish extends Stem */ public function stem($word): string { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -30,7 +35,7 @@ public function stem($word): string // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = StringHelper::substr($this->word, 3); + $this->r1 = UTF8::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -51,7 +56,7 @@ public function stem($word): string */ private function hasValidSEnding($word) { - $lastLetter = StringHelper::substr($word, -1, 1); + $lastLetter = UTF8::substr($word, -1, 1); return in_array($lastLetter, array('a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å')); } @@ -69,14 +74,14 @@ private function step1() 'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds', 'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e' ))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = StringHelper::substr($this->word, 0, $position); + $word = UTF8::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -92,7 +97,7 @@ private function step1() private function step2() { if ($this->searchIfInR1(array('gd', 'dt', 'gt', 'kt')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } @@ -103,14 +108,14 @@ private function step3() { // If the word ends igst, remove the final st. if ($this->search(array('igst')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -2); + $this->word = UTF8::substr($this->word, 0, -2); } // Search for the longest among the following suffixes in R1, and perform the action indicated. // ig lig elig els // delete, and then repeat step 2 if ( ($position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); $this->step2(); return true; } @@ -118,7 +123,7 @@ private function step3() // løst // replace with løs if ($this->searchIfInR1(array('løst')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } @@ -128,19 +133,19 @@ private function step3() */ private function step4() { - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); if (!$this->inR1(($length-1))) { return false; } - $lastLetter = StringHelper::substr($this->word, -1, 1); + $lastLetter = UTF8::substr($this->word, -1, 1); if (in_array($lastLetter, self::$vowels)) { return false; } - $beforeLastLetter = StringHelper::substr($this->word, -2, 1); + $beforeLastLetter = UTF8::substr($this->word, -2, 1); if ($lastLetter == $beforeLastLetter) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } return true; } diff --git a/src/Stemmer/Dutch.php b/src/Stemmer/Dutch.php index 6a2b563..fc7c1af 100644 --- a/src/Stemmer/Dutch.php +++ b/src/Stemmer/Dutch.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,10 +22,15 @@ class Dutch extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); // First, remove all umlaut and acute accents. - $this->word = str_replace( + $this->word = UTF8::str_replace( array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'), array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'), $this->word); @@ -45,7 +50,7 @@ public function stem($word) // but then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = StringHelper::substr($this->word, 3); + $this->r1 = UTF8::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -66,7 +71,7 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = StringHelper::substr($word, -1, 1); + $lastLetter = UTF8::substr($word, -1, 1); return !in_array($lastLetter, array_merge(self::$vowels, array('j'))); } @@ -77,12 +82,12 @@ private function hasValidSEnding($word) */ private function hasValidEnEnding($word) { - $lastLetter = StringHelper::substr($word, -1, 1); + $lastLetter = UTF8::substr($word, -1, 1); if (in_array($lastLetter, self::$vowels)) { return false; } - $threeLastLetters = StringHelper::substr($word, -3, 3); + $threeLastLetters = UTF8::substr($word, -3, 3); if ($threeLastLetters == 'gem') { return false; } @@ -95,7 +100,7 @@ private function hasValidEnEnding($word) private function unDoubling() { if ($this->search(array('kk', 'dd', 'tt')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } @@ -118,7 +123,7 @@ private function step1() // delete if in R1 and preceded by a valid en-ending, and then undouble the ending if ( ($position = $this->search(array('ene', 'en'))) !== false) { if ($this->inR1($position)) { - $word = StringHelper::substr($this->word, 0, $position); + $word = UTF8::substr($this->word, 0, $position); if ($this->hasValidEnEnding($word)) { $this->word = $word; $this->unDoubling(); @@ -131,7 +136,7 @@ private function step1() // delete if in R1 and preceded by a valid s-ending if ( ($position = $this->search(array('se', 's'))) !== false) { if ($this->inR1($position)) { - $word = StringHelper::substr($this->word, 0, $position); + $word = UTF8::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -150,9 +155,9 @@ private function step2() { if ( ($position = $this->search(array('e'))) !== false) { if ($this->inR1($position)) { - $letter = StringHelper::substr($this->word, -2, 1); + $letter = UTF8::substr($this->word, -2, 1); if (!in_array($letter, self::$vowels)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); $this->unDoubling(); return true; @@ -171,13 +176,13 @@ private function step3a() { if ( ($position = $this->search(array('heid'))) !== false) { if ($this->inR2($position)) { - $letter = StringHelper::substr($this->word, -5, 1); + $letter = UTF8::substr($this->word, -5, 1); if ($letter !== 'c') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position = $this->search(array('en'))) !== false) { if ($this->inR1($position)) { - $word = StringHelper::substr($this->word, 0, $position); + $word = UTF8::substr($this->word, 0, $position); if ($this->hasValidEnEnding($word)) { $this->word = $word; $this->unDoubling(); @@ -201,12 +206,12 @@ private function step3b($removedE) // if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending if ( ($position = $this->search(array('end', 'ing'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('ig'))) !== false) { - $letter = StringHelper::substr($this->word, -3, 1); + $letter = UTF8::substr($this->word, -3, 1); if ($letter !== 'e') { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } } else { $this->unDoubling(); @@ -221,9 +226,9 @@ private function step3b($removedE) // delete if in R2 and not preceded by e if ( ($position = $this->search(array('ig'))) !== false) { if ($this->inR2($position)) { - $letter = StringHelper::substr($this->word, -3, 1); + $letter = UTF8::substr($this->word, -3, 1); if ($letter !== 'e') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } return true; @@ -233,7 +238,7 @@ private function step3b($removedE) // delete if in R2, and then repeat step 2 if ( ($position = $this->search(array('lijk'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); $this->step2(); } return true; @@ -243,7 +248,7 @@ private function step3b($removedE) // delete if in R2 if ( ($position = $this->search(array('baar'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -252,7 +257,7 @@ private function step3b($removedE) // delete if in R2 and if step 2 actually removed an e if ( ($position = $this->search(array('bar'))) !== false) { if ($this->inR2($position) && $removedE) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -268,25 +273,25 @@ private function step3b($removedE) private function step4() { // D is a non-vowel other than I - $d = StringHelper::substr($this->word, -1, 1); + $d = UTF8::substr($this->word, -1, 1); if (in_array($d, array_merge(self::$vowels, array('I')))) { return false; } // V is double a, e, o or u - $v = StringHelper::substr($this->word, -3, 2); + $v = UTF8::substr($this->word, -3, 2); if (!in_array($v, array('aa', 'ee', 'oo', 'uu'))) { return false; } - $singleV = StringHelper::substr($v, 0, 1); + $singleV = UTF8::substr($v, 0, 1); // C is a non-vowel - $c = StringHelper::substr($this->word, -4, 1); + $c = UTF8::substr($this->word, -4, 1); if (in_array($c, self::$vowels)) { return false; } - $this->word = StringHelper::substr($this->word, 0, -4); + $this->word = UTF8::substr($this->word, 0, -4); $this->word .= $c . $singleV .$d; } @@ -296,6 +301,6 @@ private function step4() */ private function finish() { - $this->word = str_replace(array('I', 'Y'), array('i', 'y'), $this->word); + $this->word = UTF8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word); } } diff --git a/src/Stemmer/English.php b/src/Stemmer/English.php index f0e1f2c..fe5f186 100644 --- a/src/Stemmer/English.php +++ b/src/Stemmer/English.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * English Porter 2 @@ -27,11 +27,16 @@ class English extends Stem */ public function stem($word) { - if (StringHelper::strlen($word) < 3) { + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + if (Utf8::strlen($word) < 3) { return $word; } - $this->word = StringHelper::strtolower($word); + $this->word = UTF8::strtolower($word); // exceptions if (null !== ($word = $this->exception1())) { @@ -42,9 +47,9 @@ public function stem($word) $this->plainVowels = implode('', self::$vowels); // Remove initial ', if present. - $first = StringHelper::substr($this->word, 0, 1); + $first = UTF8::substr($this->word, 0, 1); if ($first == "'") { - $this->word = StringHelper::substr($this->word, 1); + $this->word = UTF8::substr($this->word, 1); } // Set initial y, or y after a vowel, to Y @@ -83,7 +88,7 @@ public function stem($word) private function step0() { if ( ($position = $this->search(array("'s'", "'s", "'"))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } @@ -118,10 +123,10 @@ private function step1a() // delete if the preceding word part contains a vowel not immediately before the s (so gas and this retain the s, gaps and kiwis lose it) if ( ($position = $this->search(array('s'))) !== false) { for ($i=0; $i<$position-1; $i++) { - $letter = StringHelper::substr($this->word, $i, 1); + $letter = UTF8::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -152,16 +157,16 @@ private function step1b() // if the word is short, add e (so hop -> hope) if ( ($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== false) { for ($i=0; $i<$position; $i++) { - $letter = StringHelper::substr($this->word, $i, 1); + $letter = UTF8::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ($this->search(array('at', 'bl', 'iz')) !== false) { $this->word .= 'e'; } elseif ( ($position2 = $this->search(self::$doubles)) !== false) { - $this->word = StringHelper::substr($this->word, 0, ($position2+1)); + $this->word = UTF8::substr($this->word, 0, ($position2+1)); } elseif ($this->isShort()) { $this->word .= 'e'; @@ -183,7 +188,7 @@ private function step1c() { // replace suffix y or Y by i if preceded by a non-vowel // which is not the first letter of the word (so cry -> cri, by -> by, say -> say) - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); if ($length < 3) { return true; @@ -191,7 +196,7 @@ private function step1c() if ( ($position = $this->search(array('y', 'Y'))) !== false) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if (! in_array($letter, self::$vowels)) { $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word); @@ -318,7 +323,7 @@ private function step2() if ($this->inR1($position)) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ($letter == 'l') { $this->word = preg_replace('#(ogi)$#u', 'og', $this->word); @@ -333,10 +338,10 @@ private function step2() if ($this->inR1($position)) { // a letter for you - $letter = StringHelper::substr($this->word, ($position-1), 1); + $letter = UTF8::substr($this->word, ($position-1), 1); if (in_array($letter, self::$liEnding)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } @@ -378,13 +383,13 @@ private function step3() // ful ness: delete if ( ($position = $this->searchIfInR1(array('ful', 'ness'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // ative*: delete if in R2 if ( (($position = $this->searchIfInR1(array('ative'))) !== false) && ($this->inR2($position)) ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } @@ -404,7 +409,7 @@ private function step4() 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -413,10 +418,10 @@ private function step4() // delete if preceded by s or t if ( ($position = $this->searchIfInR2(array('ion'))) !== false) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ($letter == 's' || $letter == 't') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; @@ -435,11 +440,11 @@ private function step5() // delete if in R2, or in R1 and not preceded by a short syllable if ( ($position = $this->search(array('e'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { if ( (! $this->searchShortSyllabe(-4, 3)) && (! $this->searchShortSyllabe(-3, 2)) ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } @@ -450,10 +455,10 @@ private function step5() // delete if in R2 and preceded by l if ( ($position = $this->searchIfInR2(array('l'))) !== false) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ($letter == 'l') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; @@ -464,21 +469,21 @@ private function step5() private function finish() { - $this->word = str_replace('Y', 'y', $this->word); + $this->word = UTF8::str_replace('Y', 'y', $this->word); } private function exceptionR1() { - if (StringHelper::strpos($this->word, 'gener') === 0) { - $this->r1 = StringHelper::substr($this->word, 5); + if (Utf8::strpos($this->word, 'gener') === 0) { + $this->r1 = UTF8::substr($this->word, 5); $this->r1Index = 5; - } elseif (StringHelper::strpos($this->word, 'commun') === 0) { - $this->r1 = StringHelper::substr($this->word, 6); + } elseif (Utf8::strpos($this->word, 'commun') === 0) { + $this->r1 = UTF8::substr($this->word, 6); $this->r1Index = 6; - } elseif (StringHelper::strpos($this->word, 'arsen') === 0) { - $this->r1 = StringHelper::substr($this->word, 5); + } elseif (Utf8::strpos($this->word, 'arsen') === 0) { + $this->r1 = UTF8::substr($this->word, 5); $this->r1Index = 5; } } @@ -549,7 +554,7 @@ private function exception2() */ private function isShort() { - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); return ( ($this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2)) && ($length == $this->r1Index) ); } @@ -562,7 +567,7 @@ private function isShort() */ private function searchShortSyllabe($from, $nbLetters) { - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); if ($from < 0) { $from = $length + $from; @@ -576,8 +581,8 @@ private function searchShortSyllabe($from, $nbLetters) return false; } - $first = StringHelper::substr($this->word, $from, 1); - $second = StringHelper::substr($this->word, ($from+1), 1); + $first = UTF8::substr($this->word, $from, 1); + $second = UTF8::substr($this->word, ($from+1), 1); if ($nbLetters == 2) { if ( (in_array($first, self::$vowels)) && (!in_array($second, self::$vowels)) ) { @@ -585,7 +590,7 @@ private function searchShortSyllabe($from, $nbLetters) } } - $third = StringHelper::substr($this->word, ($from+2), 1); + $third = UTF8::substr($this->word, ($from+2), 1); if ( (!in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) && (!in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w'))))) { diff --git a/src/Stemmer/Finnish.php b/src/Stemmer/Finnish.php index c6487b5..25539b2 100644 --- a/src/Stemmer/Finnish.php +++ b/src/Stemmer/Finnish.php @@ -6,7 +6,7 @@ */ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * Finnish Snowball Stemmer. @@ -38,7 +38,12 @@ class Finnish extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (! UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = Utf8::strtolower($word); // R1 and R2 are then defined in the usual way $this->r1(); @@ -69,10 +74,10 @@ private function step1() // (a) kin kaan kään ko kö han hän pa pä // delete if preceded by n, t or a vowel if (($position = $this->searchIfInR1(array('kaan', 'kään', 'kin', 'han', 'hän', 'ko', 'kö', 'pa', 'pä'))) !== false) { - $lastLetter = StringHelper::substr($this->word, ($position-1), 1); + $lastLetter = Utf8::substr($this->word, ($position-1), 1); if (in_array($lastLetter, array_merge(['t', 'n'], self::$vowels))) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); } @@ -84,7 +89,7 @@ private function step1() // delete if in R2 if (($position = $this->searchIfInR1(array('sti'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); } @@ -106,10 +111,10 @@ private function step2() // si // delete if not preceded by k if (($position = $this->searchIfInR1(array('si'))) !== false) { - $lastLetter = StringHelper::substr($this->word, ($position-1), 1); + $lastLetter = Utf8::substr($this->word, ($position-1), 1); if ($lastLetter !== 'k') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -119,7 +124,7 @@ private function step2() // ni // delete if (($position = $this->searchIfInR1(array('ni'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); // if preceded by kse, replace with ksi if ( ($position = $this->search(array('kse'))) !== false) { $this->word = preg_replace('#(kse)$#u', 'ksi', $this->word); @@ -132,7 +137,7 @@ private function step2() // nsa nsä mme nne // delete if (($position = $this->searchIfInR1(array('nsa', 'nsä', 'mme', 'nne'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -141,9 +146,9 @@ private function step2() // an // delete if preceded by one of ta ssa sta lla lta na if (($position = $this->searchIfInR1(array('an'))) !== false) { - $word = StringHelper::substr($this->word, 0, $position); - $lastThreeLetters = StringHelper::substr($word, -3, 3); - $lastTwoLetters = StringHelper::substr($word, -2, 2); + $word = Utf8::substr($this->word, 0, $position); + $lastThreeLetters = Utf8::substr($word, -3, 3); + $lastTwoLetters = Utf8::substr($word, -2, 2); if (in_array($lastThreeLetters, array('ssa', 'sta', 'lla', 'lta'), true) || in_array($lastTwoLetters, array('na', 'ta'), true)) { $this->word = $word; $this->r1(); @@ -155,9 +160,9 @@ private function step2() // än // delete if preceded by one of tä ssä stä llä ltä nä if (($position = $this->searchIfInR1(array('än'))) !== false) { - $word = StringHelper::substr($this->word, 0, $position); - $lastThreeLetters = StringHelper::substr($word, -3, 3); - $lastTwoLetters = StringHelper::substr($word, -2, 2); + $word = Utf8::substr($this->word, 0, $position); + $lastThreeLetters = Utf8::substr($word, -3, 3); + $lastTwoLetters = Utf8::substr($word, -2, 2); if (in_array($lastThreeLetters, array('ssä', 'stä', 'llä', 'ltä'), true) || in_array($lastTwoLetters, array('nä', 'tä'), true)) { $this->word = $word; $this->r1(); @@ -169,9 +174,9 @@ private function step2() // en // delete if preceded by one of lle ine if (($position = $this->searchIfInR1(array('en'))) !== false) { - $word = StringHelper::substr($this->word, 0, $position); - if (StringHelper::strlen($this->word) > 4) { - $lastThreeLetters = StringHelper::substr($this->word, -5, 3); + $word = Utf8::substr($this->word, 0, $position); + if (Utf8::strlen($this->word) > 4) { + $lastThreeLetters = Utf8::substr($this->word, -5, 3); if (in_array($lastThreeLetters, array('lle', 'ine'), true)) { $this->word = $word; $this->r1(); @@ -199,9 +204,9 @@ private function step3() continue; } if (($position = $this->searchIfInR1(array('h' . $vowel . 'n'))) !== false) { - $lastLetter = StringHelper::substr($this->word, $position-1, 1); + $lastLetter = Utf8::substr($this->word, $position-1, 1); if ($lastLetter === $vowel) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -213,11 +218,11 @@ private function step3() // siin den tten // delete if preceded by Vi if (($position = $this->searchIfInR1(array('siin', 'den', 'tten'))) !== false) { - $lastLetter = StringHelper::substr($this->word, ($position-1), 1); + $lastLetter = Utf8::substr($this->word, ($position-1), 1); if ($lastLetter === 'i') { - $nextLastLetter = StringHelper::substr($this->word, ($position-2), 1); + $nextLastLetter = Utf8::substr($this->word, ($position-2), 1); if (in_array($nextLastLetter, self::$restrictedVowels, true)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -229,10 +234,10 @@ private function step3() // seen // delete if preceded by LV if (($position = $this->searchIfInR1(array('seen'))) !== false) { - $lastLetters = StringHelper::substr($this->word, ($position-2), 2); + $lastLetters = Utf8::substr($this->word, ($position-2), 2); if (in_array($lastLetters, self::$longVowels, true)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -243,10 +248,10 @@ private function step3() // tta ttä // delete if preceded by e if (($position = $this->searchIfInR1(array('tta', 'ttä'))) !== false) { - $lastLetter = StringHelper::substr($this->word, ($position-1), 1); + $lastLetter = Utf8::substr($this->word, ($position-1), 1); if ($lastLetter === 'e') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -257,7 +262,7 @@ private function step3() // ta tä ssa ssä sta stä lla llä lta ltä lle na nä ksi ine // delete if (($position = $this->searchIfInR1(array('ssa', 'ssä', 'sta', 'stä', 'lla', 'llä', 'lta', 'ltä', 'lle', 'ksi', 'na', 'nä', 'ine', 'ta', 'tä'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -267,11 +272,11 @@ private function step3() // a ä // delete if preceded by cv if (($position = $this->searchIfInR1(array('a', 'ä'))) !== false) { - $lastLetter = StringHelper::substr($this->word, ($position-1), 1); - $nextLastLetter = StringHelper::substr($this->word, ($position-2), 1); + $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $nextLastLetter = Utf8::substr($this->word, ($position-2), 1); if (in_array($lastLetter, self::$vowels, true) && in_array($nextLastLetter, self::$consonants, true)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -282,12 +287,12 @@ private function step3() // n // delete, and if preceded by LV or ie, delete the last vowel if (($position = $this->searchIfInR1(array('n'))) !== false) { - $lastLetters = StringHelper::substr($this->word, ($position-2), 2); + $lastLetters = Utf8::substr($this->word, ($position-2), 2); if (in_array($lastLetters, self::$longVowels, true) || $lastLetters === 'ie') { - $this->word = StringHelper::substr($this->word, 0, $position-1); + $this->word = Utf8::substr($this->word, 0, $position-1); } else { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); } $this->r1(); $this->r2(); @@ -309,9 +314,9 @@ private function step4() // mpi mpa mpä mmi mma mmä // delete if not preceded by po if (($position = $this->searchIfInR2(array('mpi', 'mpa', 'mpä', 'mmi', 'mma', 'mmä'))) !== false) { - $lastLetters = StringHelper::substr($this->word, ($position-2), 2); + $lastLetters = Utf8::substr($this->word, ($position-2), 2); if ($lastLetters !== 'po') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -321,7 +326,7 @@ private function step4() // impi impa impä immi imma immä eja ejä // delete if (($position = $this->searchIfInR2(array('impi', 'impa', 'impä', 'immi', 'imma', 'immä', 'eja', 'ejä'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -342,27 +347,27 @@ private function step5() { if ($this->_removedInStep3) { if (($position = $this->searchIfInR1(array('i', 'j'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; } } else { if (($position = $this->searchIfInR1(array('t'))) !== false) { - $lastLetter = StringHelper::substr($this->word, ($position-1), 1); + $lastLetter = Utf8::substr($this->word, ($position-1), 1); if (in_array($lastLetter, self::$vowels, true)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = Utf8::substr($this->word, 0, $position); $this->r1(); $this->r2(); if (($position2 = $this->searchIfInR2(array('imma'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = Utf8::substr($this->word, 0, $position2); $this->r1(); $this->r2(); return true; } elseif (($position2 = $this->searchIfInR2(array('mma'))) !== false) { - $lastLetters = StringHelper::substr($this->word, ($position2-2), 2); + $lastLetters = Utf8::substr($this->word, ($position2-2), 2); if ($lastLetters !== 'po') { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = Utf8::substr($this->word, 0, $position2); $this->r1(); $this->r2(); return true; @@ -385,35 +390,35 @@ private function step6() // a) If R1 ends LV // delete the last letter if (($position = $this->searchIfInR1(self::$longVowels)) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position+1); + $this->word = Utf8::substr($this->word, 0, $position+1); $this->r1(); $this->r2(); } // b) If R1 ends cX, c a consonant and X one of a ä e i, // delete the last letter - $lastLetter = StringHelper::substr($this->r1, -1, 1); - $secondToLastLetter = StringHelper::substr($this->r1, -2, 1); + $lastLetter = Utf8::substr($this->r1, -1, 1); + $secondToLastLetter = Utf8::substr($this->r1, -2, 1); if (in_array($secondToLastLetter, self::$consonants, true) && in_array($lastLetter, array('a', 'e', 'i', 'ä'))) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = Utf8::substr($this->word, 0, -1); $this->r1(); $this->r2(); } // c) If R1 ends oj or uj // delete the last letter - $twoLastLetters = StringHelper::substr($this->r1, -2, 2); + $twoLastLetters = Utf8::substr($this->r1, -2, 2); if (in_array($twoLastLetters, array('oj', 'uj'))) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = Utf8::substr($this->word, 0, -1); $this->r1(); $this->r2(); } // d) If R1 ends jo // delete the last letter - $twoLastLetters = StringHelper::substr($this->r1, -2, 2); + $twoLastLetters = Utf8::substr($this->r1, -2, 2); if ($twoLastLetters === 'jo') { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = Utf8::substr($this->word, 0, -1); $this->r1(); $this->r2(); } @@ -422,15 +427,15 @@ private function step6() // vowels, remove the last consonant (so eläkk -> eläk, // aatonaatto -> aatonaato) $endVowels = ''; - for ($i = StringHelper::strlen($this->word) - 1; $i > 0; $i--) { - $letter = StringHelper::substr($this->word, $i, 1); + for ($i = Utf8::strlen($this->word) - 1; $i > 0; $i--) { + $letter = Utf8::substr($this->word, $i, 1); if (in_array($letter, self::$vowels, true)) { $endVowels = $letter . $endVowels; } else { // check for double consonant - $prevLetter = StringHelper::substr($this->word, $i-1, 1); + $prevLetter = Utf8::substr($this->word, $i-1, 1); if ($prevLetter === $letter) { - $this->word = StringHelper::substr($this->word, 0, $i) . $endVowels; + $this->word = Utf8::substr($this->word, 0, $i) . $endVowels; } break; } diff --git a/src/Stemmer/French.php b/src/Stemmer/French.php index 2bc53ca..8e1ee96 100644 --- a/src/Stemmer/French.php +++ b/src/Stemmer/French.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,7 +22,12 @@ class French extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); $this->plainVowels = implode('', self::$vowels); @@ -91,7 +96,7 @@ private function step1() // delete if in R2 if ( ($position = $this->search(array('ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe','isme', 'able', 'iste', 'eux'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return 3; } @@ -101,10 +106,10 @@ private function step1() // if preceded by ic, delete if in R2, else replace by iqU if ( ($position = $this->search(array('atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('ic'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } @@ -145,9 +150,9 @@ private function step1() if ( ($position = $this->search(array('issements', 'issement'))) != false) { if ($this->inR1($position)) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if (! in_array($letter, self::$vowels)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } return 3; @@ -163,20 +168,20 @@ private function step1() // delete if in RV if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise, } elseif ( ($position = $this->search(array('eus'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { $this->word = preg_replace('#(eus)$#u', 'eux', $this->word); @@ -184,7 +189,7 @@ private function step1() // if preceded by abl or iqU, delete if in R2, otherwise, } elseif ( ($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); // if preceded by ièr or Ièr, replace by i if in RV } elseif ( ($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) { @@ -202,13 +207,13 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by abil, delete if in R2, else replace by abl, otherwise, if ( ($position = $this->search(array('abil'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } else { $this->word = preg_replace('#(abil)$#u', 'abl', $this->word); } @@ -216,14 +221,14 @@ private function step1() // if preceded by ic, delete if in R2, else replace by iqU, otherwise, } elseif ( ($position = $this->search(array('ic'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } // if preceded by iv, delete if in R2 } elseif ( ($position = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return 3; @@ -235,15 +240,15 @@ private function step1() if ( ($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } if ( ($position = $this->searchIfInR2(array('at'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('ic'))) !== false) { if ($this->inR2($position2)) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } @@ -273,7 +278,7 @@ private function step1() // delete if in R2, else replace by eux if in R1 if ( ($position = $this->search(array('euses', 'euse'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word); @@ -304,9 +309,9 @@ private function step1() // delete if preceded by a vowel in RV if ( ($position = $this->search(array('ments', 'ment'))) != false) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ( $this->inRv($before) && (in_array($letter, self::$vowels)) ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return 2; @@ -332,9 +337,9 @@ private function step2a() 'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'))) !== false) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ( $this->inRv($before) && (!in_array($letter, self::$vowels)) ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } @@ -355,7 +360,7 @@ private function step2b() 'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez', 'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } @@ -368,12 +373,12 @@ private function step2b() 'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'))) !== false) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ( $this->inRv($before) && ($letter == 'e') ) { - $this->word = StringHelper::substr($this->word, 0, $before); + $this->word = UTF8::substr($this->word, 0, $before); } else { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; @@ -383,7 +388,7 @@ private function step2b() // delete if in R2 if ( ($position = $this->searchIfInRv(array('ions'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; @@ -408,7 +413,7 @@ private function step4() { //If the word ends s, not preceded by a, i, o, u, è or s, delete it. if (preg_match('#[^aiouès]s$#', $this->word)) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } // In the rest of step 4, all tests are confined to the RV region. @@ -416,9 +421,9 @@ private function step4() // delete if in R2 and preceded by s or t if ( (($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position)) ) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ( $this->inRv($before) && (($letter == 's') || ($letter == 't')) ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -433,7 +438,7 @@ private function step4() // e // delete if ( ($this->searchIfInRv(array('e'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); return true; } @@ -441,7 +446,7 @@ private function step4() // if preceded by gu, delete if ( ($position = $this->searchIfInRv(array('guë'))) !== false) { if ($this->inRv($position+2)) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); return true; } } @@ -456,7 +461,7 @@ private function step4() private function step5() { if ($this->search(array('enn', 'onn', 'ett', 'ell', 'eill')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } @@ -475,7 +480,7 @@ private function step6() */ private function finish() { - $this->word = str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); + $this->word = UTF8::str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); } /** @@ -486,7 +491,7 @@ private function finish() */ protected function rv() { - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; @@ -496,28 +501,28 @@ protected function rv() } // If the word begins with two vowels, RV is the region after the third letter - $first = StringHelper::substr($this->word, 0, 1); - $second = StringHelper::substr($this->word, 1, 1); + $first = UTF8::substr($this->word, 0, 1); + $second = UTF8::substr($this->word, 1, 1); if ( (in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) ) { - $this->rv = StringHelper::substr($this->word, 3); + $this->rv = UTF8::substr($this->word, 3); $this->rvIndex = 3; return true; } // (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.) - $begin3 = StringHelper::substr($this->word, 0, 3); + $begin3 = UTF8::substr($this->word, 0, 3); if (in_array($begin3, array('par', 'col', 'tap'))) { - $this->rv = StringHelper::substr($this->word, 3); + $this->rv = UTF8::substr($this->word, 3); $this->rvIndex = 3; return true; } // otherwise the region after the first vowel not at the beginning of the word, for ($i=1; $i<$length; $i++) { - $letter = StringHelper::substr($this->word, $i, 1); + $letter = UTF8::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->rv = StringHelper::substr($this->word, ($i + 1)); + $this->rv = UTF8::substr($this->word, ($i + 1)); $this->rvIndex = $i + 1; return true; } diff --git a/src/Stemmer/German.php b/src/Stemmer/German.php index 11dc733..4dc81a3 100644 --- a/src/Stemmer/German.php +++ b/src/Stemmer/German.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -26,12 +26,17 @@ class German extends Stem */ public function stem($word) { + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + $this->plainVowels = implode('', self::$vowels); - $this->word = StringHelper::strtolower($word); + $this->word = UTF8::strtolower($word); // First, replace ß by ss - $this->word = str_replace('ß', 'ss', $this->word); + $this->word = UTF8::str_replace('ß', 'ss', $this->word); // put u and y between vowels into upper case $this->word = preg_replace('#(['.$this->plainVowels.'])y(['.$this->plainVowels.'])#u', '$1Y$2', $this->word); @@ -44,7 +49,7 @@ public function stem($word) // but then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = StringHelper::substr($this->word, 3); + $this->r1 = UTF8::substr($this->word, 3); } $this->step1(); @@ -63,7 +68,7 @@ private function step1() // delete if in R1 if ( ($position = $this->search(array('em', 'ern', 'er'))) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -71,11 +76,11 @@ private function step1() // delete if in R1 if ( ($position = $this->search(array('es', 'en', 'e'))) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); //If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s if ($this->search(array('niss')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } return true; @@ -85,10 +90,10 @@ private function step1() if ( ($position = $this->search(array('s'))) !== false) { if ($this->inR1($position)) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if (in_array($letter, self::$sEndings)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } return true; @@ -106,7 +111,7 @@ private function step2() // delete if in R1 if ( ($position = $this->search(array('en', 'er', 'est'))) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -117,10 +122,10 @@ private function step2() if ($this->inR1($position)) { $before = $position - 1; if ($before >= 3) { - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if (in_array($letter, self::$stEndings)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } } @@ -139,15 +144,15 @@ private function step3() // if preceded by ig, delete if in R2 and not preceded by e if ( ($position = $this->search(array('end', 'ung'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('ig'))) !== false) { $before = $position2 - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ( ($this->inR2($position2)) && ($letter != 'e') ) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } } return true; @@ -157,10 +162,10 @@ private function step3() // delete if in R2 and not preceded by e if ( ($position = $this->search(array('ig', 'ik', 'isch'))) !== false) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ( ($this->inR2($position)) && ($letter != 'e') ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -170,12 +175,12 @@ private function step3() // if preceded by er or en, delete if in R1 if ( ($position = $this->search(array('lich', 'heit'))) != false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('er', 'en'))) !== false) { if ($this->inR1($position2)) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } } return true; @@ -186,12 +191,12 @@ private function step3() // if preceded by lich or ig, delete if in R2 if ( ($position = $this->search(array('keit'))) != false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('lich', 'ig'))) !== false) { if ($this->inR2($position2)) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } } return true; @@ -206,6 +211,6 @@ private function step3() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); + $this->word = UTF8::str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); } } diff --git a/src/Stemmer/Italian.php b/src/Stemmer/Italian.php index 4bb2004..bb09dee 100644 --- a/src/Stemmer/Italian.php +++ b/src/Stemmer/Italian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,12 +22,17 @@ class Italian extends Stem */ public function stem($word) { + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + $this->plainVowels = implode('', self::$vowels); - $this->word = StringHelper::strtolower($word); + $this->word = UTF8::strtolower($word); // First, replace all acute accents by grave accents. - $this->word = str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); + $this->word = UTF8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); //And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.) The vowels are then $this->word = preg_replace('#([q])u#u', '$1U', $this->word); @@ -67,7 +72,7 @@ private function step0() 'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene', 'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'))) !== false) { - $suffixe = StringHelper::substr($this->word, $position); + $suffixe = UTF8::substr($this->word, $position); // following one of (in RV) // a @@ -77,7 +82,7 @@ private function step0() }, $a); // In case of (a) the suffix is deleted if ($this->searchIfInRv($a) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } //b @@ -107,19 +112,19 @@ private function step1() // if preceded by os, ic or abil, delete if in R2 if ( ($position = $this->search(array('amente'))) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position3); + $this->word = UTF8::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position4); + $this->word = UTF8::substr($this->word, 0, $position4); } return true; } @@ -132,7 +137,7 @@ private function step1() ))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -142,11 +147,11 @@ private function step1() // if preceded by ic, delete if in R2 if ( ($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('ic'))) !== false) { if ($this->inR2($position2)) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } } } @@ -184,7 +189,7 @@ private function step1() // delete if in RV if ( ($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -194,11 +199,11 @@ private function step1() // if preceded by abil, ic or iv, delete if in R2 if ( ($position = $this->search(array('ità'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -208,13 +213,13 @@ private function step1() // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2) if ( ($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('ic'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position3); + $this->word = UTF8::substr($this->word, 0, $position3); } } return true; @@ -238,7 +243,7 @@ private function step2() 'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva', 'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } @@ -249,10 +254,10 @@ private function step2() private function step3a() { if ($this->searchIfInRv(array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); if ($this->searchIfInRv(array('i')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } return true; } @@ -279,6 +284,6 @@ private function step3b() */ private function finish() { - $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Norwegian.php b/src/Stemmer/Norwegian.php index 627a578..b44b722 100644 --- a/src/Stemmer/Norwegian.php +++ b/src/Stemmer/Norwegian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,7 +22,12 @@ class Norwegian extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -30,7 +35,7 @@ public function stem($word) // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = StringHelper::substr($this->word, 3); + $this->r1 = UTF8::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -51,12 +56,12 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = StringHelper::substr($word, -1, 1); + $lastLetter = UTF8::substr($word, -1, 1); if (in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z'))) { return true; } if ($lastLetter == 'k') { - $beforeLetter = StringHelper::substr($word, -2, 1); + $beforeLetter = UTF8::substr($word, -2, 1); if (!in_array($beforeLetter, self::$vowels)) { return true; } @@ -83,14 +88,14 @@ private function step1() 'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane', 'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e' ))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = StringHelper::substr($this->word, 0, $position); + $word = UTF8::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -105,7 +110,7 @@ private function step1() private function step2() { if ($this->searchIfInR1(array('dt', 'vt')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } @@ -119,7 +124,7 @@ private function step3() if ( ($position = $this->searchIfInR1(array( 'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig' ))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } } diff --git a/src/Stemmer/Portuguese.php b/src/Stemmer/Portuguese.php index c5f3aae..c71cc59 100644 --- a/src/Stemmer/Portuguese.php +++ b/src/Stemmer/Portuguese.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,9 +22,14 @@ class Portuguese extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); - $this->word = str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); + $this->word = UTF8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); $this->rv(); $this->r1(); @@ -61,7 +66,7 @@ private function step1() 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -101,19 +106,19 @@ private function step1() // delete if in R1 if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position3); + $this->word = UTF8::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position4); + $this->word = UTF8::substr($this->word, 0, $position4); } return true; } @@ -125,12 +130,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by ante, avel or ível, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('ante', 'avel', 'ível'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -142,12 +147,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by abil, ic or iv, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -159,12 +164,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by at, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -175,7 +180,7 @@ private function step1() if ($this->inRv($position)) { $before = $position -1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ($letter == 'e') { $this->word = preg_replace('#(iras|ira)$#u', 'ir', $this->word); @@ -208,7 +213,7 @@ private function step2() 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou', ))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } return false; @@ -222,10 +227,10 @@ private function step3() { // Delete suffix i if in RV and preceded by c if ($this->searchIfInRv(array('i')) !== false) { - $letter = StringHelper::substr($this->word, -2, 1); + $letter = UTF8::substr($this->word, -2, 1); if ($letter == 'c') { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } return true; } @@ -239,7 +244,7 @@ private function step4() { // If the word ends with one of the suffixes "os a i o á í ó" in RV, delete it if ( ($position = $this->searchIfInRv(array('os', 'a', 'i', 'o','á', 'í', 'ó'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } return false; @@ -252,11 +257,11 @@ private function step5() { // If the word ends with one of "e é ê" in RV, delete it, and if preceded by gu (or ci) with the u (or i) in RV, delete the u (or i). if ($this->searchIfInRv(array('e', 'é', 'ê')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); if ( ($position2 = $this->search(array('gu', 'ci'))) !== false) { if ($this->inRv(($position2+1))) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } return true; @@ -273,6 +278,6 @@ private function step5() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); + $this->word = UTF8::str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); } } diff --git a/src/Stemmer/Romanian.php b/src/Stemmer/Romanian.php index 87047dc..5da8744 100644 --- a/src/Stemmer/Romanian.php +++ b/src/Stemmer/Romanian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,7 +22,12 @@ class Romanian extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); $this->plainVowels = implode('', self::$vowels); @@ -68,7 +73,7 @@ private function step0() // delete if ( ($position = $this->search(array('ul', 'ului'))) !== false) { if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -104,7 +109,7 @@ private function step0() // replace with i if not preceded by ab if ( ($position = $this->search(array('ile'))) !== false) { if ($this->inR1($position)) { - $before = StringHelper::substr($this->word, ($position-2), 2); + $before = UTF8::substr($this->word, ($position-2), 2); if ($before != 'ab') { $this->word = preg_replace('#(ile)$#u', 'i', $this->word); @@ -221,7 +226,7 @@ private function step2() 'at', 'os', 'iv', 'ut', 'it', 'ic' ))) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -231,9 +236,9 @@ private function step2() if ( ($position = $this->search(array('iune', 'iuni'))) !== false) { if ($this->inR2($position)) { $before = $position - 1; - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ($letter == 'ţ') { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); $this->word = preg_replace('#(ţ)$#u', 't', $this->word); } } @@ -277,10 +282,10 @@ private function step3() if ($this->inRv($position)) { $before = $position - 1; if ($this->inRv($before)) { - $letter = StringHelper::substr($this->word, $before, 1); + $letter = UTF8::substr($this->word, $before, 1); if ( (!in_array($letter, self::$vowels)) || ($letter == 'u') ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } } @@ -296,7 +301,7 @@ private function step3() 'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im' ))) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -310,7 +315,7 @@ private function step4() // Search for the longest among the suffixes "a e i ie ă " and, if it is in RV, delete it. if ( ($position = $this->search(array('a', 'ie', 'e', 'i', 'ă'))) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } @@ -324,6 +329,6 @@ private function step4() private function finish() { // Turn I, U back into i, u - $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Russian.php b/src/Stemmer/Russian.php index 3949a45..cd18dbf 100644 --- a/src/Stemmer/Russian.php +++ b/src/Stemmer/Russian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -56,7 +56,12 @@ class Russian extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -83,7 +88,7 @@ private function step1() // group 1 if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[0])) !== false) { if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -91,7 +96,7 @@ private function step1() // group 2 if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[1])) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -99,7 +104,7 @@ private function step1() // Otherwise try and remove a REFLEXIVE ending if ( ($position = $this->searchIfInRv(self::$reflexive)) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } } @@ -107,18 +112,18 @@ private function step1() // As soon as one of the endings (1) to (3) is found remove it, and terminate step 1. if ( ($position = $this->searchIfInRv(self::$adjective)) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->search(self::$participle[0])) !== false) { if ( ($this->inRv($position2)) && ($this->checkGroup1($position2)) ) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); return true; } } if ( ($position2 = $this->search(self::$participle[1])) !== false) { if ($this->inRv($position2)) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); return true; } } @@ -129,21 +134,21 @@ private function step1() if ( ($position = $this->searchIfInRv(self::$verb[0])) !== false) { if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } if ( ($position = $this->searchIfInRv(self::$verb[1])) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } if ( ($position = $this->searchIfInRv(self::$noun)) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -158,7 +163,7 @@ private function step2() { if ( ($position = $this->searchIfInRv(array('и'))) !== false) { if ($this->inRv($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -173,7 +178,7 @@ private function step3() { if ( ($position = $this->searchIfInRv(self::$derivational)) !== false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -187,18 +192,18 @@ private function step4() { // (2) if the word ends with a SUPERLATIVE ending, remove it if ( ($position = $this->searchIfInRv(self::$superlative)) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // (1) Undouble н (n) if ( ($position = $this->searchIfInRv(array('нн'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, ($position+1)); + $this->word = UTF8::substr($this->word, 0, ($position+1)); return true; } // (3) if the word ends ь (') (soft sign) remove it if ( ($position = $this->searchIfInRv(array('ь'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -208,15 +213,15 @@ private function step4() */ protected function rv() { - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; for ($i=0; $i<$length; $i++) { - $letter = StringHelper::substr($this->word, $i, 1); + $letter = UTF8::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->rv = StringHelper::substr($this->word, ($i+1)); + $this->rv = UTF8::substr($this->word, ($i+1)); $this->rvIndex = $i + 1; return true; } @@ -237,7 +242,7 @@ private function checkGroup1($position) return false; } - $letter = StringHelper::substr($this->word, ($position - 1), 1); + $letter = UTF8::substr($this->word, ($position - 1), 1); if ($letter == 'а' || $letter == 'я') { return true; diff --git a/src/Stemmer/Spanish.php b/src/Stemmer/Spanish.php index b83c040..4f6f2c8 100644 --- a/src/Stemmer/Spanish.php +++ b/src/Stemmer/Spanish.php @@ -2,8 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; -use Wamania\Snowball\Transliterate; +use voku\helper\UTF8; /** * @@ -23,7 +22,12 @@ class Spanish extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); $this->rv(); $this->r1(); @@ -67,7 +71,7 @@ public function stem($word) private function step0() { if ( ($position = $this->searchIfInRv(array('selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo' ))) != false) { - $suffixe = StringHelper::substr($this->word, $position); + $suffixe = UTF8::substr($this->word, $position); // a $a = array('iéndo', 'ándo', 'ár', 'ér', 'ír'); @@ -76,11 +80,11 @@ private function step0() }, $a); if ( ($position2 = $this->searchIfInRv($a)) !== false) { - $suffixe2 = StringHelper::substr($this->word, $position2); - $suffixe2 = Transliterate::utf8_latin_to_ascii($suffixe2); // unaccent - $this->word = StringHelper::substr($this->word, 0, $position2); + $suffixe2 = UTF8::substr($this->word, $position2); + $suffixe2 = UTF8::to_utf8(UTF8::to_ascii($suffixe2)); // unaccent + $this->word = UTF8::substr($this->word, 0, $position2); $this->word .= $suffixe2; - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } @@ -91,15 +95,15 @@ private function step0() }, $b); if ( ($position2 = $this->searchIfInRv($b)) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // c if ( ($position2 = $this->searchIfInRv(array('yendo' . $suffixe))) != false) { - $before = StringHelper::substr($this->word, ($position2-1), 1); + $before = UTF8::substr($this->word, ($position2-1), 1); if ( (isset($before)) && ($before == 'u') ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -121,7 +125,7 @@ private function step1() 'ible', 'ables', 'able', 'ismos', 'ismo', 'icas', 'icos', 'ica', 'ico', 'anzas', 'anza'))) != false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } return true; } @@ -133,11 +137,11 @@ private function step1() 'adoras', 'adora', 'aciones', 'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia'))) != false) { if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('ic')))) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -177,19 +181,19 @@ private function step1() // delete if in R1 if ($this->inR1($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position3); + $this->word = UTF8::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position4); + $this->word = UTF8::substr($this->word, 0, $position4); } return true; } @@ -201,12 +205,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by ante, able or ible, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('ante', 'able', 'ible'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -218,12 +222,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by abil, ic or iv, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -235,12 +239,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); } // if preceded by at, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('at'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); } return true; } @@ -258,9 +262,9 @@ private function step2a() if ( ($position = $this->searchIfInRv(array( 'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'))) != false) { - $before = StringHelper::substr($this->word, ($position-1), 1); + $before = UTF8::substr($this->word, ($position-1), 1); if ( (isset($before)) && ($before == 'u') ) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } } @@ -285,17 +289,17 @@ private function step2b() 'aré', 'erá', 'eré', 'áis', 'ías', 'irá', 'iré', 'aba', 'ían', 'ada', 'ara', 'ase', 'ida', 'ado', 'ido', 'ará', 'ad', 'ed', 'id', 'ís', 'ió', 'ar', 'er', 'ir', 'as', 'ía', 'an' ))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // en es éis emos // delete, and if preceded by gu delete the u (the gu need not be in RV) if ( ($position = $this->searchIfInRv(array('éis', 'emos', 'en', 'es'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('gu'))) != false) { - $this->word = StringHelper::substr($this->word, 0, ($position2+1)); + $this->word = UTF8::substr($this->word, 0, ($position2+1)); } @@ -312,19 +316,19 @@ private function step3() // os a o á í ó // delete if in RV if ( ($position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // e é // delete if in RV, and if preceded by gu with the u in RV delete the u if ( ($position = $this->searchIfInRv(array('e', 'é'))) != false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInRv(array('u'))) != false) { - $before = StringHelper::substr($this->word, ($position2-1), 1); + $before = UTF8::substr($this->word, ($position2-1), 1); if ( (isset($before)) && ($before == 'g') ) { - $this->word = StringHelper::substr($this->word, 0, $position2); + $this->word = UTF8::substr($this->word, 0, $position2); return true; } } @@ -339,6 +343,6 @@ private function step3() */ private function finish() { - $this->word = str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); + $this->word = UTF8::str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); } } diff --git a/src/Stemmer/Stem.php b/src/Stemmer/Stem.php index 1ce7274..0c6f148 100644 --- a/src/Stemmer/Stem.php +++ b/src/Stemmer/Stem.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; abstract class Stem implements Stemmer { @@ -94,12 +94,12 @@ protected function searchIfInR2($suffixes) protected function search($suffixes, $offset = 0) { - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); if ($offset > $length) { return false; } foreach ($suffixes as $suffixe) { - if ( (($position = StringHelper::strrpos($this->word, $suffixe, $offset)) !== false) && ((StringHelper::strlen($suffixe)+$position) == $length) ) { + if ( (($position = UTF8::strrpos($this->word, $suffixe, $offset)) !== false) && ((Utf8::strlen($suffixe)+$position) == $length) ) { return $position; } } @@ -134,7 +134,7 @@ protected function r2() */ protected function rx($in) { - $length = StringHelper::strlen($in); + $length = UTF8::strlen($in); // defaults $value = ''; @@ -143,7 +143,7 @@ protected function rx($in) // we search all vowels $vowels = array(); for ($i=0; $i<$length; $i++) { - $letter = StringHelper::substr($in, $i, 1); + $letter = UTF8::substr($in, $i, 1); if (in_array($letter, static::$vowels)) { $vowels[] = $i; } @@ -152,11 +152,11 @@ protected function rx($in) // search the non-vowel following a vowel foreach ($vowels as $position) { $after = $position + 1; - $letter = StringHelper::substr($in, $after, 1); + $letter = UTF8::substr($in, $after, 1); if (! in_array($letter, static::$vowels)) { $index = $after + 1; - $value = StringHelper::substr($in, ($after+1)); + $value = UTF8::substr($in, ($after+1)); break; } @@ -175,7 +175,7 @@ protected function rx($in) */ protected function rv() { - $length = StringHelper::strlen($this->word); + $length = UTF8::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; @@ -184,16 +184,16 @@ protected function rv() return true; } - $first = StringHelper::substr($this->word, 0, 1); - $second = StringHelper::substr($this->word, 1, 1); + $first = UTF8::substr($this->word, 0, 1); + $second = UTF8::substr($this->word, 1, 1); // If the second letter is a consonant, RV is the region after the next following vowel, if (!in_array($second, static::$vowels)) { for ($i=2; $i<$length; $i++) { - $letter = StringHelper::substr($this->word, $i, 1); + $letter = UTF8::substr($this->word, $i, 1); if (in_array($letter, static::$vowels)) { $this->rvIndex = $i + 1; - $this->rv = StringHelper::substr($this->word, ($i+1)); + $this->rv = UTF8::substr($this->word, ($i+1)); return true; } } @@ -202,10 +202,10 @@ protected function rv() // or if the first two letters are vowels, RV is the region after the next consonant, if ( (in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { for ($i=2; $i<$length; $i++) { - $letter = StringHelper::substr($this->word, $i, 1); + $letter = UTF8::substr($this->word, $i, 1); if (! in_array($letter, static::$vowels)) { $this->rvIndex = $i + 1; - $this->rv = StringHelper::substr($this->word, ($i+1)); + $this->rv = UTF8::substr($this->word, ($i+1)); return true; } } @@ -213,7 +213,7 @@ protected function rv() // and otherwise (consonant-vowel case) RV is the region after the third letter. if ( (! in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { - $this->rv = StringHelper::substr($this->word, 3); + $this->rv = UTF8::substr($this->word, 3); $this->rvIndex = 3; return true; } diff --git a/src/Stemmer/Swedish.php b/src/Stemmer/Swedish.php index ed8103c..32352ef 100644 --- a/src/Stemmer/Swedish.php +++ b/src/Stemmer/Swedish.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use Joomla\String\StringHelper; +use voku\helper\UTF8; /** * @@ -22,7 +22,12 @@ class Swedish extends Stem */ public function stem($word) { - $this->word = StringHelper::strtolower($word); + // we do ALL in UTF-8 + if (!UTF8::is_utf8($word)) { + throw new \Exception('Word must be in UTF-8'); + } + + $this->word = UTF8::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -30,7 +35,7 @@ public function stem($word) // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = StringHelper::substr($this->word, 3); + $this->r1 = UTF8::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -50,7 +55,7 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = StringHelper::substr($word, -1, 1); + $lastLetter = UTF8::substr($word, -1, 1); return in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y')); } @@ -69,14 +74,14 @@ private function step1() 'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het', 'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e' ))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = StringHelper::substr($this->word, 0, $position); + $word = UTF8::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -91,7 +96,7 @@ private function step2() { // dd gd nn dt gt kt tt if ($this->searchIfInR1(array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt')) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); } } @@ -104,21 +109,21 @@ private function step3() // lig ig els // delete if ( ($position = $this->searchIfInR1(array('lig', 'ig', 'els'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, $position); + $this->word = UTF8::substr($this->word, 0, $position); return true; } // löst // replace with lös if ( ($this->searchIfInR1(array('löst'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); return true; } // fullt // replace with full if ( ($this->searchIfInR1(array('fullt'))) !== false) { - $this->word = StringHelper::substr($this->word, 0, -1); + $this->word = UTF8::substr($this->word, 0, -1); return true; } } diff --git a/src/StemmerFactory.php b/src/StemmerFactory.php index b8c487a..d60a8c6 100644 --- a/src/StemmerFactory.php +++ b/src/StemmerFactory.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball; -use Joomla\String\StringHelper; +use voku\helper\UTF8; use Wamania\Snowball\Stemmer\Catalan; use Wamania\Snowball\Stemmer\Danish; use Wamania\Snowball\Stemmer\Dutch; @@ -43,7 +43,7 @@ class StemmerFactory */ public static function create(string $code): Stemmer { - $code = StringHelper::strtolower($code); + $code = UTF8::strtolower($code); foreach (self::LANGS as $classname => $isoCodes) { if (in_array($code, $isoCodes)) { diff --git a/src/Transliterate.php b/src/Transliterate.php deleted file mode 100644 index 3399f6b..0000000 --- a/src/Transliterate.php +++ /dev/null @@ -1,253 +0,0 @@ - 'a', - 'ô' => 'o', - 'ď' => 'd', - 'ḟ' => 'f', - 'ë' => 'e', - 'š' => 's', - 'ơ' => 'o', - 'ß' => 'ss', - 'ă' => 'a', - 'ř' => 'r', - 'ț' => 't', - 'ň' => 'n', - 'ā' => 'a', - 'ķ' => 'k', - 'ŝ' => 's', - 'ỳ' => 'y', - 'ņ' => 'n', - 'ĺ' => 'l', - 'ħ' => 'h', - 'ṗ' => 'p', - 'ó' => 'o', - 'ú' => 'u', - 'ě' => 'e', - 'é' => 'e', - 'ç' => 'c', - 'ẁ' => 'w', - 'ċ' => 'c', - 'õ' => 'o', - 'ṡ' => 's', - 'ø' => 'o', - 'ģ' => 'g', - 'ŧ' => 't', - 'ș' => 's', - 'ė' => 'e', - 'ĉ' => 'c', - 'ś' => 's', - 'î' => 'i', - 'ű' => 'u', - 'ć' => 'c', - 'ę' => 'e', - 'ŵ' => 'w', - 'ṫ' => 't', - 'ū' => 'u', - 'č' => 'c', - 'ö' => 'oe', - 'è' => 'e', - 'ŷ' => 'y', - 'ą' => 'a', - 'ł' => 'l', - 'ų' => 'u', - 'ů' => 'u', - 'ş' => 's', - 'ğ' => 'g', - 'ļ' => 'l', - 'ƒ' => 'f', - 'ž' => 'z', - 'ẃ' => 'w', - 'ḃ' => 'b', - 'å' => 'a', - 'ì' => 'i', - 'ï' => 'i', - 'ḋ' => 'd', - 'ť' => 't', - 'ŗ' => 'r', - 'ä' => 'ae', - 'í' => 'i', - 'ŕ' => 'r', - 'ê' => 'e', - 'ü' => 'ue', - 'ò' => 'o', - 'ē' => 'e', - 'ñ' => 'n', - 'ń' => 'n', - 'ĥ' => 'h', - 'ĝ' => 'g', - 'đ' => 'd', - 'ĵ' => 'j', - 'ÿ' => 'y', - 'ũ' => 'u', - 'ŭ' => 'u', - 'ư' => 'u', - 'ţ' => 't', - 'ý' => 'y', - 'ő' => 'o', - 'â' => 'a', - 'ľ' => 'l', - 'ẅ' => 'w', - 'ż' => 'z', - 'ī' => 'i', - 'ã' => 'a', - 'ġ' => 'g', - 'ṁ' => 'm', - 'ō' => 'o', - 'ĩ' => 'i', - 'ù' => 'u', - 'į' => 'i', - 'ź' => 'z', - 'á' => 'a', - 'û' => 'u', - 'þ' => 'th', - 'ð' => 'dh', - 'æ' => 'ae', - 'µ' => 'u', - 'ĕ' => 'e', - 'œ' => 'oe', - ]; - } - - $string = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $string); - } - - if ($case >= 0) { - if (\is_null($UTF8_UPPER_ACCENTS)) { - $UTF8_UPPER_ACCENTS = [ - 'À' => 'A', - 'Ô' => 'O', - 'Ď' => 'D', - 'Ḟ' => 'F', - 'Ë' => 'E', - 'Š' => 'S', - 'Ơ' => 'O', - 'Ă' => 'A', - 'Ř' => 'R', - 'Ț' => 'T', - 'Ň' => 'N', - 'Ā' => 'A', - 'Ķ' => 'K', - 'Ŝ' => 'S', - 'Ỳ' => 'Y', - 'Ņ' => 'N', - 'Ĺ' => 'L', - 'Ħ' => 'H', - 'Ṗ' => 'P', - 'Ó' => 'O', - 'Ú' => 'U', - 'Ě' => 'E', - 'É' => 'E', - 'Ç' => 'C', - 'Ẁ' => 'W', - 'Ċ' => 'C', - 'Õ' => 'O', - 'Ṡ' => 'S', - 'Ø' => 'O', - 'Ģ' => 'G', - 'Ŧ' => 'T', - 'Ș' => 'S', - 'Ė' => 'E', - 'Ĉ' => 'C', - 'Ś' => 'S', - 'Î' => 'I', - 'Ű' => 'U', - 'Ć' => 'C', - 'Ę' => 'E', - 'Ŵ' => 'W', - 'Ṫ' => 'T', - 'Ū' => 'U', - 'Č' => 'C', - 'Ö' => 'Oe', - 'È' => 'E', - 'Ŷ' => 'Y', - 'Ą' => 'A', - 'Ł' => 'L', - 'Ų' => 'U', - 'Ů' => 'U', - 'Ş' => 'S', - 'Ğ' => 'G', - 'Ļ' => 'L', - 'Ƒ' => 'F', - 'Ž' => 'Z', - 'Ẃ' => 'W', - 'Ḃ' => 'B', - 'Å' => 'A', - 'Ì' => 'I', - 'Ï' => 'I', - 'Ḋ' => 'D', - 'Ť' => 'T', - 'Ŗ' => 'R', - 'Ä' => 'Ae', - 'Í' => 'I', - 'Ŕ' => 'R', - 'Ê' => 'E', - 'Ü' => 'Ue', - 'Ò' => 'O', - 'Ē' => 'E', - 'Ñ' => 'N', - 'Ń' => 'N', - 'Ĥ' => 'H', - 'Ĝ' => 'G', - 'Đ' => 'D', - 'Ĵ' => 'J', - 'Ÿ' => 'Y', - 'Ũ' => 'U', - 'Ŭ' => 'U', - 'Ư' => 'U', - 'Ţ' => 'T', - 'Ý' => 'Y', - 'Ő' => 'O', - 'Â' => 'A', - 'Ľ' => 'L', - 'Ẅ' => 'W', - 'Ż' => 'Z', - 'Ī' => 'I', - 'Ã' => 'A', - 'Ġ' => 'G', - 'Ṁ' => 'M', - 'Ō' => 'O', - 'Ĩ' => 'I', - 'Ù' => 'U', - 'Į' => 'I', - 'Ź' => 'Z', - 'Á' => 'A', - 'Û' => 'U', - 'Þ' => 'Th', - 'Ð' => 'Dh', - 'Æ' => 'Ae', - 'Ĕ' => 'E', - 'Œ' => 'Oe', - ]; - } - - $string = str_replace(array_keys($UTF8_UPPER_ACCENTS), array_values($UTF8_UPPER_ACCENTS), $string); - } - - return $string; - } -} From 73d7331e375f3075a91068dc9bbad281d9d1b4ea Mon Sep 17 00:00:00 2001 From: Dom Morgan Date: Tue, 26 May 2026 11:33:32 +0100 Subject: [PATCH 2/3] Remove deprecations --- .gitignore | 1 + test/CatalanTest.php | 2 +- test/CsvFileIterator.php | 5 +++++ test/CsvFileVerboseIterator.php | 28 ---------------------------- 4 files changed, 7 insertions(+), 29 deletions(-) delete mode 100644 test/CsvFileVerboseIterator.php diff --git a/.gitignore b/.gitignore index 1fc9c34..d2895e2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # PHPUnit /app/phpunit.xml /phpunit.xml +.phpunit.result.cache # Build data /build/ diff --git a/test/CatalanTest.php b/test/CatalanTest.php index 2512c48..b410590 100644 --- a/test/CatalanTest.php +++ b/test/CatalanTest.php @@ -20,6 +20,6 @@ public function testStem($word, $stem) public function load() { - return new CsvFileVerboseIterator('test/files/ca.txt'); + return new CsvFileIterator('test/files/ca.txt'); } } diff --git a/test/CsvFileIterator.php b/test/CsvFileIterator.php index ddc0b23..bf1ed17 100644 --- a/test/CsvFileIterator.php +++ b/test/CsvFileIterator.php @@ -19,6 +19,7 @@ public function __destruct() fclose($this->file); } + #[\ReturnTypeWillChange] public function rewind() { rewind($this->file); @@ -32,21 +33,25 @@ public function rewind() $this->key = 0; } + #[\ReturnTypeWillChange] public function valid() { return !feof($this->file); } + #[\ReturnTypeWillChange] public function key() { return $this->key; } + #[\ReturnTypeWillChange] public function current() { return $this->current; } + #[\ReturnTypeWillChange] public function next() { $line = fgets($this->file); diff --git a/test/CsvFileVerboseIterator.php b/test/CsvFileVerboseIterator.php deleted file mode 100644 index 25314b6..0000000 --- a/test/CsvFileVerboseIterator.php +++ /dev/null @@ -1,28 +0,0 @@ -_updateKey($this->current()); - } - - public function next() - { - parent::next(); - if ($this->valid()) { - $this->_updateKey($this->current()); - } - } - - protected function _updateKey($value) - { - if ($value && sizeof($value)) { - $this->key = $value[0]; - } elseif (sizeof($this->current)) { - $this->key = $this->current[0]; - } - } -} From fcc2652d0107421e8a987be7fbd3ccfccdd608ca Mon Sep 17 00:00:00 2001 From: Dom Morgan Date: Tue, 26 May 2026 17:38:09 +0100 Subject: [PATCH 3/3] Remove deprecated str_replace --- src/Stemmer/Catalan.php | 4 ++-- src/Stemmer/Dutch.php | 4 ++-- src/Stemmer/English.php | 2 +- src/Stemmer/French.php | 2 +- src/Stemmer/German.php | 4 ++-- src/Stemmer/Italian.php | 4 ++-- src/Stemmer/Portuguese.php | 4 ++-- src/Stemmer/Romanian.php | 2 +- src/Stemmer/Spanish.php | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Stemmer/Catalan.php b/src/Stemmer/Catalan.php index d52e4fc..b1de0ab 100644 --- a/src/Stemmer/Catalan.php +++ b/src/Stemmer/Catalan.php @@ -162,7 +162,7 @@ private function step1a() // atius atives ativa ativitat ativitats ible ibles assa asses assos ent ents íssim íssima íssims íssimes // ìssem ìsseu ìssin ims ima imes isme ista ismes istes inia inies íinia ínies ita ites triu trius oses osos // ient otes ots - // + // // delete if in R1 if (($position = $this->search(self::$standard_suffix_1a)) !== false) { if ($this->inR1($position)) { @@ -294,7 +294,7 @@ private function step2() */ private function finish() { - $this->word = UTF8::str_replace( + $this->word = str_replace( ['á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ì', 'ò', 'ï', 'ü', '·'], ['a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'i', 'u', '.'], $this->word diff --git a/src/Stemmer/Dutch.php b/src/Stemmer/Dutch.php index fc7c1af..8fba0d6 100644 --- a/src/Stemmer/Dutch.php +++ b/src/Stemmer/Dutch.php @@ -30,7 +30,7 @@ public function stem($word) $this->word = UTF8::strtolower($word); // First, remove all umlaut and acute accents. - $this->word = UTF8::str_replace( + $this->word = str_replace( array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'), array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'), $this->word); @@ -301,6 +301,6 @@ private function step4() */ private function finish() { - $this->word = UTF8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word); + $this->word = str_replace(array('I', 'Y'), array('i', 'y'), $this->word); } } diff --git a/src/Stemmer/English.php b/src/Stemmer/English.php index fe5f186..0e747d0 100644 --- a/src/Stemmer/English.php +++ b/src/Stemmer/English.php @@ -469,7 +469,7 @@ private function step5() private function finish() { - $this->word = UTF8::str_replace('Y', 'y', $this->word); + $this->word = str_replace('Y', 'y', $this->word); } private function exceptionR1() diff --git a/src/Stemmer/French.php b/src/Stemmer/French.php index 8e1ee96..cef305e 100644 --- a/src/Stemmer/French.php +++ b/src/Stemmer/French.php @@ -480,7 +480,7 @@ private function step6() */ private function finish() { - $this->word = UTF8::str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); + $this->word = str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); } /** diff --git a/src/Stemmer/German.php b/src/Stemmer/German.php index 4dc81a3..2410ee7 100644 --- a/src/Stemmer/German.php +++ b/src/Stemmer/German.php @@ -36,7 +36,7 @@ public function stem($word) $this->word = UTF8::strtolower($word); // First, replace ß by ss - $this->word = UTF8::str_replace('ß', 'ss', $this->word); + $this->word = str_replace('ß', 'ss', $this->word); // put u and y between vowels into upper case $this->word = preg_replace('#(['.$this->plainVowels.'])y(['.$this->plainVowels.'])#u', '$1Y$2', $this->word); @@ -211,6 +211,6 @@ private function step3() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = UTF8::str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); + $this->word = str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); } } diff --git a/src/Stemmer/Italian.php b/src/Stemmer/Italian.php index bb09dee..40c9d86 100644 --- a/src/Stemmer/Italian.php +++ b/src/Stemmer/Italian.php @@ -32,7 +32,7 @@ public function stem($word) $this->word = UTF8::strtolower($word); // First, replace all acute accents by grave accents. - $this->word = UTF8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); + $this->word = str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); //And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.) The vowels are then $this->word = preg_replace('#([q])u#u', '$1U', $this->word); @@ -284,6 +284,6 @@ private function step3b() */ private function finish() { - $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Portuguese.php b/src/Stemmer/Portuguese.php index c71cc59..485aba0 100644 --- a/src/Stemmer/Portuguese.php +++ b/src/Stemmer/Portuguese.php @@ -29,7 +29,7 @@ public function stem($word) $this->word = UTF8::strtolower($word); - $this->word = UTF8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); + $this->word = str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); $this->rv(); $this->r1(); @@ -278,6 +278,6 @@ private function step5() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = UTF8::str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); + $this->word = str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); } } diff --git a/src/Stemmer/Romanian.php b/src/Stemmer/Romanian.php index 5da8744..3e9edd1 100644 --- a/src/Stemmer/Romanian.php +++ b/src/Stemmer/Romanian.php @@ -329,6 +329,6 @@ private function step4() private function finish() { // Turn I, U back into i, u - $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Spanish.php b/src/Stemmer/Spanish.php index 4f6f2c8..190f761 100644 --- a/src/Stemmer/Spanish.php +++ b/src/Stemmer/Spanish.php @@ -343,6 +343,6 @@ private function step3() */ private function finish() { - $this->word = UTF8::str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); + $this->word = str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); } }