From e2c48aaa25f40771f44355af1bd1bc8917e7fa4c Mon Sep 17 00:00:00 2001 From: Hannes Papenberg Date: Wed, 11 Dec 2024 12:02:24 +0100 Subject: [PATCH 1/7] Removing dependency on voku/portable-utf8 --- composer.json | 4 +- src/Stemmer/Catalan.php | 25 ++-- src/Stemmer/Danish.php | 33 ++--- src/Stemmer/Dutch.php | 63 ++++---- src/Stemmer/English.php | 79 +++++----- src/Stemmer/Finnish.php | 115 +++++++-------- src/Stemmer/French.php | 93 ++++++------ src/Stemmer/German.php | 49 +++---- src/Stemmer/Italian.php | 49 +++---- src/Stemmer/Norwegian.php | 23 ++- src/Stemmer/Portuguese.php | 49 +++---- src/Stemmer/Romanian.php | 29 ++-- src/Stemmer/Russian.php | 45 +++--- src/Stemmer/Spanish.php | 72 +++++---- src/Stemmer/Stem.php | 30 ++-- src/Stemmer/Swedish.php | 25 ++-- src/StemmerFactory.php | 4 +- src/Transliterate.php | 253 ++++++++++++++++++++++++++++++++ test/CsvFileIterator.php | 10 +- test/CsvFileVerboseIterator.php | 4 +- 20 files changed, 619 insertions(+), 435 deletions(-) create mode 100644 src/Transliterate.php diff --git a/composer.json b/composer.json index b190dda..b01b73f 100644 --- a/composer.json +++ b/composer.json @@ -10,8 +10,8 @@ } ], "require": { - "php": ">=7.3", - "voku/portable-utf8": "^5.4|^6.0" + "php": ">=8.1", + "joomla/string": ">=3.0.1" }, "require-dev":{ "phpunit/phpunit": "^9.0" diff --git a/src/Stemmer/Catalan.php b/src/Stemmer/Catalan.php index d52e4fc..8a5c7d3 100644 --- a/src/Stemmer/Catalan.php +++ b/src/Stemmer/Catalan.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -86,12 +86,7 @@ class Catalan extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // Catalan stemmer does not use Rv $this->r1(); @@ -127,7 +122,7 @@ private function step0() { if (($position = $this->search(static::$attached_pronoun)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -146,7 +141,7 @@ private function step1a() // delete if in R2 if (($position = $this->search(['acions', 'ada', 'ades'])) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -162,11 +157,11 @@ private function step1a() // atius atives ativa ativitat ativitats ible ibles assa asses assos ent ents íssim íssima íssims íssimes // ìssem ìsseu ìssin ims ima imes isme ista ismes istes inia inies íinia ínies ita ites triu trius oses osos // ient otes ots - // + // // delete if in R1 if (($position = $this->search(self::$standard_suffix_1a)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -241,7 +236,7 @@ private function step1b() // delete if in R1 if (($position = $this->search(static::$verb_suffixes)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -251,7 +246,7 @@ private function step1b() // delete if in R2 if (($position = $this->search(['ando'])) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -270,7 +265,7 @@ private function step2() // delete if in R1 if (($position = $this->search(static::$residual_suffixes)) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -294,7 +289,7 @@ private function step2() */ private function finish() { - $this->word = UTF8::str_replace( + $this->word = str_replace( ['á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ì', 'ò', 'ï', 'ü', '·'], ['a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'i', 'u', '.'], $this->word diff --git a/src/Stemmer/Danish.php b/src/Stemmer/Danish.php index c539fdb..5fc7507 100644 --- a/src/Stemmer/Danish.php +++ b/src/Stemmer/Danish.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Danish extends Stem */ public function stem($word): string { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -35,7 +30,7 @@ public function stem($word): string // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -56,7 +51,7 @@ public function stem($word): string */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); return in_array($lastLetter, array('a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å')); } @@ -74,14 +69,14 @@ private function step1() 'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds', 'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -97,7 +92,7 @@ private function step1() private function step2() { if ($this->searchIfInR1(array('gd', 'dt', 'gt', 'kt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -108,14 +103,14 @@ private function step3() { // If the word ends igst, remove the final st. if ($this->search(array('igst')) !== false) { - $this->word = UTF8::substr($this->word, 0, -2); + $this->word = StringHelper::substr($this->word, 0, -2); } // Search for the longest among the following suffixes in R1, and perform the action indicated. // ig lig elig els // delete, and then repeat step 2 if ( ($position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->step2(); return true; } @@ -123,7 +118,7 @@ private function step3() // løst // replace with løs if ($this->searchIfInR1(array('løst')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -133,19 +128,19 @@ private function step3() */ private function step4() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if (!$this->inR1(($length-1))) { return false; } - $lastLetter = UTF8::substr($this->word, -1, 1); + $lastLetter = StringHelper::substr($this->word, -1, 1); if (in_array($lastLetter, self::$vowels)) { return false; } - $beforeLastLetter = UTF8::substr($this->word, -2, 1); + $beforeLastLetter = StringHelper::substr($this->word, -2, 1); if ($lastLetter == $beforeLastLetter) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } return true; } diff --git a/src/Stemmer/Dutch.php b/src/Stemmer/Dutch.php index fc7c1af..6a2b563 100644 --- a/src/Stemmer/Dutch.php +++ b/src/Stemmer/Dutch.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,15 +22,10 @@ class Dutch extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // First, remove all umlaut and acute accents. - $this->word = UTF8::str_replace( + $this->word = str_replace( array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'), array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'), $this->word); @@ -50,7 +45,7 @@ public function stem($word) // but then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -71,7 +66,7 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); return !in_array($lastLetter, array_merge(self::$vowels, array('j'))); } @@ -82,12 +77,12 @@ private function hasValidSEnding($word) */ private function hasValidEnEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); if (in_array($lastLetter, self::$vowels)) { return false; } - $threeLastLetters = UTF8::substr($word, -3, 3); + $threeLastLetters = StringHelper::substr($word, -3, 3); if ($threeLastLetters == 'gem') { return false; } @@ -100,7 +95,7 @@ private function hasValidEnEnding($word) private function unDoubling() { if ($this->search(array('kk', 'dd', 'tt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -123,7 +118,7 @@ private function step1() // delete if in R1 and preceded by a valid en-ending, and then undouble the ending if ( ($position = $this->search(array('ene', 'en'))) !== false) { if ($this->inR1($position)) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidEnEnding($word)) { $this->word = $word; $this->unDoubling(); @@ -136,7 +131,7 @@ private function step1() // delete if in R1 and preceded by a valid s-ending if ( ($position = $this->search(array('se', 's'))) !== false) { if ($this->inR1($position)) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -155,9 +150,9 @@ private function step2() { if ( ($position = $this->search(array('e'))) !== false) { if ($this->inR1($position)) { - $letter = UTF8::substr($this->word, -2, 1); + $letter = StringHelper::substr($this->word, -2, 1); if (!in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->unDoubling(); return true; @@ -176,13 +171,13 @@ private function step3a() { if ( ($position = $this->search(array('heid'))) !== false) { if ($this->inR2($position)) { - $letter = UTF8::substr($this->word, -5, 1); + $letter = StringHelper::substr($this->word, -5, 1); if ($letter !== 'c') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position = $this->search(array('en'))) !== false) { if ($this->inR1($position)) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidEnEnding($word)) { $this->word = $word; $this->unDoubling(); @@ -206,12 +201,12 @@ private function step3b($removedE) // if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending if ( ($position = $this->search(array('end', 'ing'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('ig'))) !== false) { - $letter = UTF8::substr($this->word, -3, 1); + $letter = StringHelper::substr($this->word, -3, 1); if ($letter !== 'e') { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } else { $this->unDoubling(); @@ -226,9 +221,9 @@ private function step3b($removedE) // delete if in R2 and not preceded by e if ( ($position = $this->search(array('ig'))) !== false) { if ($this->inR2($position)) { - $letter = UTF8::substr($this->word, -3, 1); + $letter = StringHelper::substr($this->word, -3, 1); if ($letter !== 'e') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } return true; @@ -238,7 +233,7 @@ private function step3b($removedE) // delete if in R2, and then repeat step 2 if ( ($position = $this->search(array('lijk'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->step2(); } return true; @@ -248,7 +243,7 @@ private function step3b($removedE) // delete if in R2 if ( ($position = $this->search(array('baar'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -257,7 +252,7 @@ private function step3b($removedE) // delete if in R2 and if step 2 actually removed an e if ( ($position = $this->search(array('bar'))) !== false) { if ($this->inR2($position) && $removedE) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -273,25 +268,25 @@ private function step3b($removedE) private function step4() { // D is a non-vowel other than I - $d = UTF8::substr($this->word, -1, 1); + $d = StringHelper::substr($this->word, -1, 1); if (in_array($d, array_merge(self::$vowels, array('I')))) { return false; } // V is double a, e, o or u - $v = UTF8::substr($this->word, -3, 2); + $v = StringHelper::substr($this->word, -3, 2); if (!in_array($v, array('aa', 'ee', 'oo', 'uu'))) { return false; } - $singleV = UTF8::substr($v, 0, 1); + $singleV = StringHelper::substr($v, 0, 1); // C is a non-vowel - $c = UTF8::substr($this->word, -4, 1); + $c = StringHelper::substr($this->word, -4, 1); if (in_array($c, self::$vowels)) { return false; } - $this->word = UTF8::substr($this->word, 0, -4); + $this->word = StringHelper::substr($this->word, 0, -4); $this->word .= $c . $singleV .$d; } @@ -301,6 +296,6 @@ private function step4() */ private function finish() { - $this->word = UTF8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word); + $this->word = str_replace(array('I', 'Y'), array('i', 'y'), $this->word); } } diff --git a/src/Stemmer/English.php b/src/Stemmer/English.php index fe5f186..f0e1f2c 100644 --- a/src/Stemmer/English.php +++ b/src/Stemmer/English.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * English Porter 2 @@ -27,16 +27,11 @@ class English extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - if (Utf8::strlen($word) < 3) { + if (StringHelper::strlen($word) < 3) { return $word; } - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // exceptions if (null !== ($word = $this->exception1())) { @@ -47,9 +42,9 @@ public function stem($word) $this->plainVowels = implode('', self::$vowels); // Remove initial ', if present. - $first = UTF8::substr($this->word, 0, 1); + $first = StringHelper::substr($this->word, 0, 1); if ($first == "'") { - $this->word = UTF8::substr($this->word, 1); + $this->word = StringHelper::substr($this->word, 1); } // Set initial y, or y after a vowel, to Y @@ -88,7 +83,7 @@ public function stem($word) private function step0() { if ( ($position = $this->search(array("'s'", "'s", "'"))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -123,10 +118,10 @@ private function step1a() // delete if the preceding word part contains a vowel not immediately before the s (so gas and this retain the s, gaps and kiwis lose it) if ( ($position = $this->search(array('s'))) !== false) { for ($i=0; $i<$position-1; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -157,16 +152,16 @@ private function step1b() // if the word is short, add e (so hop -> hope) if ( ($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== false) { for ($i=0; $i<$position; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ($this->search(array('at', 'bl', 'iz')) !== false) { $this->word .= 'e'; } elseif ( ($position2 = $this->search(self::$doubles)) !== false) { - $this->word = UTF8::substr($this->word, 0, ($position2+1)); + $this->word = StringHelper::substr($this->word, 0, ($position2+1)); } elseif ($this->isShort()) { $this->word .= 'e'; @@ -188,7 +183,7 @@ private function step1c() { // replace suffix y or Y by i if preceded by a non-vowel // which is not the first letter of the word (so cry -> cri, by -> by, say -> say) - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if ($length < 3) { return true; @@ -196,7 +191,7 @@ private function step1c() if ( ($position = $this->search(array('y', 'Y'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (! in_array($letter, self::$vowels)) { $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word); @@ -323,7 +318,7 @@ private function step2() if ($this->inR1($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'l') { $this->word = preg_replace('#(ogi)$#u', 'og', $this->word); @@ -338,10 +333,10 @@ private function step2() if ($this->inR1($position)) { // a letter for you - $letter = UTF8::substr($this->word, ($position-1), 1); + $letter = StringHelper::substr($this->word, ($position-1), 1); if (in_array($letter, self::$liEnding)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -383,13 +378,13 @@ private function step3() // ful ness: delete if ( ($position = $this->searchIfInR1(array('ful', 'ness'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // ative*: delete if in R2 if ( (($position = $this->searchIfInR1(array('ative'))) !== false) && ($this->inR2($position)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -409,7 +404,7 @@ private function step4() 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -418,10 +413,10 @@ private function step4() // delete if preceded by s or t if ( ($position = $this->searchIfInR2(array('ion'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 's' || $letter == 't') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -440,11 +435,11 @@ private function step5() // delete if in R2, or in R1 and not preceded by a short syllable if ( ($position = $this->search(array('e'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { if ( (! $this->searchShortSyllabe(-4, 3)) && (! $this->searchShortSyllabe(-3, 2)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -455,10 +450,10 @@ private function step5() // delete if in R2 and preceded by l if ( ($position = $this->searchIfInR2(array('l'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'l') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -469,21 +464,21 @@ private function step5() private function finish() { - $this->word = UTF8::str_replace('Y', 'y', $this->word); + $this->word = str_replace('Y', 'y', $this->word); } private function exceptionR1() { - if (Utf8::strpos($this->word, 'gener') === 0) { - $this->r1 = UTF8::substr($this->word, 5); + if (StringHelper::strpos($this->word, 'gener') === 0) { + $this->r1 = StringHelper::substr($this->word, 5); $this->r1Index = 5; - } elseif (Utf8::strpos($this->word, 'commun') === 0) { - $this->r1 = UTF8::substr($this->word, 6); + } elseif (StringHelper::strpos($this->word, 'commun') === 0) { + $this->r1 = StringHelper::substr($this->word, 6); $this->r1Index = 6; - } elseif (Utf8::strpos($this->word, 'arsen') === 0) { - $this->r1 = UTF8::substr($this->word, 5); + } elseif (StringHelper::strpos($this->word, 'arsen') === 0) { + $this->r1 = StringHelper::substr($this->word, 5); $this->r1Index = 5; } } @@ -554,7 +549,7 @@ private function exception2() */ private function isShort() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); return ( ($this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2)) && ($length == $this->r1Index) ); } @@ -567,7 +562,7 @@ private function isShort() */ private function searchShortSyllabe($from, $nbLetters) { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if ($from < 0) { $from = $length + $from; @@ -581,8 +576,8 @@ private function searchShortSyllabe($from, $nbLetters) return false; } - $first = UTF8::substr($this->word, $from, 1); - $second = UTF8::substr($this->word, ($from+1), 1); + $first = StringHelper::substr($this->word, $from, 1); + $second = StringHelper::substr($this->word, ($from+1), 1); if ($nbLetters == 2) { if ( (in_array($first, self::$vowels)) && (!in_array($second, self::$vowels)) ) { @@ -590,7 +585,7 @@ private function searchShortSyllabe($from, $nbLetters) } } - $third = UTF8::substr($this->word, ($from+2), 1); + $third = StringHelper::substr($this->word, ($from+2), 1); if ( (!in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) && (!in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w'))))) { diff --git a/src/Stemmer/Finnish.php b/src/Stemmer/Finnish.php index 25539b2..c6487b5 100644 --- a/src/Stemmer/Finnish.php +++ b/src/Stemmer/Finnish.php @@ -6,7 +6,7 @@ */ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * Finnish Snowball Stemmer. @@ -38,12 +38,7 @@ class Finnish extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (! UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = Utf8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R1 and R2 are then defined in the usual way $this->r1(); @@ -74,10 +69,10 @@ private function step1() // (a) kin kaan kään ko kö han hän pa pä // delete if preceded by n, t or a vowel if (($position = $this->searchIfInR1(array('kaan', 'kään', 'kin', 'han', 'hän', 'ko', 'kö', 'pa', 'pä'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if (in_array($lastLetter, array_merge(['t', 'n'], self::$vowels))) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); } @@ -89,7 +84,7 @@ private function step1() // delete if in R2 if (($position = $this->searchIfInR1(array('sti'))) !== false) { if ($this->inR2($position)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); } @@ -111,10 +106,10 @@ private function step2() // si // delete if not preceded by k if (($position = $this->searchIfInR1(array('si'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if ($lastLetter !== 'k') { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -124,7 +119,7 @@ private function step2() // ni // delete if (($position = $this->searchIfInR1(array('ni'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); // if preceded by kse, replace with ksi if ( ($position = $this->search(array('kse'))) !== false) { $this->word = preg_replace('#(kse)$#u', 'ksi', $this->word); @@ -137,7 +132,7 @@ private function step2() // nsa nsä mme nne // delete if (($position = $this->searchIfInR1(array('nsa', 'nsä', 'mme', 'nne'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -146,9 +141,9 @@ private function step2() // an // delete if preceded by one of ta ssa sta lla lta na if (($position = $this->searchIfInR1(array('an'))) !== false) { - $word = Utf8::substr($this->word, 0, $position); - $lastThreeLetters = Utf8::substr($word, -3, 3); - $lastTwoLetters = Utf8::substr($word, -2, 2); + $word = StringHelper::substr($this->word, 0, $position); + $lastThreeLetters = StringHelper::substr($word, -3, 3); + $lastTwoLetters = StringHelper::substr($word, -2, 2); if (in_array($lastThreeLetters, array('ssa', 'sta', 'lla', 'lta'), true) || in_array($lastTwoLetters, array('na', 'ta'), true)) { $this->word = $word; $this->r1(); @@ -160,9 +155,9 @@ private function step2() // än // delete if preceded by one of tä ssä stä llä ltä nä if (($position = $this->searchIfInR1(array('än'))) !== false) { - $word = Utf8::substr($this->word, 0, $position); - $lastThreeLetters = Utf8::substr($word, -3, 3); - $lastTwoLetters = Utf8::substr($word, -2, 2); + $word = StringHelper::substr($this->word, 0, $position); + $lastThreeLetters = StringHelper::substr($word, -3, 3); + $lastTwoLetters = StringHelper::substr($word, -2, 2); if (in_array($lastThreeLetters, array('ssä', 'stä', 'llä', 'ltä'), true) || in_array($lastTwoLetters, array('nä', 'tä'), true)) { $this->word = $word; $this->r1(); @@ -174,9 +169,9 @@ private function step2() // en // delete if preceded by one of lle ine if (($position = $this->searchIfInR1(array('en'))) !== false) { - $word = Utf8::substr($this->word, 0, $position); - if (Utf8::strlen($this->word) > 4) { - $lastThreeLetters = Utf8::substr($this->word, -5, 3); + $word = StringHelper::substr($this->word, 0, $position); + if (StringHelper::strlen($this->word) > 4) { + $lastThreeLetters = StringHelper::substr($this->word, -5, 3); if (in_array($lastThreeLetters, array('lle', 'ine'), true)) { $this->word = $word; $this->r1(); @@ -204,9 +199,9 @@ private function step3() continue; } if (($position = $this->searchIfInR1(array('h' . $vowel . 'n'))) !== false) { - $lastLetter = Utf8::substr($this->word, $position-1, 1); + $lastLetter = StringHelper::substr($this->word, $position-1, 1); if ($lastLetter === $vowel) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -218,11 +213,11 @@ private function step3() // siin den tten // delete if preceded by Vi if (($position = $this->searchIfInR1(array('siin', 'den', 'tten'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if ($lastLetter === 'i') { - $nextLastLetter = Utf8::substr($this->word, ($position-2), 1); + $nextLastLetter = StringHelper::substr($this->word, ($position-2), 1); if (in_array($nextLastLetter, self::$restrictedVowels, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -234,10 +229,10 @@ private function step3() // seen // delete if preceded by LV if (($position = $this->searchIfInR1(array('seen'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position-2), 2); if (in_array($lastLetters, self::$longVowels, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -248,10 +243,10 @@ private function step3() // tta ttä // delete if preceded by e if (($position = $this->searchIfInR1(array('tta', 'ttä'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if ($lastLetter === 'e') { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -262,7 +257,7 @@ private function step3() // ta tä ssa ssä sta stä lla llä lta ltä lle na nä ksi ine // delete if (($position = $this->searchIfInR1(array('ssa', 'ssä', 'sta', 'stä', 'lla', 'llä', 'lta', 'ltä', 'lle', 'ksi', 'na', 'nä', 'ine', 'ta', 'tä'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -272,11 +267,11 @@ private function step3() // a ä // delete if preceded by cv if (($position = $this->searchIfInR1(array('a', 'ä'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); - $nextLastLetter = Utf8::substr($this->word, ($position-2), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); + $nextLastLetter = StringHelper::substr($this->word, ($position-2), 1); if (in_array($lastLetter, self::$vowels, true) && in_array($nextLastLetter, self::$consonants, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->_removedInStep3 = true; $this->r1(); $this->r2(); @@ -287,12 +282,12 @@ private function step3() // n // delete, and if preceded by LV or ie, delete the last vowel if (($position = $this->searchIfInR1(array('n'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position-2), 2); if (in_array($lastLetters, self::$longVowels, true) || $lastLetters === 'ie') { - $this->word = Utf8::substr($this->word, 0, $position-1); + $this->word = StringHelper::substr($this->word, 0, $position-1); } else { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } $this->r1(); $this->r2(); @@ -314,9 +309,9 @@ private function step4() // mpi mpa mpä mmi mma mmä // delete if not preceded by po if (($position = $this->searchIfInR2(array('mpi', 'mpa', 'mpä', 'mmi', 'mma', 'mmä'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position-2), 2); if ($lastLetters !== 'po') { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -326,7 +321,7 @@ private function step4() // impi impa impä immi imma immä eja ejä // delete if (($position = $this->searchIfInR2(array('impi', 'impa', 'impä', 'immi', 'imma', 'immä', 'eja', 'ejä'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; @@ -347,27 +342,27 @@ private function step5() { if ($this->_removedInStep3) { if (($position = $this->searchIfInR1(array('i', 'j'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); return true; } } else { if (($position = $this->searchIfInR1(array('t'))) !== false) { - $lastLetter = Utf8::substr($this->word, ($position-1), 1); + $lastLetter = StringHelper::substr($this->word, ($position-1), 1); if (in_array($lastLetter, self::$vowels, true)) { - $this->word = Utf8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->r1(); $this->r2(); if (($position2 = $this->searchIfInR2(array('imma'))) !== false) { - $this->word = Utf8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); $this->r1(); $this->r2(); return true; } elseif (($position2 = $this->searchIfInR2(array('mma'))) !== false) { - $lastLetters = Utf8::substr($this->word, ($position2-2), 2); + $lastLetters = StringHelper::substr($this->word, ($position2-2), 2); if ($lastLetters !== 'po') { - $this->word = Utf8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); $this->r1(); $this->r2(); return true; @@ -390,35 +385,35 @@ private function step6() // a) If R1 ends LV // delete the last letter if (($position = $this->searchIfInR1(self::$longVowels)) !== false) { - $this->word = Utf8::substr($this->word, 0, $position+1); + $this->word = StringHelper::substr($this->word, 0, $position+1); $this->r1(); $this->r2(); } // b) If R1 ends cX, c a consonant and X one of a ä e i, // delete the last letter - $lastLetter = Utf8::substr($this->r1, -1, 1); - $secondToLastLetter = Utf8::substr($this->r1, -2, 1); + $lastLetter = StringHelper::substr($this->r1, -1, 1); + $secondToLastLetter = StringHelper::substr($this->r1, -2, 1); if (in_array($secondToLastLetter, self::$consonants, true) && in_array($lastLetter, array('a', 'e', 'i', 'ä'))) { - $this->word = Utf8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); $this->r1(); $this->r2(); } // c) If R1 ends oj or uj // delete the last letter - $twoLastLetters = Utf8::substr($this->r1, -2, 2); + $twoLastLetters = StringHelper::substr($this->r1, -2, 2); if (in_array($twoLastLetters, array('oj', 'uj'))) { - $this->word = Utf8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); $this->r1(); $this->r2(); } // d) If R1 ends jo // delete the last letter - $twoLastLetters = Utf8::substr($this->r1, -2, 2); + $twoLastLetters = StringHelper::substr($this->r1, -2, 2); if ($twoLastLetters === 'jo') { - $this->word = Utf8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); $this->r1(); $this->r2(); } @@ -427,15 +422,15 @@ private function step6() // vowels, remove the last consonant (so eläkk -> eläk, // aatonaatto -> aatonaato) $endVowels = ''; - for ($i = Utf8::strlen($this->word) - 1; $i > 0; $i--) { - $letter = Utf8::substr($this->word, $i, 1); + for ($i = StringHelper::strlen($this->word) - 1; $i > 0; $i--) { + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels, true)) { $endVowels = $letter . $endVowels; } else { // check for double consonant - $prevLetter = Utf8::substr($this->word, $i-1, 1); + $prevLetter = StringHelper::substr($this->word, $i-1, 1); if ($prevLetter === $letter) { - $this->word = Utf8::substr($this->word, 0, $i) . $endVowels; + $this->word = StringHelper::substr($this->word, 0, $i) . $endVowels; } break; } diff --git a/src/Stemmer/French.php b/src/Stemmer/French.php index 8e1ee96..2bc53ca 100644 --- a/src/Stemmer/French.php +++ b/src/Stemmer/French.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class French extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); $this->plainVowels = implode('', self::$vowels); @@ -96,7 +91,7 @@ private function step1() // delete if in R2 if ( ($position = $this->search(array('ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe','isme', 'able', 'iste', 'eux'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return 3; } @@ -106,10 +101,10 @@ private function step1() // if preceded by ic, delete if in R2, else replace by iqU if ( ($position = $this->search(array('atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('ic'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } @@ -150,9 +145,9 @@ private function step1() if ( ($position = $this->search(array('issements', 'issement'))) != false) { if ($this->inR1($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (! in_array($letter, self::$vowels)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } return 3; @@ -168,20 +163,20 @@ private function step1() // delete if in RV if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise, } elseif ( ($position = $this->search(array('eus'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { $this->word = preg_replace('#(eus)$#u', 'eux', $this->word); @@ -189,7 +184,7 @@ private function step1() // if preceded by abl or iqU, delete if in R2, otherwise, } elseif ( ($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); // if preceded by ièr or Ièr, replace by i if in RV } elseif ( ($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) { @@ -207,13 +202,13 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by abil, delete if in R2, else replace by abl, otherwise, if ( ($position = $this->search(array('abil'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } else { $this->word = preg_replace('#(abil)$#u', 'abl', $this->word); } @@ -221,14 +216,14 @@ private function step1() // if preceded by ic, delete if in R2, else replace by iqU, otherwise, } elseif ( ($position = $this->search(array('ic'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } // if preceded by iv, delete if in R2 } elseif ( ($position = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return 3; @@ -240,15 +235,15 @@ private function step1() if ( ($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('ic'))) !== false) { if ($this->inR2($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } else { $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); } @@ -278,7 +273,7 @@ private function step1() // delete if in R2, else replace by eux if in R1 if ( ($position = $this->search(array('euses', 'euse'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } elseif ($this->inR1($position)) { $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word); @@ -309,9 +304,9 @@ private function step1() // delete if preceded by a vowel in RV if ( ($position = $this->search(array('ments', 'ment'))) != false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && (in_array($letter, self::$vowels)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return 2; @@ -337,9 +332,9 @@ private function step2a() 'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && (!in_array($letter, self::$vowels)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -360,7 +355,7 @@ private function step2b() 'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez', 'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -373,12 +368,12 @@ private function step2b() 'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && ($letter == 'e') ) { - $this->word = UTF8::substr($this->word, 0, $before); + $this->word = StringHelper::substr($this->word, 0, $before); } else { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -388,7 +383,7 @@ private function step2b() // delete if in R2 if ( ($position = $this->searchIfInRv(array('ions'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; @@ -413,7 +408,7 @@ private function step4() { //If the word ends s, not preceded by a, i, o, u, è or s, delete it. if (preg_match('#[^aiouès]s$#', $this->word)) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } // In the rest of step 4, all tests are confined to the RV region. @@ -421,9 +416,9 @@ private function step4() // delete if in R2 and preceded by s or t if ( (($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position)) ) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( $this->inRv($before) && (($letter == 's') || ($letter == 't')) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -438,7 +433,7 @@ private function step4() // e // delete if ( ($this->searchIfInRv(array('e'))) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } @@ -446,7 +441,7 @@ private function step4() // if preceded by gu, delete if ( ($position = $this->searchIfInRv(array('guë'))) !== false) { if ($this->inRv($position+2)) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } } @@ -461,7 +456,7 @@ private function step4() private function step5() { if ($this->search(array('enn', 'onn', 'ett', 'ell', 'eill')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -480,7 +475,7 @@ private function step6() */ private function finish() { - $this->word = UTF8::str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); + $this->word = str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); } /** @@ -491,7 +486,7 @@ private function finish() */ protected function rv() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; @@ -501,28 +496,28 @@ protected function rv() } // If the word begins with two vowels, RV is the region after the third letter - $first = UTF8::substr($this->word, 0, 1); - $second = UTF8::substr($this->word, 1, 1); + $first = StringHelper::substr($this->word, 0, 1); + $second = StringHelper::substr($this->word, 1, 1); if ( (in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) ) { - $this->rv = UTF8::substr($this->word, 3); + $this->rv = StringHelper::substr($this->word, 3); $this->rvIndex = 3; return true; } // (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.) - $begin3 = UTF8::substr($this->word, 0, 3); + $begin3 = StringHelper::substr($this->word, 0, 3); if (in_array($begin3, array('par', 'col', 'tap'))) { - $this->rv = UTF8::substr($this->word, 3); + $this->rv = StringHelper::substr($this->word, 3); $this->rvIndex = 3; return true; } // otherwise the region after the first vowel not at the beginning of the word, for ($i=1; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->rv = UTF8::substr($this->word, ($i + 1)); + $this->rv = StringHelper::substr($this->word, ($i + 1)); $this->rvIndex = $i + 1; return true; } diff --git a/src/Stemmer/German.php b/src/Stemmer/German.php index 4dc81a3..11dc733 100644 --- a/src/Stemmer/German.php +++ b/src/Stemmer/German.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -26,17 +26,12 @@ class German extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - $this->plainVowels = implode('', self::$vowels); - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // First, replace ß by ss - $this->word = UTF8::str_replace('ß', 'ss', $this->word); + $this->word = str_replace('ß', 'ss', $this->word); // put u and y between vowels into upper case $this->word = preg_replace('#(['.$this->plainVowels.'])y(['.$this->plainVowels.'])#u', '$1Y$2', $this->word); @@ -49,7 +44,7 @@ public function stem($word) // but then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } $this->step1(); @@ -68,7 +63,7 @@ private function step1() // delete if in R1 if ( ($position = $this->search(array('em', 'ern', 'er'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -76,11 +71,11 @@ private function step1() // delete if in R1 if ( ($position = $this->search(array('es', 'en', 'e'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); //If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s if ($this->search(array('niss')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } return true; @@ -90,10 +85,10 @@ private function step1() if ( ($position = $this->search(array('s'))) !== false) { if ($this->inR1($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (in_array($letter, self::$sEndings)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } return true; @@ -111,7 +106,7 @@ private function step2() // delete if in R1 if ( ($position = $this->search(array('en', 'er', 'est'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -122,10 +117,10 @@ private function step2() if ($this->inR1($position)) { $before = $position - 1; if ($before >= 3) { - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if (in_array($letter, self::$stEndings)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } } @@ -144,15 +139,15 @@ private function step3() // if preceded by ig, delete if in R2 and not preceded by e if ( ($position = $this->search(array('end', 'ung'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('ig'))) !== false) { $before = $position2 - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( ($this->inR2($position2)) && ($letter != 'e') ) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } return true; @@ -162,10 +157,10 @@ private function step3() // delete if in R2 and not preceded by e if ( ($position = $this->search(array('ig', 'ik', 'isch'))) !== false) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( ($this->inR2($position)) && ($letter != 'e') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -175,12 +170,12 @@ private function step3() // if preceded by er or en, delete if in R1 if ( ($position = $this->search(array('lich', 'heit'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('er', 'en'))) !== false) { if ($this->inR1($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } return true; @@ -191,12 +186,12 @@ private function step3() // if preceded by lich or ig, delete if in R2 if ( ($position = $this->search(array('keit'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->search(array('lich', 'ig'))) !== false) { if ($this->inR2($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } return true; @@ -211,6 +206,6 @@ private function step3() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = UTF8::str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); + $this->word = str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); } } diff --git a/src/Stemmer/Italian.php b/src/Stemmer/Italian.php index bb09dee..4bb2004 100644 --- a/src/Stemmer/Italian.php +++ b/src/Stemmer/Italian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,17 +22,12 @@ class Italian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - $this->plainVowels = implode('', self::$vowels); - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // First, replace all acute accents by grave accents. - $this->word = UTF8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); + $this->word = str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); //And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.) The vowels are then $this->word = preg_replace('#([q])u#u', '$1U', $this->word); @@ -72,7 +67,7 @@ private function step0() 'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene', 'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'))) !== false) { - $suffixe = UTF8::substr($this->word, $position); + $suffixe = StringHelper::substr($this->word, $position); // following one of (in RV) // a @@ -82,7 +77,7 @@ private function step0() }, $a); // In case of (a) the suffix is deleted if ($this->searchIfInRv($a) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } //b @@ -112,19 +107,19 @@ private function step1() // if preceded by os, ic or abil, delete if in R2 if ( ($position = $this->search(array('amente'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position4); + $this->word = StringHelper::substr($this->word, 0, $position4); } return true; } @@ -137,7 +132,7 @@ private function step1() ))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -147,11 +142,11 @@ private function step1() // if preceded by ic, delete if in R2 if ( ($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('ic'))) !== false) { if ($this->inR2($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } } } @@ -189,7 +184,7 @@ private function step1() // delete if in RV if ( ($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -199,11 +194,11 @@ private function step1() // if preceded by abil, ic or iv, delete if in R2 if ( ($position = $this->search(array('ità'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -213,13 +208,13 @@ private function step1() // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2) if ( ($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('ic'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } } return true; @@ -243,7 +238,7 @@ private function step2() 'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva', 'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -254,10 +249,10 @@ private function step2() private function step3a() { if ($this->searchIfInRv(array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); if ($this->searchIfInRv(array('i')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } return true; } @@ -284,6 +279,6 @@ private function step3b() */ private function finish() { - $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Norwegian.php b/src/Stemmer/Norwegian.php index b44b722..627a578 100644 --- a/src/Stemmer/Norwegian.php +++ b/src/Stemmer/Norwegian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Norwegian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -35,7 +30,7 @@ public function stem($word) // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -56,12 +51,12 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); if (in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z'))) { return true; } if ($lastLetter == 'k') { - $beforeLetter = UTF8::substr($word, -2, 1); + $beforeLetter = StringHelper::substr($word, -2, 1); if (!in_array($beforeLetter, self::$vowels)) { return true; } @@ -88,14 +83,14 @@ private function step1() 'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane', 'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -110,7 +105,7 @@ private function step1() private function step2() { if ($this->searchIfInR1(array('dt', 'vt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -124,7 +119,7 @@ private function step3() if ( ($position = $this->searchIfInR1(array( 'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } } diff --git a/src/Stemmer/Portuguese.php b/src/Stemmer/Portuguese.php index c71cc59..c5f3aae 100644 --- a/src/Stemmer/Portuguese.php +++ b/src/Stemmer/Portuguese.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,14 +22,9 @@ class Portuguese extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); - $this->word = UTF8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); + $this->word = str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); $this->rv(); $this->r1(); @@ -66,7 +61,7 @@ private function step1() 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -106,19 +101,19 @@ private function step1() // delete if in R1 if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position4); + $this->word = StringHelper::substr($this->word, 0, $position4); } return true; } @@ -130,12 +125,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by ante, avel or ível, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('ante', 'avel', 'ível'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -147,12 +142,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by abil, ic or iv, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -164,12 +159,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by at, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -180,7 +175,7 @@ private function step1() if ($this->inRv($position)) { $before = $position -1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'e') { $this->word = preg_replace('#(iras|ira)$#u', 'ir', $this->word); @@ -213,7 +208,7 @@ private function step2() 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou', ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } return false; @@ -227,10 +222,10 @@ private function step3() { // Delete suffix i if in RV and preceded by c if ($this->searchIfInRv(array('i')) !== false) { - $letter = UTF8::substr($this->word, -2, 1); + $letter = StringHelper::substr($this->word, -2, 1); if ($letter == 'c') { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } return true; } @@ -244,7 +239,7 @@ private function step4() { // If the word ends with one of the suffixes "os a i o á í ó" in RV, delete it if ( ($position = $this->searchIfInRv(array('os', 'a', 'i', 'o','á', 'í', 'ó'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } return false; @@ -257,11 +252,11 @@ private function step5() { // If the word ends with one of "e é ê" in RV, delete it, and if preceded by gu (or ci) with the u (or i) in RV, delete the u (or i). if ($this->searchIfInRv(array('e', 'é', 'ê')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); if ( ($position2 = $this->search(array('gu', 'ci'))) !== false) { if ($this->inRv(($position2+1))) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } return true; @@ -278,6 +273,6 @@ private function step5() private function finish() { // turn U and Y back into lower case, and remove the umlaut accent from a, o and u. - $this->word = UTF8::str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); + $this->word = str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); } } diff --git a/src/Stemmer/Romanian.php b/src/Stemmer/Romanian.php index 5da8744..87047dc 100644 --- a/src/Stemmer/Romanian.php +++ b/src/Stemmer/Romanian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Romanian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); $this->plainVowels = implode('', self::$vowels); @@ -73,7 +68,7 @@ private function step0() // delete if ( ($position = $this->search(array('ul', 'ului'))) !== false) { if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -109,7 +104,7 @@ private function step0() // replace with i if not preceded by ab if ( ($position = $this->search(array('ile'))) !== false) { if ($this->inR1($position)) { - $before = UTF8::substr($this->word, ($position-2), 2); + $before = StringHelper::substr($this->word, ($position-2), 2); if ($before != 'ab') { $this->word = preg_replace('#(ile)$#u', 'i', $this->word); @@ -226,7 +221,7 @@ private function step2() 'at', 'os', 'iv', 'ut', 'it', 'ic' ))) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -236,9 +231,9 @@ private function step2() if ( ($position = $this->search(array('iune', 'iuni'))) !== false) { if ($this->inR2($position)) { $before = $position - 1; - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ($letter == 'ţ') { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); $this->word = preg_replace('#(ţ)$#u', 't', $this->word); } } @@ -282,10 +277,10 @@ private function step3() if ($this->inRv($position)) { $before = $position - 1; if ($this->inRv($before)) { - $letter = UTF8::substr($this->word, $before, 1); + $letter = StringHelper::substr($this->word, $before, 1); if ( (!in_array($letter, self::$vowels)) || ($letter == 'u') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } } @@ -301,7 +296,7 @@ private function step3() 'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im' ))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -315,7 +310,7 @@ private function step4() // Search for the longest among the suffixes "a e i ie ă " and, if it is in RV, delete it. if ( ($position = $this->search(array('a', 'ie', 'e', 'i', 'ă'))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -329,6 +324,6 @@ private function step4() private function finish() { // Turn I, U back into i, u - $this->word = UTF8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word); } } diff --git a/src/Stemmer/Russian.php b/src/Stemmer/Russian.php index cd18dbf..3949a45 100644 --- a/src/Stemmer/Russian.php +++ b/src/Stemmer/Russian.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -56,12 +56,7 @@ class Russian extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -88,7 +83,7 @@ private function step1() // group 1 if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[0])) !== false) { if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -96,7 +91,7 @@ private function step1() // group 2 if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[1])) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -104,7 +99,7 @@ private function step1() // Otherwise try and remove a REFLEXIVE ending if ( ($position = $this->searchIfInRv(self::$reflexive)) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } } @@ -112,18 +107,18 @@ private function step1() // As soon as one of the endings (1) to (3) is found remove it, and terminate step 1. if ( ($position = $this->searchIfInRv(self::$adjective)) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(self::$participle[0])) !== false) { if ( ($this->inRv($position2)) && ($this->checkGroup1($position2)) ) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); return true; } } if ( ($position2 = $this->search(self::$participle[1])) !== false) { if ($this->inRv($position2)) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); return true; } } @@ -134,21 +129,21 @@ private function step1() if ( ($position = $this->searchIfInRv(self::$verb[0])) !== false) { if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } if ( ($position = $this->searchIfInRv(self::$verb[1])) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } if ( ($position = $this->searchIfInRv(self::$noun)) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -163,7 +158,7 @@ private function step2() { if ( ($position = $this->searchIfInRv(array('и'))) !== false) { if ($this->inRv($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -178,7 +173,7 @@ private function step3() { if ( ($position = $this->searchIfInRv(self::$derivational)) !== false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -192,18 +187,18 @@ private function step4() { // (2) if the word ends with a SUPERLATIVE ending, remove it if ( ($position = $this->searchIfInRv(self::$superlative)) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // (1) Undouble н (n) if ( ($position = $this->searchIfInRv(array('нн'))) !== false) { - $this->word = UTF8::substr($this->word, 0, ($position+1)); + $this->word = StringHelper::substr($this->word, 0, ($position+1)); return true; } // (3) if the word ends ь (') (soft sign) remove it if ( ($position = $this->searchIfInRv(array('ь'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -213,15 +208,15 @@ private function step4() */ protected function rv() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; for ($i=0; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, self::$vowels)) { - $this->rv = UTF8::substr($this->word, ($i+1)); + $this->rv = StringHelper::substr($this->word, ($i+1)); $this->rvIndex = $i + 1; return true; } @@ -242,7 +237,7 @@ private function checkGroup1($position) return false; } - $letter = UTF8::substr($this->word, ($position - 1), 1); + $letter = StringHelper::substr($this->word, ($position - 1), 1); if ($letter == 'а' || $letter == 'я') { return true; diff --git a/src/Stemmer/Spanish.php b/src/Stemmer/Spanish.php index 4f6f2c8..b83c040 100644 --- a/src/Stemmer/Spanish.php +++ b/src/Stemmer/Spanish.php @@ -2,7 +2,8 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; +use Wamania\Snowball\Transliterate; /** * @@ -22,12 +23,7 @@ class Spanish extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); $this->rv(); $this->r1(); @@ -71,7 +67,7 @@ public function stem($word) private function step0() { if ( ($position = $this->searchIfInRv(array('selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo' ))) != false) { - $suffixe = UTF8::substr($this->word, $position); + $suffixe = StringHelper::substr($this->word, $position); // a $a = array('iéndo', 'ándo', 'ár', 'ér', 'ír'); @@ -80,11 +76,11 @@ private function step0() }, $a); if ( ($position2 = $this->searchIfInRv($a)) !== false) { - $suffixe2 = UTF8::substr($this->word, $position2); - $suffixe2 = UTF8::to_utf8(UTF8::to_ascii($suffixe2)); // unaccent - $this->word = UTF8::substr($this->word, 0, $position2); + $suffixe2 = StringHelper::substr($this->word, $position2); + $suffixe2 = Transliterate::utf8_latin_to_ascii($suffixe2); // unaccent + $this->word = StringHelper::substr($this->word, 0, $position2); $this->word .= $suffixe2; - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -95,15 +91,15 @@ private function step0() }, $b); if ( ($position2 = $this->searchIfInRv($b)) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // c if ( ($position2 = $this->searchIfInRv(array('yendo' . $suffixe))) != false) { - $before = UTF8::substr($this->word, ($position2-1), 1); + $before = StringHelper::substr($this->word, ($position2-1), 1); if ( (isset($before)) && ($before == 'u') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -125,7 +121,7 @@ private function step1() 'ible', 'ables', 'able', 'ismos', 'ismo', 'icas', 'icos', 'ica', 'ico', 'anzas', 'anza'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } return true; } @@ -137,11 +133,11 @@ private function step1() 'adoras', 'adora', 'aciones', 'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia'))) != false) { if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } if ( ($position2 = $this->searchIfInR2(array('ic')))) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -181,19 +177,19 @@ private function step1() // delete if in R1 if ($this->inR1($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position3); + $this->word = StringHelper::substr($this->word, 0, $position3); } // if preceded by os, ic or ad, delete if in R2 } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position4); + $this->word = StringHelper::substr($this->word, 0, $position4); } return true; } @@ -205,12 +201,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by ante, able or ible, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('ante', 'able', 'ible'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -222,12 +218,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by abil, ic or iv, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -239,12 +235,12 @@ private function step1() // delete if in R2 if ($this->inR2($position)) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); } // if preceded by at, delete if in R2 if ( ($position2 = $this->searchIfInR2(array('at'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); } return true; } @@ -262,9 +258,9 @@ private function step2a() if ( ($position = $this->searchIfInRv(array( 'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'))) != false) { - $before = UTF8::substr($this->word, ($position-1), 1); + $before = StringHelper::substr($this->word, ($position-1), 1); if ( (isset($before)) && ($before == 'u') ) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } } @@ -289,17 +285,17 @@ private function step2b() 'aré', 'erá', 'eré', 'áis', 'ías', 'irá', 'iré', 'aba', 'ían', 'ada', 'ara', 'ase', 'ida', 'ado', 'ido', 'ará', 'ad', 'ed', 'id', 'ís', 'ió', 'ar', 'er', 'ir', 'as', 'ía', 'an' ))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // en es éis emos // delete, and if preceded by gu delete the u (the gu need not be in RV) if ( ($position = $this->searchIfInRv(array('éis', 'emos', 'en', 'es'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->search(array('gu'))) != false) { - $this->word = UTF8::substr($this->word, 0, ($position2+1)); + $this->word = StringHelper::substr($this->word, 0, ($position2+1)); } @@ -316,19 +312,19 @@ private function step3() // os a o á í ó // delete if in RV if ( ($position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // e é // delete if in RV, and if preceded by gu with the u in RV delete the u if ( ($position = $this->searchIfInRv(array('e', 'é'))) != false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); if ( ($position2 = $this->searchIfInRv(array('u'))) != false) { - $before = UTF8::substr($this->word, ($position2-1), 1); + $before = StringHelper::substr($this->word, ($position2-1), 1); if ( (isset($before)) && ($before == 'g') ) { - $this->word = UTF8::substr($this->word, 0, $position2); + $this->word = StringHelper::substr($this->word, 0, $position2); return true; } } @@ -343,6 +339,6 @@ private function step3() */ private function finish() { - $this->word = UTF8::str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); + $this->word = str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); } } diff --git a/src/Stemmer/Stem.php b/src/Stemmer/Stem.php index 0c6f148..1ce7274 100644 --- a/src/Stemmer/Stem.php +++ b/src/Stemmer/Stem.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; abstract class Stem implements Stemmer { @@ -94,12 +94,12 @@ protected function searchIfInR2($suffixes) protected function search($suffixes, $offset = 0) { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); if ($offset > $length) { return false; } foreach ($suffixes as $suffixe) { - if ( (($position = UTF8::strrpos($this->word, $suffixe, $offset)) !== false) && ((Utf8::strlen($suffixe)+$position) == $length) ) { + if ( (($position = StringHelper::strrpos($this->word, $suffixe, $offset)) !== false) && ((StringHelper::strlen($suffixe)+$position) == $length) ) { return $position; } } @@ -134,7 +134,7 @@ protected function r2() */ protected function rx($in) { - $length = UTF8::strlen($in); + $length = StringHelper::strlen($in); // defaults $value = ''; @@ -143,7 +143,7 @@ protected function rx($in) // we search all vowels $vowels = array(); for ($i=0; $i<$length; $i++) { - $letter = UTF8::substr($in, $i, 1); + $letter = StringHelper::substr($in, $i, 1); if (in_array($letter, static::$vowels)) { $vowels[] = $i; } @@ -152,11 +152,11 @@ protected function rx($in) // search the non-vowel following a vowel foreach ($vowels as $position) { $after = $position + 1; - $letter = UTF8::substr($in, $after, 1); + $letter = StringHelper::substr($in, $after, 1); if (! in_array($letter, static::$vowels)) { $index = $after + 1; - $value = UTF8::substr($in, ($after+1)); + $value = StringHelper::substr($in, ($after+1)); break; } @@ -175,7 +175,7 @@ protected function rx($in) */ protected function rv() { - $length = UTF8::strlen($this->word); + $length = StringHelper::strlen($this->word); $this->rv = ''; $this->rvIndex = $length; @@ -184,16 +184,16 @@ protected function rv() return true; } - $first = UTF8::substr($this->word, 0, 1); - $second = UTF8::substr($this->word, 1, 1); + $first = StringHelper::substr($this->word, 0, 1); + $second = StringHelper::substr($this->word, 1, 1); // If the second letter is a consonant, RV is the region after the next following vowel, if (!in_array($second, static::$vowels)) { for ($i=2; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (in_array($letter, static::$vowels)) { $this->rvIndex = $i + 1; - $this->rv = UTF8::substr($this->word, ($i+1)); + $this->rv = StringHelper::substr($this->word, ($i+1)); return true; } } @@ -202,10 +202,10 @@ protected function rv() // or if the first two letters are vowels, RV is the region after the next consonant, if ( (in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { for ($i=2; $i<$length; $i++) { - $letter = UTF8::substr($this->word, $i, 1); + $letter = StringHelper::substr($this->word, $i, 1); if (! in_array($letter, static::$vowels)) { $this->rvIndex = $i + 1; - $this->rv = UTF8::substr($this->word, ($i+1)); + $this->rv = StringHelper::substr($this->word, ($i+1)); return true; } } @@ -213,7 +213,7 @@ protected function rv() // and otherwise (consonant-vowel case) RV is the region after the third letter. if ( (! in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { - $this->rv = UTF8::substr($this->word, 3); + $this->rv = StringHelper::substr($this->word, 3); $this->rvIndex = 3; return true; } diff --git a/src/Stemmer/Swedish.php b/src/Stemmer/Swedish.php index 32352ef..ed8103c 100644 --- a/src/Stemmer/Swedish.php +++ b/src/Stemmer/Swedish.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball\Stemmer; -use voku\helper\UTF8; +use Joomla\String\StringHelper; /** * @@ -22,12 +22,7 @@ class Swedish extends Stem */ public function stem($word) { - // we do ALL in UTF-8 - if (!UTF8::is_utf8($word)) { - throw new \Exception('Word must be in UTF-8'); - } - - $this->word = UTF8::strtolower($word); + $this->word = StringHelper::strtolower($word); // R2 is not used: R1 is defined in the same way as in the German stemmer $this->r1(); @@ -35,7 +30,7 @@ public function stem($word) // then R1 is adjusted so that the region before it contains at least 3 letters. if ($this->r1Index < 3) { $this->r1Index = 3; - $this->r1 = UTF8::substr($this->word, 3); + $this->r1 = StringHelper::substr($this->word, 3); } // Do each of steps 1, 2 3 and 4. @@ -55,7 +50,7 @@ public function stem($word) */ private function hasValidSEnding($word) { - $lastLetter = UTF8::substr($word, -1, 1); + $lastLetter = StringHelper::substr($word, -1, 1); return in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y')); } @@ -74,14 +69,14 @@ private function step1() 'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het', 'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e' ))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // s // delete if preceded by a valid s-ending if ( ($position = $this->searchIfInR1(array('s'))) !== false) { - $word = UTF8::substr($this->word, 0, $position); + $word = StringHelper::substr($this->word, 0, $position); if ($this->hasValidSEnding($word)) { $this->word = $word; } @@ -96,7 +91,7 @@ private function step2() { // dd gd nn dt gt kt tt if ($this->searchIfInR1(array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt')) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); } } @@ -109,21 +104,21 @@ private function step3() // lig ig els // delete if ( ($position = $this->searchIfInR1(array('lig', 'ig', 'els'))) !== false) { - $this->word = UTF8::substr($this->word, 0, $position); + $this->word = StringHelper::substr($this->word, 0, $position); return true; } // löst // replace with lös if ( ($this->searchIfInR1(array('löst'))) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } // fullt // replace with full if ( ($this->searchIfInR1(array('fullt'))) !== false) { - $this->word = UTF8::substr($this->word, 0, -1); + $this->word = StringHelper::substr($this->word, 0, -1); return true; } } diff --git a/src/StemmerFactory.php b/src/StemmerFactory.php index d60a8c6..b8c487a 100644 --- a/src/StemmerFactory.php +++ b/src/StemmerFactory.php @@ -2,7 +2,7 @@ namespace Wamania\Snowball; -use voku\helper\UTF8; +use Joomla\String\StringHelper; use Wamania\Snowball\Stemmer\Catalan; use Wamania\Snowball\Stemmer\Danish; use Wamania\Snowball\Stemmer\Dutch; @@ -43,7 +43,7 @@ class StemmerFactory */ public static function create(string $code): Stemmer { - $code = UTF8::strtolower($code); + $code = StringHelper::strtolower($code); foreach (self::LANGS as $classname => $isoCodes) { if (in_array($code, $isoCodes)) { diff --git a/src/Transliterate.php b/src/Transliterate.php new file mode 100644 index 0000000..3399f6b --- /dev/null +++ b/src/Transliterate.php @@ -0,0 +1,253 @@ + 'a', + 'ô' => 'o', + 'ď' => 'd', + 'ḟ' => 'f', + 'ë' => 'e', + 'š' => 's', + 'ơ' => 'o', + 'ß' => 'ss', + 'ă' => 'a', + 'ř' => 'r', + 'ț' => 't', + 'ň' => 'n', + 'ā' => 'a', + 'ķ' => 'k', + 'ŝ' => 's', + 'ỳ' => 'y', + 'ņ' => 'n', + 'ĺ' => 'l', + 'ħ' => 'h', + 'ṗ' => 'p', + 'ó' => 'o', + 'ú' => 'u', + 'ě' => 'e', + 'é' => 'e', + 'ç' => 'c', + 'ẁ' => 'w', + 'ċ' => 'c', + 'õ' => 'o', + 'ṡ' => 's', + 'ø' => 'o', + 'ģ' => 'g', + 'ŧ' => 't', + 'ș' => 's', + 'ė' => 'e', + 'ĉ' => 'c', + 'ś' => 's', + 'î' => 'i', + 'ű' => 'u', + 'ć' => 'c', + 'ę' => 'e', + 'ŵ' => 'w', + 'ṫ' => 't', + 'ū' => 'u', + 'č' => 'c', + 'ö' => 'oe', + 'è' => 'e', + 'ŷ' => 'y', + 'ą' => 'a', + 'ł' => 'l', + 'ų' => 'u', + 'ů' => 'u', + 'ş' => 's', + 'ğ' => 'g', + 'ļ' => 'l', + 'ƒ' => 'f', + 'ž' => 'z', + 'ẃ' => 'w', + 'ḃ' => 'b', + 'å' => 'a', + 'ì' => 'i', + 'ï' => 'i', + 'ḋ' => 'd', + 'ť' => 't', + 'ŗ' => 'r', + 'ä' => 'ae', + 'í' => 'i', + 'ŕ' => 'r', + 'ê' => 'e', + 'ü' => 'ue', + 'ò' => 'o', + 'ē' => 'e', + 'ñ' => 'n', + 'ń' => 'n', + 'ĥ' => 'h', + 'ĝ' => 'g', + 'đ' => 'd', + 'ĵ' => 'j', + 'ÿ' => 'y', + 'ũ' => 'u', + 'ŭ' => 'u', + 'ư' => 'u', + 'ţ' => 't', + 'ý' => 'y', + 'ő' => 'o', + 'â' => 'a', + 'ľ' => 'l', + 'ẅ' => 'w', + 'ż' => 'z', + 'ī' => 'i', + 'ã' => 'a', + 'ġ' => 'g', + 'ṁ' => 'm', + 'ō' => 'o', + 'ĩ' => 'i', + 'ù' => 'u', + 'į' => 'i', + 'ź' => 'z', + 'á' => 'a', + 'û' => 'u', + 'þ' => 'th', + 'ð' => 'dh', + 'æ' => 'ae', + 'µ' => 'u', + 'ĕ' => 'e', + 'œ' => 'oe', + ]; + } + + $string = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $string); + } + + if ($case >= 0) { + if (\is_null($UTF8_UPPER_ACCENTS)) { + $UTF8_UPPER_ACCENTS = [ + 'À' => 'A', + 'Ô' => 'O', + 'Ď' => 'D', + 'Ḟ' => 'F', + 'Ë' => 'E', + 'Š' => 'S', + 'Ơ' => 'O', + 'Ă' => 'A', + 'Ř' => 'R', + 'Ț' => 'T', + 'Ň' => 'N', + 'Ā' => 'A', + 'Ķ' => 'K', + 'Ŝ' => 'S', + 'Ỳ' => 'Y', + 'Ņ' => 'N', + 'Ĺ' => 'L', + 'Ħ' => 'H', + 'Ṗ' => 'P', + 'Ó' => 'O', + 'Ú' => 'U', + 'Ě' => 'E', + 'É' => 'E', + 'Ç' => 'C', + 'Ẁ' => 'W', + 'Ċ' => 'C', + 'Õ' => 'O', + 'Ṡ' => 'S', + 'Ø' => 'O', + 'Ģ' => 'G', + 'Ŧ' => 'T', + 'Ș' => 'S', + 'Ė' => 'E', + 'Ĉ' => 'C', + 'Ś' => 'S', + 'Î' => 'I', + 'Ű' => 'U', + 'Ć' => 'C', + 'Ę' => 'E', + 'Ŵ' => 'W', + 'Ṫ' => 'T', + 'Ū' => 'U', + 'Č' => 'C', + 'Ö' => 'Oe', + 'È' => 'E', + 'Ŷ' => 'Y', + 'Ą' => 'A', + 'Ł' => 'L', + 'Ų' => 'U', + 'Ů' => 'U', + 'Ş' => 'S', + 'Ğ' => 'G', + 'Ļ' => 'L', + 'Ƒ' => 'F', + 'Ž' => 'Z', + 'Ẃ' => 'W', + 'Ḃ' => 'B', + 'Å' => 'A', + 'Ì' => 'I', + 'Ï' => 'I', + 'Ḋ' => 'D', + 'Ť' => 'T', + 'Ŗ' => 'R', + 'Ä' => 'Ae', + 'Í' => 'I', + 'Ŕ' => 'R', + 'Ê' => 'E', + 'Ü' => 'Ue', + 'Ò' => 'O', + 'Ē' => 'E', + 'Ñ' => 'N', + 'Ń' => 'N', + 'Ĥ' => 'H', + 'Ĝ' => 'G', + 'Đ' => 'D', + 'Ĵ' => 'J', + 'Ÿ' => 'Y', + 'Ũ' => 'U', + 'Ŭ' => 'U', + 'Ư' => 'U', + 'Ţ' => 'T', + 'Ý' => 'Y', + 'Ő' => 'O', + 'Â' => 'A', + 'Ľ' => 'L', + 'Ẅ' => 'W', + 'Ż' => 'Z', + 'Ī' => 'I', + 'Ã' => 'A', + 'Ġ' => 'G', + 'Ṁ' => 'M', + 'Ō' => 'O', + 'Ĩ' => 'I', + 'Ù' => 'U', + 'Į' => 'I', + 'Ź' => 'Z', + 'Á' => 'A', + 'Û' => 'U', + 'Þ' => 'Th', + 'Ð' => 'Dh', + 'Æ' => 'Ae', + 'Ĕ' => 'E', + 'Œ' => 'Oe', + ]; + } + + $string = str_replace(array_keys($UTF8_UPPER_ACCENTS), array_values($UTF8_UPPER_ACCENTS), $string); + } + + return $string; + } +} diff --git a/test/CsvFileIterator.php b/test/CsvFileIterator.php index ddc0b23..0783a43 100644 --- a/test/CsvFileIterator.php +++ b/test/CsvFileIterator.php @@ -19,7 +19,7 @@ public function __destruct() fclose($this->file); } - public function rewind() + public function rewind(): void { rewind($this->file); //$this->current = fgetcsv($this->file, null, "\t"); @@ -32,22 +32,22 @@ public function rewind() $this->key = 0; } - public function valid() + public function valid(): bool { return !feof($this->file); } - public function key() + public function key(): mixed { return $this->key; } - public function current() + public function current(): mixed { return $this->current; } - public function next() + public function next(): void { $line = fgets($this->file); $current = explode(' ', $line); diff --git a/test/CsvFileVerboseIterator.php b/test/CsvFileVerboseIterator.php index 25314b6..3ab02c5 100644 --- a/test/CsvFileVerboseIterator.php +++ b/test/CsvFileVerboseIterator.php @@ -3,13 +3,13 @@ class CsvFileVerboseIterator extends CsvFileIterator { - public function rewind() + public function rewind(): void { parent::rewind(); $this->_updateKey($this->current()); } - public function next() + public function next(): void { parent::next(); if ($this->valid()) { From 1090dac86d5cdd6121966bf64da2796263350a60 Mon Sep 17 00:00:00 2001 From: Hannes Papenberg Date: Wed, 11 Dec 2024 12:14:37 +0100 Subject: [PATCH 2/7] Relaxing dependencies --- composer.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer.json b/composer.json index b01b73f..d58a45c 100644 --- a/composer.json +++ b/composer.json @@ -10,8 +10,8 @@ } ], "require": { - "php": ">=8.1", - "joomla/string": ">=3.0.1" + "php": ">=7.3", + "joomla/string": ">=2.0.1" }, "require-dev":{ "phpunit/phpunit": "^9.0" From 2468aaa5119f004ae750f2bf4395c992f07d042b Mon Sep 17 00:00:00 2001 From: Hannes Papenberg Date: Wed, 11 Dec 2024 12:20:03 +0100 Subject: [PATCH 3/7] Revert changes to test files --- test/CsvFileIterator.php | 10 +++++----- test/CsvFileVerboseIterator.php | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/CsvFileIterator.php b/test/CsvFileIterator.php index 0783a43..ddc0b23 100644 --- a/test/CsvFileIterator.php +++ b/test/CsvFileIterator.php @@ -19,7 +19,7 @@ public function __destruct() fclose($this->file); } - public function rewind(): void + public function rewind() { rewind($this->file); //$this->current = fgetcsv($this->file, null, "\t"); @@ -32,22 +32,22 @@ public function rewind(): void $this->key = 0; } - public function valid(): bool + public function valid() { return !feof($this->file); } - public function key(): mixed + public function key() { return $this->key; } - public function current(): mixed + public function current() { return $this->current; } - public function next(): void + public function next() { $line = fgets($this->file); $current = explode(' ', $line); diff --git a/test/CsvFileVerboseIterator.php b/test/CsvFileVerboseIterator.php index 3ab02c5..25314b6 100644 --- a/test/CsvFileVerboseIterator.php +++ b/test/CsvFileVerboseIterator.php @@ -3,13 +3,13 @@ class CsvFileVerboseIterator extends CsvFileIterator { - public function rewind(): void + public function rewind() { parent::rewind(); $this->_updateKey($this->current()); } - public function next(): void + public function next() { parent::next(); if ($this->valid()) { From ccdd08c75068e9ca6b80d5435d1e1a62e3ce5089 Mon Sep 17 00:00:00 2001 From: Hannes Papenberg Date: Thu, 28 May 2026 11:28:44 +0200 Subject: [PATCH 4/7] Improvements from phpstan scan --- src/Stemmer/Danish.php | 2 +- src/Stemmer/Dutch.php | 4 ++-- src/Stemmer/Finnish.php | 9 +++++++++ src/Stemmer/Norwegian.php | 2 +- src/Stemmer/Romanian.php | 2 ++ src/Stemmer/Stem.php | 2 +- src/Stemmer/Swedish.php | 2 +- 7 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/Stemmer/Danish.php b/src/Stemmer/Danish.php index 5fc7507..aedd4c2 100644 --- a/src/Stemmer/Danish.php +++ b/src/Stemmer/Danish.php @@ -46,7 +46,7 @@ public function stem($word): string * Define a valid s-ending as one of * a b c d f g h j k l m n o p r t v y z å * - * @param string $ending + * @param string $word * @return boolean */ private function hasValidSEnding($word) diff --git a/src/Stemmer/Dutch.php b/src/Stemmer/Dutch.php index 6a2b563..002c2d7 100644 --- a/src/Stemmer/Dutch.php +++ b/src/Stemmer/Dutch.php @@ -61,7 +61,7 @@ public function stem($word) /** * Define a valid s-ending as a non-vowel other than j. - * @param string $ending + * @param string $word * @return boolean */ private function hasValidSEnding($word) @@ -72,7 +72,7 @@ private function hasValidSEnding($word) /** * Define a valid en-ending as a non-vowel, and not gem. - * @param string $ending + * @param string $word * @return boolean */ private function hasValidEnEnding($word) diff --git a/src/Stemmer/Finnish.php b/src/Stemmer/Finnish.php index c6487b5..d73e4de 100644 --- a/src/Stemmer/Finnish.php +++ b/src/Stemmer/Finnish.php @@ -91,6 +91,8 @@ private function step1() return true; } + + return false; } /** @@ -180,6 +182,8 @@ private function step2() } } } + + return false; } /** @@ -294,6 +298,8 @@ private function step3() $this->_removedInStep3 = true; return true; } + + return false; } /** @@ -326,6 +332,8 @@ private function step4() $this->r2(); return true; } + + return false; } /** @@ -372,6 +380,7 @@ private function step5() } } + return false; } /** diff --git a/src/Stemmer/Norwegian.php b/src/Stemmer/Norwegian.php index 627a578..83969f0 100644 --- a/src/Stemmer/Norwegian.php +++ b/src/Stemmer/Norwegian.php @@ -46,7 +46,7 @@ public function stem($word) * b c d f g h j l m n o p r t v y z, * or k not preceded by a vowel * - * @param string $ending + * @param string $word * @return boolean */ private function hasValidSEnding($word) diff --git a/src/Stemmer/Romanian.php b/src/Stemmer/Romanian.php index 87047dc..fbdd168 100644 --- a/src/Stemmer/Romanian.php +++ b/src/Stemmer/Romanian.php @@ -300,6 +300,8 @@ private function step3() } return true; } + + return false; } /** diff --git a/src/Stemmer/Stem.php b/src/Stemmer/Stem.php index 1ce7274..28d22f2 100644 --- a/src/Stemmer/Stem.php +++ b/src/Stemmer/Stem.php @@ -40,7 +40,7 @@ abstract class Stem implements Stemmer /** * R1 value - * @var integer + * @var string */ protected $r1; diff --git a/src/Stemmer/Swedish.php b/src/Stemmer/Swedish.php index ed8103c..40bb794 100644 --- a/src/Stemmer/Swedish.php +++ b/src/Stemmer/Swedish.php @@ -45,7 +45,7 @@ public function stem($word) * Define a valid s-ending as one of * b c d f g h j k l m n o p r t v y * - * @param string $ending + * @param string $word * @return boolean */ private function hasValidSEnding($word) From 813e65633c7f663db7d1d8a0af50ef149110744d Mon Sep 17 00:00:00 2001 From: Hannes Papenberg Date: Thu, 28 May 2026 11:29:09 +0200 Subject: [PATCH 5/7] Adding phpstan and CI workflow for it --- .github/workflows/tests.yml | 4 ++-- composer.json | 4 +++- phpstan.neon | 9 +++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 phpstan.neon diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2994d98..d453b90 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -35,8 +35,8 @@ jobs: max_attempts: 5 command: composer update --${{ matrix.stability }} --prefer-dist --no-interaction --no-progress - - name: Copy PHP Unit Settings - run: cp phpunit.xml.dist phpunit.xml + - name: Execute phpstan + run: vendor/bin/phpstan - name: Execute tests run: vendor/bin/phpunit --verbose diff --git a/composer.json b/composer.json index d58a45c..fb0a0f9 100644 --- a/composer.json +++ b/composer.json @@ -14,7 +14,9 @@ "joomla/string": ">=2.0.1" }, "require-dev":{ - "phpunit/phpunit": "^9.0" + "phpunit/phpunit": "^9.0", + "phpstan/phpstan": "^2", + "phpstan/phpstan-deprecation-rules": "^2" }, "autoload": { "psr-4": { diff --git a/phpstan.neon b/phpstan.neon new file mode 100644 index 0000000..4351197 --- /dev/null +++ b/phpstan.neon @@ -0,0 +1,9 @@ +includes: + - vendor/phpstan/phpstan-deprecation-rules/rules.neon + +parameters: + level: 5 + phpVersion: 70300 + reportUnmatchedIgnoredErrors: false + paths: + - src From 92a86abf47e3fa0e8a2269a2c9a96097b2fd19e8 Mon Sep 17 00:00:00 2001 From: Hannes Papenberg Date: Thu, 28 May 2026 11:32:17 +0200 Subject: [PATCH 6/7] Simplifying unittests and fixing PHP 8.5 errors --- test/CatalanTest.php | 20 +++----------------- test/CsvFileIterator.php | 5 +++++ test/CsvFileVerboseIterator.php | 6 +++--- test/DanishTest.php | 20 +++----------------- test/DutchTest.php | 20 +++----------------- test/EnglishTest.php | 20 +++----------------- test/FinnishTest.php | 20 +++----------------- test/FrenchTest.php | 20 +++----------------- test/GermanTest.php | 20 +++----------------- test/ItalianTest.php | 20 +++----------------- test/NorwegianTest.php | 20 +++----------------- test/PortugueseTest.php | 20 +++----------------- test/RomanianTest.php | 20 +++----------------- test/RussianTest.php | 20 +++----------------- test/SpanishTest.php | 20 +++----------------- test/StemmingTest.php | 24 ++++++++++++++++++++++++ test/SwedishTest.php | 20 +++----------------- 17 files changed, 74 insertions(+), 241 deletions(-) create mode 100644 test/StemmingTest.php diff --git a/test/CatalanTest.php b/test/CatalanTest.php index 2512c48..2388fdf 100644 --- a/test/CatalanTest.php +++ b/test/CatalanTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Catalan; -class CatalanTest extends TestCase +class CatalanTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Catalan(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileVerboseIterator('test/files/ca.txt'); - } + protected $class = Catalan::class; + protected $file = 'ca'; } diff --git a/test/CsvFileIterator.php b/test/CsvFileIterator.php index ddc0b23..bf1ed17 100644 --- a/test/CsvFileIterator.php +++ b/test/CsvFileIterator.php @@ -19,6 +19,7 @@ public function __destruct() fclose($this->file); } + #[\ReturnTypeWillChange] public function rewind() { rewind($this->file); @@ -32,21 +33,25 @@ public function rewind() $this->key = 0; } + #[\ReturnTypeWillChange] public function valid() { return !feof($this->file); } + #[\ReturnTypeWillChange] public function key() { return $this->key; } + #[\ReturnTypeWillChange] public function current() { return $this->current; } + #[\ReturnTypeWillChange] public function next() { $line = fgets($this->file); diff --git a/test/CsvFileVerboseIterator.php b/test/CsvFileVerboseIterator.php index 25314b6..f745ab2 100644 --- a/test/CsvFileVerboseIterator.php +++ b/test/CsvFileVerboseIterator.php @@ -3,7 +3,7 @@ class CsvFileVerboseIterator extends CsvFileIterator { - public function rewind() + public function rewind(): void { parent::rewind(); $this->_updateKey($this->current()); @@ -20,9 +20,9 @@ public function next() protected function _updateKey($value) { if ($value && sizeof($value)) { - $this->key = $value[0]; + $this->key = (int) $value[0]; } elseif (sizeof($this->current)) { - $this->key = $this->current[0]; + $this->key = (int) $this->current[0]; } } } diff --git a/test/DanishTest.php b/test/DanishTest.php index b846d72..c01c5fd 100644 --- a/test/DanishTest.php +++ b/test/DanishTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Danish; -class DanishTest extends TestCase +class DanishTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Danish(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/dk.txt'); - } + protected $class = Danish::class; + protected $file = 'dk'; } diff --git a/test/DutchTest.php b/test/DutchTest.php index 6e21f8c..8083bf4 100644 --- a/test/DutchTest.php +++ b/test/DutchTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Dutch; -class DutchTest extends TestCase +class DutchTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Dutch(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/nl.txt'); - } + protected $class = Dutch::class; + protected $file = 'nl'; } diff --git a/test/EnglishTest.php b/test/EnglishTest.php index a38fb0f..cdb05f9 100644 --- a/test/EnglishTest.php +++ b/test/EnglishTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\English; -class EnglishTest extends TestCase +class EnglishTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new English(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/en.txt'); - } + protected $class = English::class; + protected $file = 'en'; } diff --git a/test/FinnishTest.php b/test/FinnishTest.php index 17a6c33..e2ad387 100644 --- a/test/FinnishTest.php +++ b/test/FinnishTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Finnish; -class FinnishTest extends TestCase +class FinnishTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Finnish(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/fi.txt'); - } + protected $class = Finnish::class; + protected $file = 'fi'; } diff --git a/test/FrenchTest.php b/test/FrenchTest.php index a985d2a..159fbfa 100644 --- a/test/FrenchTest.php +++ b/test/FrenchTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\French; -class FrenchTest extends TestCase +class FrenchTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new French(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/fr.txt'); - } + protected $class = French::class; + protected $file = 'fr'; } diff --git a/test/GermanTest.php b/test/GermanTest.php index 3bec53d..f852ff1 100644 --- a/test/GermanTest.php +++ b/test/GermanTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\German; -class GermanTest extends TestCase +class GermanTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new German(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/de.txt'); - } + protected $class = German::class; + protected $file = 'de'; } diff --git a/test/ItalianTest.php b/test/ItalianTest.php index c77d8ac..e4e3f89 100644 --- a/test/ItalianTest.php +++ b/test/ItalianTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Italian; -class ItalianTest extends TestCase +class ItalianTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Italian(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/it.txt'); - } + protected $class = Italian::class; + protected $file = 'it'; } diff --git a/test/NorwegianTest.php b/test/NorwegianTest.php index 6265d58..d30f19c 100644 --- a/test/NorwegianTest.php +++ b/test/NorwegianTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Norwegian; -class NorwegianTest extends TestCase +class NorwegianTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Norwegian(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/no.txt'); - } + protected $class = Norwegian::class; + protected $file = 'no'; } diff --git a/test/PortugueseTest.php b/test/PortugueseTest.php index 6cf8851..267f401 100644 --- a/test/PortugueseTest.php +++ b/test/PortugueseTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Portuguese; -class PortugueseTest extends TestCase +class PortugueseTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Portuguese(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/pt.txt'); - } + protected $class = Portuguese::class; + protected $file = 'pt'; } diff --git a/test/RomanianTest.php b/test/RomanianTest.php index 150510b..ed83a55 100644 --- a/test/RomanianTest.php +++ b/test/RomanianTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Romanian; -class RomanianTest extends TestCase +class RomanianTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Romanian(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/ro.txt'); - } + protected $class = Romanian::class; + protected $file = 'ro'; } diff --git a/test/RussianTest.php b/test/RussianTest.php index e95a6c9..5583f3e 100644 --- a/test/RussianTest.php +++ b/test/RussianTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Russian; -class RussianTest extends TestCase +class RussianTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Russian(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/ru.txt'); - } + protected $class = Russian::class; + protected $file = 'ru'; } diff --git a/test/SpanishTest.php b/test/SpanishTest.php index 7b3cf40..d6f66dd 100644 --- a/test/SpanishTest.php +++ b/test/SpanishTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Spanish; -class SpanishTest extends TestCase +class SpanishTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Spanish(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/es.txt'); - } + protected $class = Spanish::class; + protected $file = 'es'; } diff --git a/test/StemmingTest.php b/test/StemmingTest.php new file mode 100644 index 0000000..8e97085 --- /dev/null +++ b/test/StemmingTest.php @@ -0,0 +1,24 @@ +class; + + $snowballStem = $o->stem($word); + + $this->assertEquals($stem, $snowballStem); + } + + public function load() + { + return new CsvFileIterator('test/files/' . $this->file . '.txt'); + } +} diff --git a/test/SwedishTest.php b/test/SwedishTest.php index f3d1f71..8a9dc3c 100644 --- a/test/SwedishTest.php +++ b/test/SwedishTest.php @@ -4,22 +4,8 @@ use PHPUnit\Framework\TestCase; use Wamania\Snowball\Stemmer\Swedish; -class SwedishTest extends TestCase +class SwedishTest extends StemmingTest { - /** - * @dataProvider load - */ - public function testStem($word, $stem) - { - $o = new Swedish(); - - $snowballStem = $o->stem($word); - - $this->assertEquals($stem, $snowballStem); - } - - public function load() - { - return new CsvFileIterator('test/files/sw.txt'); - } + protected $class = Swedish::class; + protected $file = 'sw'; } From 6a95ed44175852bde0149c5c918780e041015be5 Mon Sep 17 00:00:00 2001 From: Hannes Papenberg Date: Fri, 29 May 2026 09:28:27 +0200 Subject: [PATCH 7/7] Fixing phpstan errors in spanish --- src/Stemmer/Spanish.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Stemmer/Spanish.php b/src/Stemmer/Spanish.php index b83c040..dea6742 100644 --- a/src/Stemmer/Spanish.php +++ b/src/Stemmer/Spanish.php @@ -98,7 +98,7 @@ private function step0() // c if ( ($position2 = $this->searchIfInRv(array('yendo' . $suffixe))) != false) { $before = StringHelper::substr($this->word, ($position2-1), 1); - if ( (isset($before)) && ($before == 'u') ) { + if ($before == 'u') { $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -259,7 +259,7 @@ private function step2a() 'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'))) != false) { $before = StringHelper::substr($this->word, ($position-1), 1); - if ( (isset($before)) && ($before == 'u') ) { + if ($before == 'u') { $this->word = StringHelper::substr($this->word, 0, $position); return true; } @@ -301,6 +301,8 @@ private function step2b() return true; } + + return false; } /** @@ -323,7 +325,7 @@ private function step3() if ( ($position2 = $this->searchIfInRv(array('u'))) != false) { $before = StringHelper::substr($this->word, ($position2-1), 1); - if ( (isset($before)) && ($before == 'g') ) { + if ($before == 'g') { $this->word = StringHelper::substr($this->word, 0, $position2); return true; }