Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# PHPUnit
/app/phpunit.xml
/phpunit.xml
.phpunit.result.cache

# Build data
/build/
Expand Down
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
],
"require": {
"php": ">=7.3",
"joomla/string": ">=2.0.1"
"voku/portable-utf8": "^5.4|^6.0"
},
"require-dev":{
"phpunit/phpunit": "^9.0"
Expand Down
21 changes: 13 additions & 8 deletions src/Stemmer/Catalan.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

namespace Wamania\Snowball\Stemmer;

use Joomla\String\StringHelper;
use voku\helper\UTF8;

/**
*
Expand Down Expand Up @@ -86,7 +86,12 @@ class Catalan extends Stem
*/
public function stem($word)
{
$this->word = StringHelper::strtolower($word);
// we do ALL in UTF-8
if (!UTF8::is_utf8($word)) {
throw new \Exception('Word must be in UTF-8');
}

$this->word = UTF8::strtolower($word);

// Catalan stemmer does not use Rv
$this->r1();
Expand Down Expand Up @@ -122,7 +127,7 @@ private function step0()
{
if (($position = $this->search(static::$attached_pronoun)) !== false) {
if ($this->inR1($position)) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
return true;
}
}
Expand All @@ -141,7 +146,7 @@ private function step1a()
// delete if in R2
if (($position = $this->search(['acions', 'ada', 'ades'])) !== false) {
if ($this->inR2($position)) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
Expand All @@ -161,7 +166,7 @@ private function step1a()
// delete if in R1
if (($position = $this->search(self::$standard_suffix_1a)) !== false) {
if ($this->inR1($position)) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
Expand Down Expand Up @@ -236,7 +241,7 @@ private function step1b()
// delete if in R1
if (($position = $this->search(static::$verb_suffixes)) !== false) {
if ($this->inR1($position)) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
Expand All @@ -246,7 +251,7 @@ private function step1b()
// delete if in R2
if (($position = $this->search(['ando'])) !== false) {
if ($this->inR2($position)) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
Expand All @@ -265,7 +270,7 @@ private function step2()
// delete if in R1
if (($position = $this->search(static::$residual_suffixes)) !== false) {
if ($this->inR1($position)) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
Expand Down
33 changes: 19 additions & 14 deletions src/Stemmer/Danish.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

namespace Wamania\Snowball\Stemmer;

use Joomla\String\StringHelper;
use voku\helper\UTF8;

/**
*
Expand All @@ -22,15 +22,20 @@ class Danish extends Stem
*/
public function stem($word): string
{
$this->word = StringHelper::strtolower($word);
// we do ALL in UTF-8
if (!UTF8::is_utf8($word)) {
throw new \Exception('Word must be in UTF-8');
}

$this->word = UTF8::strtolower($word);

// R2 is not used: R1 is defined in the same way as in the German stemmer
$this->r1();

// then R1 is adjusted so that the region before it contains at least 3 letters.
if ($this->r1Index < 3) {
$this->r1Index = 3;
$this->r1 = StringHelper::substr($this->word, 3);
$this->r1 = UTF8::substr($this->word, 3);
}

// Do each of steps 1, 2, 3 and 4.
Expand All @@ -51,7 +56,7 @@ public function stem($word): string
*/
private function hasValidSEnding($word): bool
{
    // Tells whether $word ends with one of the letters listed below.
    // step1() calls this with the word whose trailing "s" has already been cut off,
    // and only commits the removal of that "s" when this returns true.
    //
    // Only the UTF8::substr() lookup is kept: the earlier StringHelper::substr()
    // assignment was overwritten immediately and had no effect on the result.
    $lastLetter = UTF8::substr($word, -1, 1);

    // Strict comparison so that only an exact string match is accepted.
    return in_array(
        $lastLetter,
        ['a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'],
        true
    );
}

Expand All @@ -69,14 +74,14 @@ private function step1()
'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds',
'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e'
))) !== false) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
return true;
}

// s
// delete if preceded by a valid s-ending
if ( ($position = $this->searchIfInR1(array('s'))) !== false) {
$word = StringHelper::substr($this->word, 0, $position);
$word = UTF8::substr($this->word, 0, $position);
if ($this->hasValidSEnding($word)) {
$this->word = $word;
}
Expand All @@ -92,7 +97,7 @@ private function step1()
private function step2()
{
    // When searchIfInR1() reports one of the endings gd, dt, gt or kt,
    // remove the last letter of the word.
    //
    // The truncation must happen exactly once. Running both the old
    // StringHelper::substr() statement and the UTF8::substr() statement would
    // strip two letters and delete the whole matched ending.
    if ($this->searchIfInR1(['gd', 'dt', 'gt', 'kt']) !== false) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }
}

Expand All @@ -103,22 +108,22 @@ private function step3()
{
// If the word ends igst, remove the final st.
if ($this->search(array('igst')) !== false) {
$this->word = StringHelper::substr($this->word, 0, -2);
$this->word = UTF8::substr($this->word, 0, -2);
}

// Search for the longest among the following suffixes in R1, and perform the action indicated.
// ig lig elig els
// delete, and then repeat step 2
if ( ($position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'))) !== false) {
$this->word = StringHelper::substr($this->word, 0, $position);
$this->word = UTF8::substr($this->word, 0, $position);
$this->step2();
return true;
}

// løst
// replace with løs
if ($this->searchIfInR1(array('løst')) !== false) {
$this->word = StringHelper::substr($this->word, 0, -1);
$this->word = UTF8::substr($this->word, 0, -1);
}
}

Expand All @@ -128,19 +133,19 @@ private function step3()
*/
private function step4(): bool
{
    // Undouble step: if the word's last letter is inside R1, is not a vowel and
    // equals the letter before it, drop one of the two.
    // Returns false when the last letter is outside R1 or is a vowel, true otherwise
    // (including when nothing was removed).
    //
    // Each value is computed once through UTF8. The duplicated StringHelper
    // statements were redundant, and the duplicated final truncation removed two
    // letters instead of one.
    $length = UTF8::strlen($this->word);
    if (!$this->inR1($length - 1)) {
        return false;
    }

    $lastLetter = UTF8::substr($this->word, -1, 1);
    if (in_array($lastLetter, self::$vowels)) {
        return false;
    }

    $beforeLastLetter = UTF8::substr($this->word, -2, 1);

    // Both operands come from the same substr() call on the same word,
    // so a strict comparison is safe here.
    if ($lastLetter === $beforeLastLetter) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
Expand Down
Loading
Loading