Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changes/next_release.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

- ff-680 (gh-507) — Allowed unicode characters in the native feed tags to support non-English tags that are not owned by Feeds Fun.
2 changes: 0 additions & 2 deletions changes/unreleased.md

This file was deleted.

40 changes: 20 additions & 20 deletions ffun/ffun/tags/converters.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
import re

from slugify import slugify

from ffun.domain.entities import TagUid

DISALLOWED_CHARS_PATTERN = re.compile(r"[^-a-zA-Z0-9]+")


_encode_replacements = {
"#": "-sharp-", # c# -> c-sharp
"+": "-plus-", # c++ -> c-plus-plus
Expand Down Expand Up @@ -48,25 +43,30 @@ def _decode_special_characters(tag: str) -> str:
return "".join(result)


def normalize(tag: str, allow_unicode: bool) -> TagUid:
    """Normalize a raw tag string into a canonical TagUid slug.

    Lowercases the tag, encodes special characters (e.g. "#" -> "-sharp-",
    "+" -> "-plus-") via _encode_special_characters, and slugifies the result.

    :param tag: the raw tag text to normalize.
    :param allow_unicode: when True, unicode characters are kept in the slug
        (used for tags we do not control, e.g. native feed tags); when False,
        the result is pure ASCII.
    """
    # NOTE: the diff-page source interleaved the pre- and post-change bodies here;
    # this is the merged (post-change) version reconstructed as valid Python.
    tag = tag.lower()

    tag = _encode_special_characters(tag)

    # Note: with allow_unicode=True slugify normalizes unicode to NFKC.
    # If in the future we decide to change the slugification library,
    # we should either ensure that behavior or renormalize tags in the database.
    return TagUid(
        slugify(
            tag,
            entities=True,
            decimal=True,
            hexadecimal=True,
            max_length=0,
            word_boundary=False,
            save_order=True,
            separator="-",
            stopwords=(),
            regex_pattern=None,
            lowercase=True,
            replacements=(),
            allow_unicode=allow_unicode,
        )
    )


Expand Down
70 changes: 65 additions & 5 deletions ffun/ffun/tags/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,81 @@

from ffun.ontology.entities import NormalizedTag, RawTag
from ffun.tags import converters, utils
from ffun.tags.entities import NormalizationMode, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers import NormalizerInfo, normalizers


# Some context on this admittedly convoluted logic:
# - Normalizers operate on TagInNormalization.
# - TagInNormalization must define how it should be processed by normalizers.
# - There are two options: define the mode explicitly or derive it from categories.
# - Explicit definition looks attractive: a normalizer could say
#   "I produce this new tag, which should be processed as raw/preserve/final".
# - But that approach creates uncertainty when we re-normalize tags in the database,
#   because we do not store the final normalization mode in the database
#   (and it may be wrong in case of re-normalization).
#   So, on re-normalization, we derive the mode from tag categories (again).
#   We also use RawTag, not TagInNormalization, as the result of running a normalizer.
# - That's why it seems more consistent to build the logic of normalizers around categories only,
#   to keep the behavior consistent across the whole system.
# => We expect that a normalizer, if needed, will be able to set new categories for the tags it produces.
#    For example, there may be a normalizer that detects network domains in free-form tags.
def mode_from_categories(categories: set[TagCategory]) -> NormalizationMode:
    """Derive the normalization mode for a tag from its categories.

    The first matching entry of the priority table wins, so the order of
    entries below is significant.
    """
    # We do not normalize native feed tags (feed_tag -> final), because:
    # - We have no control over the logic that assigns them
    # - Sometimes they are (semi-)technical (special terms, domain names, codes)
    # - Sometimes they are very specific, like r-sideproject (for subreddits),
    #   and we don't want to create a duplicated tag like r-sideprojects that actually has no meaning
    priority_table = (
        (TagCategory.network_domain, NormalizationMode.final),
        (TagCategory.special, NormalizationMode.final),
        (TagCategory.feed_tag, NormalizationMode.final),
        (TagCategory.free_form, NormalizationMode.raw),
        (TagCategory.test_final, NormalizationMode.final),
        (TagCategory.test_preserve, NormalizationMode.preserve),
        (TagCategory.test_raw, NormalizationMode.raw),
    )

    for category, mode in priority_table:
        if category in categories:
            return mode

    raise NotImplementedError(f"Tag with unknown categories: {categories}")


def prepare_for_normalization(tag: RawTag) -> TagInNormalization:
    """Convert a RawTag into a TagInNormalization ready for the normalizers pipeline.

    1. We normalize uids even for final tags:
       - if everything works well, they remain unchanged;
       - if something is off, we stop error propagation right here.
    2. Text normalization is kept outside of the normalizers list, since:
       - it is a common step for all tags, and we don't want to repeat it in each normalizer;
       - it is a preparation step for normalizers rather than a normalizer itself —
         for example, the .parts field is filled on the basis of the normalized uid.
    """
    mode = mode_from_categories(tag.categories)

    # Raw tags must stay pure ASCII — no unicode characters allowed there, at least for now.
    # This may change in https://github.com/Tiendil/feeds.fun/issues/348
    unicode_is_allowed = mode != NormalizationMode.raw

    normalized_uid = converters.normalize(tag.raw_uid, allow_unicode=unicode_is_allowed)

    return TagInNormalization(
        uid=normalized_uid,
        parts=utils.uid_to_parts(normalized_uid),
        link=tag.link,
        categories=set(tag.categories),
        mode=mode,
    )


Expand Down
48 changes: 1 addition & 47 deletions ffun/ffun/tags/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,53 +52,7 @@ class TagInNormalization(BaseEntity):

link: str | None
categories: TagCategories

# Here is going some complicated unclear logic:
# - normalizers work with TagInNormalization
# - TagInNormalization should define how it should be processed by normalizers
# - There are two options: define explicitly or implicitly (derive from categories)
# - It may look like a good idea to define it explicitly, so we could have a normalizer
# that could say "I produce this new tag which should be processed as raw/preserve/final"
# - But this approach leads to uncertainty when we do re-normalization of tags in the database
# because we don't store the final normalization mode in the database
# (and it may be wrong in case of re-normalization)
# So, on re-normalization we use tag categories to derive the mode (again)
# We also use RawTag, not TagInNormalization as a result of running a normalizer.
# - That's why it seems more consistent to build the logic of normalizers around categories only,
# to keep the behavior consistent across the whole system
# => We expect that a normalizer, if needed, will be able to set new categories for the tags it produces
# For example, there may be a normalizer that detects network domains in free-form tags
@property
def mode(self) -> NormalizationMode: # noqa: CCR001
# The order of checks is important here

if TagCategory.network_domain in self.categories:
return NormalizationMode.final

if TagCategory.special in self.categories:
return NormalizationMode.final

# We do not normalize native feed tags, because:
# - We have no control over the logic that assigns them
# - Sometimes they are (semi-)technical (special terms, domain names, codes)
# - Sometimes they are very specific, like r-sideproject (for subreddits)
# and we don't want to create a duplicated tag like r-sideprojects that actually has no meaning
if TagCategory.feed_tag in self.categories:
return NormalizationMode.final

if TagCategory.free_form in self.categories:
return NormalizationMode.raw

if TagCategory.test_final in self.categories:
return NormalizationMode.final

if TagCategory.test_preserve in self.categories:
return NormalizationMode.preserve

if TagCategory.test_raw in self.categories:
return NormalizationMode.raw

raise NotImplementedError(f"Tag with unknown categories: {self.categories}")
mode: NormalizationMode


class NormalizerType(enum.StrEnum):
Expand Down
3 changes: 2 additions & 1 deletion ffun/ffun/tags/normalizers/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ffun.domain.entities import TagUid
from ffun.ontology.entities import RawTag
from ffun.tags import utils
from ffun.tags.entities import TagCategory, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers.base import FakeNormalizer, NormalizerAlwaysError, NormalizerInfo


Expand Down Expand Up @@ -62,6 +62,7 @@ def tag(self) -> TagInNormalization:
parts=utils.uid_to_parts(uid),
link=None,
categories={TagCategory.test_preserve},
mode=NormalizationMode.preserve,
)

@pytest.fixture # type: ignore
Expand Down
47 changes: 44 additions & 3 deletions ffun/ffun/tags/normalizers/tests/test_form_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ffun.domain.entities import TagUid
from ffun.ontology.entities import RawTag
from ffun.tags import converters, utils
from ffun.tags.entities import TagCategory, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers import form_normalizer

normalizer = form_normalizer.Normalizer()
Expand Down Expand Up @@ -176,14 +176,15 @@ class TestNormalizer:
)
@pytest.mark.asyncio
async def test(self, input_uid: TagUid, expected_tag_valid: bool, expected_new_uids: list[str]) -> None:
assert converters.normalize(input_uid) == input_uid
assert all(converters.normalize(new_uid) == new_uid for new_uid in expected_new_uids)
assert converters.normalize(input_uid, allow_unicode=False) == input_uid
assert all(converters.normalize(new_uid, allow_unicode=False) == new_uid for new_uid in expected_new_uids)

input_tag = TagInNormalization(
uid=input_uid,
parts=utils.uid_to_parts(input_uid),
link="http://example.com/tag",
categories={TagCategory.test_raw},
mode=NormalizationMode.raw,
)

expected_new_tags = [
Expand All @@ -203,6 +204,45 @@ async def test(self, input_uid: TagUid, expected_tag_valid: bool, expected_new_u
assert tag_valid == expected_tag_valid
assert new_tags == expected_new_tags

@pytest.mark.parametrize(
    "input_uid, expected_tag_valid, expected_new_uids",
    [
        ("café-reviews", True, []),
        ("résumé-services", True, []),
        ("привет-миры", True, []),
        ("данные-аналитики", True, []),
        ("cafés-review", False, ["cafés-reviews"]),
    ],
)
@pytest.mark.asyncio
async def test_unicode_input_is_safe(
    self, input_uid: TagUid, expected_tag_valid: bool, expected_new_uids: list[str]
) -> None:
    # Sanity check on the test data: every uid must already be in normalized (unicode) form.
    assert converters.normalize(input_uid, allow_unicode=True) == input_uid
    for new_uid in expected_new_uids:
        assert converters.normalize(new_uid, allow_unicode=True) == new_uid

    input_tag = TagInNormalization(
        uid=input_uid,
        parts=utils.uid_to_parts(input_uid),
        link="http://example.com/tag",
        categories={TagCategory.test_preserve},
        mode=NormalizationMode.preserve,
    )

    # New tags produced by the normalizer inherit the link and categories of the input tag.
    expected_new_tags = [
        RawTag(raw_uid=new_uid, link=input_tag.link, categories=input_tag.categories)
        for new_uid in expected_new_uids
    ]

    tag_valid, new_tags = await normalizer.normalize(input_tag)

    assert tag_valid == expected_tag_valid
    assert new_tags == expected_new_tags

@pytest.mark.skipif(reason="Performance test disabled by default.")
@pytest.mark.asyncio
async def test_performance(self) -> None:
Expand All @@ -214,6 +254,7 @@ async def test_performance(self) -> None:
parts=utils.uid_to_parts(TagUid(input_uid)),
link="http://example.com/tag",
categories={TagCategory.test_raw},
mode=NormalizationMode.raw,
)
for input_uid in [
"book-cover-review",
Expand Down
68 changes: 39 additions & 29 deletions ffun/ffun/tags/normalizers/tests/test_part_blacklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,60 @@
from ffun.domain.entities import TagUid
from ffun.ontology.entities import RawTag
from ffun.tags import converters, utils
from ffun.tags.entities import TagCategory, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers import part_blacklist

# Module-level normalizer under test. The blacklist includes both English and
# Russian stop-parts so the unicode test cases below are exercised too.
# NOTE: the diff-page source interleaved the pre- and post-change lines here;
# this is the merged (post-change) version.
normalizer = part_blacklist.Normalizer(blacklist={"a", "the", "и", "очень"})


class TestNormalizer:
@pytest.mark.parametrize(
"input_uid, expected_continue, expected_new_uids",
"unicode, input_uid, expected_continue, expected_new_uids",
[
("", False, []),
("a-the", False, []),
("no-removal", True, []),
("noremoval-at-all", True, []),
("the-best-startup", False, ["best-startup"]),
("about-the-best", False, ["about-best"]),
("about-best-the", False, ["about-best"]),
("a-or-the", False, ["or"]),
("a-the-best-of-the-best", False, ["best-of-best"]),
("athe-best", True, []),
("thea-best", True, []),
("best-thea", True, []),
("best-athe", True, []),
("know-thea-best", True, []),
("know-athe-best", True, []),
("the-the-the", False, []),
("a-a-a", False, []),
("the-a-the-a", False, []),
("a-the-a-the", False, []),
("the-a-the-a-the", False, []),
("best-the-a-the-a-the", False, ["best"]),
("math-the-a-the-a-physics", False, ["math-physics"]),
(False, "", False, []),
(False, "a-the", False, []),
(False, "no-removal", True, []),
(False, "noremoval-at-all", True, []),
(False, "the-best-startup", False, ["best-startup"]),
(False, "about-the-best", False, ["about-best"]),
(False, "about-best-the", False, ["about-best"]),
(False, "a-or-the", False, ["or"]),
(False, "a-the-best-of-the-best", False, ["best-of-best"]),
(False, "athe-best", True, []),
(False, "thea-best", True, []),
(False, "best-thea", True, []),
(False, "best-athe", True, []),
(False, "know-thea-best", True, []),
(False, "know-athe-best", True, []),
(False, "the-the-the", False, []),
(False, "a-a-a", False, []),
(False, "the-a-the-a", False, []),
(False, "a-the-a-the", False, []),
(False, "the-a-the-a-the", False, []),
(False, "best-the-a-the-a-the", False, ["best"]),
(False, "math-the-a-the-a-physics", False, ["math-physics"]),
(True, "данные-и-аналитика", False, ["данные-аналитика"]),
(True, "résumé-и-портфолио", False, ["résumé-портфолио"]),
(True, "очень-café-уютно", False, ["café-уютно"]),
(True, "café-и-bistro", False, ["café-bistro"]),
],
)
@pytest.mark.asyncio
async def test(self, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str]) -> None:
assert converters.normalize(input_uid) == input_uid
assert all(converters.normalize(new_uid) == new_uid for new_uid in expected_new_uids)
async def test(
self, unicode: bool, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str]
) -> None:
assert converters.normalize(input_uid, allow_unicode=unicode) == input_uid
assert all(converters.normalize(new_uid, allow_unicode=unicode) == new_uid for new_uid in expected_new_uids)

categories = {TagCategory.test_preserve} if unicode else {TagCategory.test_raw}
mode = NormalizationMode.preserve if unicode else NormalizationMode.raw

input_tag = TagInNormalization(
uid=input_uid,
parts=utils.uid_to_parts(input_uid),
link="http://example.com/tag",
categories={TagCategory.test_raw},
categories=categories,
mode=mode,
)

expected_new_tags = [
Expand Down
Loading
Loading