diff --git a/changes/next_release.md b/changes/next_release.md new file mode 100644 index 00000000..a178e264 --- /dev/null +++ b/changes/next_release.md @@ -0,0 +1,2 @@ + +- ff-680 (gh-507) — Allowed unicode characters in the native feed tags to support non-English tags that are not owned by Feeds Fun. diff --git a/changes/unreleased.md b/changes/unreleased.md deleted file mode 100644 index 373eb997..00000000 --- a/changes/unreleased.md +++ /dev/null @@ -1,2 +0,0 @@ - -No changes. diff --git a/ffun/ffun/tags/converters.py b/ffun/ffun/tags/converters.py index 19b786cb..6d83adfc 100644 --- a/ffun/ffun/tags/converters.py +++ b/ffun/ffun/tags/converters.py @@ -1,12 +1,7 @@ -import re - from slugify import slugify from ffun.domain.entities import TagUid -DISALLOWED_CHARS_PATTERN = re.compile(r"[^-a-zA-Z0-9]+") - - _encode_replacements = { "#": "-sharp-", # c# -> c-sharp "+": "-plus-", # c++ -> c-plus-plus @@ -48,25 +43,30 @@ def _decode_special_characters(tag: str) -> str: return "".join(result) -def normalize(tag: str) -> TagUid: +def normalize(tag: str, allow_unicode: bool) -> TagUid: tag = tag.lower() tag = _encode_special_characters(tag) - return slugify( - tag, - entities=True, - decimal=True, - hexadecimal=True, - max_length=0, - word_boundary=False, - save_order=True, - separator="-", - stopwords=(), - regex_pattern=DISALLOWED_CHARS_PATTERN, # type: ignore - lowercase=True, - replacements=(), - allow_unicode=False, + # Note: with allow_unicode=True, slugify normalizes unicode to NFKC. + # If we ever decide to switch to another slugification library, + # we should either preserve that behavior or renormalize the tags in the database. 
+ return TagUid( + slugify( + tag, + entities=True, + decimal=True, + hexadecimal=True, + max_length=0, + word_boundary=False, + save_order=True, + separator="-", + stopwords=(), + regex_pattern=None, + lowercase=True, + replacements=(), + allow_unicode=allow_unicode, + ) ) diff --git a/ffun/ffun/tags/domain.py b/ffun/ffun/tags/domain.py index 6a46daf1..a3b7c4b3 100644 --- a/ffun/ffun/tags/domain.py +++ b/ffun/ffun/tags/domain.py @@ -2,21 +2,81 @@ from ffun.ontology.entities import NormalizedTag, RawTag from ffun.tags import converters, utils -from ffun.tags.entities import NormalizationMode, TagInNormalization +from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization from ffun.tags.normalizers import NormalizerInfo, normalizers +# What follows is some complicated, non-obvious logic: +# - normalizers work with TagInNormalization +# - TagInNormalization should define how it should be processed by normalizers +# - There are two options: define the mode explicitly or implicitly (derive it from categories) +# - It may look like a good idea to define it explicitly, so we could have a normalizer +# that could say "I produce this new tag which should be processed as raw/preserve/final" +# - But this approach leads to uncertainty when we do re-normalization of tags in the database +# because we don't store the final normalization mode in the database +# (and it may be wrong in case of re-normalization) +# So, on re-normalization we use tag categories to derive the mode (again) +# We also use RawTag, not TagInNormalization, as the result of running a normalizer. +# - That's why it seems more consistent to build the logic of normalizers around categories only, +# to keep the behavior consistent across the whole system. +# => We expect that a normalizer, if needed, will be able to set new categories for the tags it produces. +# For example, there may be a normalizer that detects network domains in free-form tags. 
+def mode_from_categories(categories: set[TagCategory]) -> NormalizationMode: # noqa: CCR001 + # The order of checks is important here + + if TagCategory.network_domain in categories: + return NormalizationMode.final + + if TagCategory.special in categories: + return NormalizationMode.final + + # We do not normalize native feed tags, because: + # - We have no control over the logic that assigns them + # - Sometimes they are (semi-)technical (special terms, domain names, codes) + # - Sometimes they are very specific, like r-sideproject (for subreddits) + # and we don't want to create a duplicated tag like r-sideprojects that actually has no meaning + if TagCategory.feed_tag in categories: + return NormalizationMode.final + + if TagCategory.free_form in categories: + return NormalizationMode.raw + + if TagCategory.test_final in categories: + return NormalizationMode.final + + if TagCategory.test_preserve in categories: + return NormalizationMode.preserve + + if TagCategory.test_raw in categories: + return NormalizationMode.raw + + raise NotImplementedError(f"Tag with unknown categories: {categories}") + + def prepare_for_normalization(tag: RawTag) -> TagInNormalization: - # we better normalize uids even for final tags: - # - In case all works well, they will remain unchanged - # - In case of some issues, we'll stop an error propagation here - uid = converters.normalize(tag.raw_uid) + # 1. We'd better normalize uids even for final tags: + # - If all works well, they will remain unchanged + # - In case of some issues, we'll stop the error from propagating here + # 2. We keep text normalization outside of the normalizers list, since: + # - it is a common step for all tags, and we don't want to repeat it in each normalizer + # - it is not a normalizer itself, but rather a preparation step for normalizers, + # so it is better to keep it outside of the normalizers list. For example, + # we fill the .parts field based on the normalized uid. 
+ + mode = mode_from_categories(tag.categories) + + # We do not allow unicode characters in raw tags, they must be pure ASCII. At least for now. + # It can be changed in https://github.com/Tiendil/feeds.fun/issues/348 + allow_unicode = mode != NormalizationMode.raw + + uid = converters.normalize(tag.raw_uid, allow_unicode=allow_unicode) return TagInNormalization( uid=uid, parts=utils.uid_to_parts(uid), link=tag.link, categories=set(tag.categories), + mode=mode, ) diff --git a/ffun/ffun/tags/entities.py b/ffun/ffun/tags/entities.py index b800887e..7fb600e4 100644 --- a/ffun/ffun/tags/entities.py +++ b/ffun/ffun/tags/entities.py @@ -52,53 +52,7 @@ class TagInNormalization(BaseEntity): link: str | None categories: TagCategories - - # Here is going some complicated unclear logic: - # - normalizers work with TagInNormalization - # - TagInNormalization should define how it should be processed by normalizers - # - There are two options: define explicitly or implicitly (derive from categories) - # - It may look like a good idea to define it explicitly, so we could have a normalizer - # that could say "I produce this new tag which should be processed as raw/preserve/final" - # - But this approach leads to uncertainty when we doing re-normalization of tags in the database - # because we don't store the final normalization mode in the database - # (and it may be wrong in case of re-normalization) - # So, on re-normalization we use tag categories to derive the mode (again) - # We also use RawTag, not TagInNormalization as a result of running a normalizer. 
- # - That's why it seems more consistent to try building logic of normalizators around categories only - # To be consistent in the whole system - # => We expect, that normalizer, if it requires, will be able to set new categories for the tags it produces - # For example, there may be a normalizer that detects network domains in free-form tags - @property - def mode(self) -> NormalizationMode: # noqa: CCR001 - # The order of checks is important here - - if TagCategory.network_domain in self.categories: - return NormalizationMode.final - - if TagCategory.special in self.categories: - return NormalizationMode.final - - # We do not normalize native feed tags, because: - # - We have no control over the logic that assigns them - # - Sometimes they are (semi-)technical (special terms, domain names, codes) - # - Sometimes they are very specific, like r-sideproject (for subreddits) - # and we don't want to create a duplicated tag like r-sideprojects that actually has no meaning - if TagCategory.feed_tag in self.categories: - return NormalizationMode.final - - if TagCategory.free_form in self.categories: - return NormalizationMode.raw - - if TagCategory.test_final in self.categories: - return NormalizationMode.final - - if TagCategory.test_preserve in self.categories: - return NormalizationMode.preserve - - if TagCategory.test_raw in self.categories: - return NormalizationMode.raw - - raise NotImplementedError(f"Tag with unknown categories: {self.categories}") + mode: NormalizationMode class NormalizerType(enum.StrEnum): diff --git a/ffun/ffun/tags/normalizers/tests/test_base.py b/ffun/ffun/tags/normalizers/tests/test_base.py index 3faf8094..8fa162e4 100644 --- a/ffun/ffun/tags/normalizers/tests/test_base.py +++ b/ffun/ffun/tags/normalizers/tests/test_base.py @@ -8,7 +8,7 @@ from ffun.domain.entities import TagUid from ffun.ontology.entities import RawTag from ffun.tags import utils -from ffun.tags.entities import TagCategory, TagInNormalization +from ffun.tags.entities 
import NormalizationMode, TagCategory, TagInNormalization from ffun.tags.normalizers.base import FakeNormalizer, NormalizerAlwaysError, NormalizerInfo @@ -62,6 +62,7 @@ def tag(self) -> TagInNormalization: parts=utils.uid_to_parts(uid), link=None, categories={TagCategory.test_preserve}, + mode=NormalizationMode.preserve, ) @pytest.fixture # type: ignore diff --git a/ffun/ffun/tags/normalizers/tests/test_form_normalizer.py b/ffun/ffun/tags/normalizers/tests/test_form_normalizer.py index 69c600c9..49d6a087 100644 --- a/ffun/ffun/tags/normalizers/tests/test_form_normalizer.py +++ b/ffun/ffun/tags/normalizers/tests/test_form_normalizer.py @@ -6,7 +6,7 @@ from ffun.domain.entities import TagUid from ffun.ontology.entities import RawTag from ffun.tags import converters, utils -from ffun.tags.entities import TagCategory, TagInNormalization +from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization from ffun.tags.normalizers import form_normalizer normalizer = form_normalizer.Normalizer() @@ -176,14 +176,15 @@ class TestNormalizer: ) @pytest.mark.asyncio async def test(self, input_uid: TagUid, expected_tag_valid: bool, expected_new_uids: list[str]) -> None: - assert converters.normalize(input_uid) == input_uid - assert all(converters.normalize(new_uid) == new_uid for new_uid in expected_new_uids) + assert converters.normalize(input_uid, allow_unicode=False) == input_uid + assert all(converters.normalize(new_uid, allow_unicode=False) == new_uid for new_uid in expected_new_uids) input_tag = TagInNormalization( uid=input_uid, parts=utils.uid_to_parts(input_uid), link="http://example.com/tag", categories={TagCategory.test_raw}, + mode=NormalizationMode.raw, ) expected_new_tags = [ @@ -203,6 +204,45 @@ async def test(self, input_uid: TagUid, expected_tag_valid: bool, expected_new_u assert tag_valid == expected_tag_valid assert new_tags == expected_new_tags + @pytest.mark.parametrize( + "input_uid, expected_tag_valid, expected_new_uids", + [ + 
("café-reviews", True, []), + ("résumé-services", True, []), + ("привет-миры", True, []), + ("данные-аналитики", True, []), + ("cafés-review", False, ["cafés-reviews"]), + ], + ) + @pytest.mark.asyncio + async def test_unicode_input_is_safe( + self, input_uid: TagUid, expected_tag_valid: bool, expected_new_uids: list[str] + ) -> None: + assert converters.normalize(input_uid, allow_unicode=True) == input_uid + assert all(converters.normalize(new_uid, allow_unicode=True) == new_uid for new_uid in expected_new_uids) + + input_tag = TagInNormalization( + uid=input_uid, + parts=utils.uid_to_parts(input_uid), + link="http://example.com/tag", + categories={TagCategory.test_preserve}, + mode=NormalizationMode.preserve, + ) + + expected_new_tags = [ + RawTag( + raw_uid=new_uid, + link=input_tag.link, + categories=input_tag.categories, + ) + for new_uid in expected_new_uids + ] + + tag_valid, new_tags = await normalizer.normalize(input_tag) + + assert tag_valid == expected_tag_valid + assert new_tags == expected_new_tags + @pytest.mark.skipif(reason="Performance test disabled by default.") @pytest.mark.asyncio async def test_performance(self) -> None: @@ -214,6 +254,7 @@ async def test_performance(self) -> None: parts=utils.uid_to_parts(TagUid(input_uid)), link="http://example.com/tag", categories={TagCategory.test_raw}, + mode=NormalizationMode.raw, ) for input_uid in [ "book-cover-review", diff --git a/ffun/ffun/tags/normalizers/tests/test_part_blacklist.py b/ffun/ffun/tags/normalizers/tests/test_part_blacklist.py index 4fb3f943..050136fa 100644 --- a/ffun/ffun/tags/normalizers/tests/test_part_blacklist.py +++ b/ffun/ffun/tags/normalizers/tests/test_part_blacklist.py @@ -3,50 +3,60 @@ from ffun.domain.entities import TagUid from ffun.ontology.entities import RawTag from ffun.tags import converters, utils -from ffun.tags.entities import TagCategory, TagInNormalization +from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization from 
ffun.tags.normalizers import part_blacklist -normalizer = part_blacklist.Normalizer(blacklist={"a", "the"}) +normalizer = part_blacklist.Normalizer(blacklist={"a", "the", "и", "очень"}) class TestNormalizer: @pytest.mark.parametrize( - "input_uid, expected_continue, expected_new_uids", + "unicode, input_uid, expected_continue, expected_new_uids", [ - ("", False, []), - ("a-the", False, []), - ("no-removal", True, []), - ("noremoval-at-all", True, []), - ("the-best-startup", False, ["best-startup"]), - ("about-the-best", False, ["about-best"]), - ("about-best-the", False, ["about-best"]), - ("a-or-the", False, ["or"]), - ("a-the-best-of-the-best", False, ["best-of-best"]), - ("athe-best", True, []), - ("thea-best", True, []), - ("best-thea", True, []), - ("best-athe", True, []), - ("know-thea-best", True, []), - ("know-athe-best", True, []), - ("the-the-the", False, []), - ("a-a-a", False, []), - ("the-a-the-a", False, []), - ("a-the-a-the", False, []), - ("the-a-the-a-the", False, []), - ("best-the-a-the-a-the", False, ["best"]), - ("math-the-a-the-a-physics", False, ["math-physics"]), + (False, "", False, []), + (False, "a-the", False, []), + (False, "no-removal", True, []), + (False, "noremoval-at-all", True, []), + (False, "the-best-startup", False, ["best-startup"]), + (False, "about-the-best", False, ["about-best"]), + (False, "about-best-the", False, ["about-best"]), + (False, "a-or-the", False, ["or"]), + (False, "a-the-best-of-the-best", False, ["best-of-best"]), + (False, "athe-best", True, []), + (False, "thea-best", True, []), + (False, "best-thea", True, []), + (False, "best-athe", True, []), + (False, "know-thea-best", True, []), + (False, "know-athe-best", True, []), + (False, "the-the-the", False, []), + (False, "a-a-a", False, []), + (False, "the-a-the-a", False, []), + (False, "a-the-a-the", False, []), + (False, "the-a-the-a-the", False, []), + (False, "best-the-a-the-a-the", False, ["best"]), + (False, "math-the-a-the-a-physics", False, 
["math-physics"]), + (True, "данные-и-аналитика", False, ["данные-аналитика"]), + (True, "résumé-и-портфолио", False, ["résumé-портфолио"]), + (True, "очень-café-уютно", False, ["café-уютно"]), + (True, "café-и-bistro", False, ["café-bistro"]), ], ) @pytest.mark.asyncio - async def test(self, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str]) -> None: - assert converters.normalize(input_uid) == input_uid - assert all(converters.normalize(new_uid) == new_uid for new_uid in expected_new_uids) + async def test( + self, unicode: bool, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str] + ) -> None: + assert converters.normalize(input_uid, allow_unicode=unicode) == input_uid + assert all(converters.normalize(new_uid, allow_unicode=unicode) == new_uid for new_uid in expected_new_uids) + + categories = {TagCategory.test_preserve} if unicode else {TagCategory.test_raw} + mode = NormalizationMode.preserve if unicode else NormalizationMode.raw input_tag = TagInNormalization( uid=input_uid, parts=utils.uid_to_parts(input_uid), link="http://example.com/tag", - categories={TagCategory.test_raw}, + categories=categories, + mode=mode, ) expected_new_tags = [ diff --git a/ffun/ffun/tags/normalizers/tests/test_part_replacer.py b/ffun/ffun/tags/normalizers/tests/test_part_replacer.py index 2ede28e3..f6c2e8b6 100644 --- a/ffun/ffun/tags/normalizers/tests/test_part_replacer.py +++ b/ffun/ffun/tags/normalizers/tests/test_part_replacer.py @@ -3,38 +3,58 @@ from ffun.domain.entities import TagUid from ffun.ontology.entities import RawTag from ffun.tags import converters, utils -from ffun.tags.entities import TagCategory, TagInNormalization +from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization from ffun.tags.normalizers import part_replacer -normalizer = part_replacer.Normalizer(replacements={"start-up": "startup", "set-up": "setup", "em": "them"}) +normalizer = part_replacer.Normalizer( + replacements={ + 
"start-up": "startup", + "set-up": "setup", + "em": "them", + "старт-ап": "стартап", + "веб-сайт": "вебсайт", + } +) class TestNormalizer: @pytest.mark.parametrize( - "input_uid, expected_continue, expected_new_uids", + "unicode, input_uid, expected_continue, expected_new_uids", [ - ("", False, []), - ("nohtingtodo", True, []), - ("nohting-to-do", True, []), - ("set-up-for-success", False, ["setup-for-success"]), - ("best-start-up-ever", False, ["best-startup-ever"]), - ("how-to-start-up", False, ["how-to-startup"]), - ("let-set-up-for-start-up", False, ["let-setup-for-start-up", "let-set-up-for-startup"]), - ("let-start-up-start-up", False, ["let-startup-startup"]), - ("let-start-up-or-not-start-up", False, ["let-startup-or-not-startup"]), - ("let-em-go", False, ["let-them-go"]), + (False, "", False, []), + (False, "nohtingtodo", True, []), + (False, "nohting-to-do", True, []), + (False, "set-up-for-success", False, ["setup-for-success"]), + (False, "best-start-up-ever", False, ["best-startup-ever"]), + (False, "how-to-start-up", False, ["how-to-startup"]), + (False, "let-set-up-for-start-up", False, ["let-setup-for-start-up", "let-set-up-for-startup"]), + (False, "let-start-up-start-up", False, ["let-startup-startup"]), + (False, "let-start-up-or-not-start-up", False, ["let-startup-or-not-startup"]), + (False, "let-em-go", False, ["let-them-go"]), + (True, "café-start-up-guide", False, ["café-startup-guide"]), + (True, "let-em-идти", False, ["let-them-идти"]), + (True, "данные-set-up-доклад", False, ["данные-setup-доклад"]), + (True, "привет-start-up-мир", False, ["привет-startup-мир"]), + (True, "привет-старт-ап-мир", False, ["привет-стартап-мир"]), + (True, "мой-веб-сайт", False, ["мой-вебсайт"]), ], ) @pytest.mark.asyncio - async def test(self, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str]) -> None: - assert converters.normalize(input_uid) == input_uid - assert all(converters.normalize(new_uid) == new_uid for new_uid in 
expected_new_uids) + async def test( + self, unicode: bool, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str] + ) -> None: + assert converters.normalize(input_uid, allow_unicode=unicode) == input_uid + assert all(converters.normalize(new_uid, allow_unicode=unicode) == new_uid for new_uid in expected_new_uids) + + categories = {TagCategory.test_preserve} if unicode else {TagCategory.test_raw} + mode = NormalizationMode.preserve if unicode else NormalizationMode.raw input_tag = TagInNormalization( uid=input_uid, parts=utils.uid_to_parts(input_uid), link="http://example.com/tag", - categories={TagCategory.test_raw}, + categories=categories, + mode=mode, ) expected_new_tags = [ diff --git a/ffun/ffun/tags/normalizers/tests/test_splitter.py b/ffun/ffun/tags/normalizers/tests/test_splitter.py index 7e1ac215..273884fd 100644 --- a/ffun/ffun/tags/normalizers/tests/test_splitter.py +++ b/ffun/ffun/tags/normalizers/tests/test_splitter.py @@ -3,58 +3,76 @@ from ffun.domain.entities import TagUid from ffun.ontology.entities import RawTag from ffun.tags import converters, utils -from ffun.tags.entities import TagCategory, TagInNormalization +from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization from ffun.tags.normalizers import splitter -normalizer = splitter.Normalizer(separators=["for", "impact-on"]) +normalizer = splitter.Normalizer(separators=["for", "impact-on", "через", "влияние-на"]) class TestNormalizer: @pytest.mark.parametrize( - "input_uid, expected_continue, expected_new_uids", + "unicode, input_uid, expected_continue, expected_new_uids", [ - ("", False, []), - ("nohtingtodo", True, []), - ("nohting-to-do", True, []), - ("checkforinside", True, []), - ("checkimpact-oninside", True, []), - ("set-up-for-success", False, ["set-up", "success"]), - ("for-x", False, ["x"]), - ("x-for-y", False, ["x", "y"]), - ("x-for", False, ["x"]), - ("social-media-impact-on-innovation", False, ["social-media", "innovation"]), - 
("impact-on-innovation", False, ["innovation"]), + (False, "", False, []), + (False, "nohtingtodo", True, []), + (False, "nohting-to-do", True, []), + (False, "checkforinside", True, []), + (False, "checkimpact-oninside", True, []), + (False, "set-up-for-success", False, ["set-up", "success"]), + (False, "for-x", False, ["x"]), + (False, "x-for-y", False, ["x", "y"]), + (False, "x-for", False, ["x"]), + (False, "social-media-impact-on-innovation", False, ["social-media", "innovation"]), + (False, "impact-on-innovation", False, ["innovation"]), ( + False, "rest-api-for-graph-processing-impact-on-innovation", False, ["rest-api", "graph-processing-impact-on-innovation", "rest-api-for-graph-processing", "innovation"], ), - ("for-impact-on", False, ["impact-on", "for"]), - ("for-for-impact-on", False, ["for", "for-impact-on", "for-for", "impact-on"]), - ("x-for-y-for-z", False, ["x", "z", "y-for-z", "x-for-y"]), + (False, "for-impact-on", False, ["impact-on", "for"]), + (False, "for-for-impact-on", False, ["for", "for-impact-on", "for-for", "impact-on"]), + (False, "x-for-y-for-z", False, ["x", "z", "y-for-z", "x-for-y"]), ( + False, "impact-on-x-impact-on-y-impact-on", False, ["x-impact-on-y-impact-on", "impact-on-x", "y-impact-on", "impact-on-x-impact-on-y"], ), - ("x-impact-on-impact-on-y", False, ["x", "impact-on-y", "x-impact-on", "y"]), + (False, "x-impact-on-impact-on-y", False, ["x", "impact-on-y", "x-impact-on", "y"]), ( + False, "for-for-impact-on-impact-on", False, ["for", "for-for", "impact-on-impact-on", "impact-on", "for-impact-on-impact-on", "for-for-impact-on"], ), + (True, "café-for-bistro", False, ["café", "bistro"]), + (True, "for-café", False, ["café"]), + (True, "данные-for-аналитика", False, ["данные", "аналитика"]), + (True, "привет-impact-on-мир", False, ["привет", "мир"]), + (True, "impact-on-метрика", False, ["метрика"]), + (True, "данные-через-аналитика", False, ["данные", "аналитика"]), + (True, "через-метрика", False, ["метрика"]), + (True, 
"привет-влияние-на-мир", False, ["привет", "мир"]), + (True, "влияние-на-метрика", False, ["метрика"]), ], ) @pytest.mark.asyncio - async def test(self, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str]) -> None: - assert converters.normalize(input_uid) == input_uid - assert all(converters.normalize(new_uid) == new_uid for new_uid in expected_new_uids) + async def test( + self, unicode: bool, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str] + ) -> None: + assert converters.normalize(input_uid, allow_unicode=unicode) == input_uid + assert all(converters.normalize(new_uid, allow_unicode=unicode) == new_uid for new_uid in expected_new_uids) + + categories = {TagCategory.test_preserve} if unicode else {TagCategory.test_raw} + mode = NormalizationMode.preserve if unicode else NormalizationMode.raw input_tag = TagInNormalization( uid=input_uid, parts=utils.uid_to_parts(input_uid), link="http://example.com/tag", - categories={TagCategory.test_raw}, + categories=categories, + mode=mode, ) expected_new_tags = [ diff --git a/ffun/ffun/tags/tests/test_converters.py b/ffun/ffun/tags/tests/test_converters.py index f99efca3..31d40c8d 100644 --- a/ffun/ffun/tags/tests/test_converters.py +++ b/ffun/ffun/tags/tests/test_converters.py @@ -20,17 +20,40 @@ def test(self, tag: str, expected: str) -> None: class TestNormalize: @pytest.mark.parametrize( - "tag, expected", + "tag, allow_unicode, expected", [ - ("abc", "abc"), - ("abc def", "abc-def"), - ("c++", "c-plus-plus"), - ("c#", "c-sharp"), - ("www.example.com", "www-dot-example-dot-com"), + ("abc", False, "abc"), + ("abc def", False, "abc-def"), + ("c++", False, "c-plus-plus"), + ("c#", False, "c-sharp"), + ("www.example.com", False, "www-dot-example-dot-com"), + ("abc", True, "abc"), + ("Café au lait", False, "cafe-au-lait"), + ("Café au lait", True, "café-au-lait"), + ("Привет мир", True, "привет-мир"), + ("ABC", True, "abc"), + ("① ② ③", True, "1-2-3"), + ("㍍", True, "メートル"), + 
("file name", True, "file-name"), ], ) - def test(self, tag: str, expected: str) -> None: - assert normalize(tag) == expected + def test(self, tag: str, allow_unicode: bool, expected: str) -> None: + assert normalize(tag, allow_unicode=allow_unicode) == expected + + @pytest.mark.parametrize( + "left, right, expected", + [ + ("Café au lait", "Cafe\u0301 au lait", "café-au-lait"), + ("ABC", "ABC", "abc"), + ("Ångström", "Ångström", "ångström"), + ("① ② ③", "1 2 3", "1-2-3"), + ("㍍", "メートル", "メートル"), + ("file name", "file name", "file-name"), + ], + ) + def test_unicode_forms_collapse_to_single_slug(self, left: str, right: str, expected: str) -> None: + assert normalize(left, allow_unicode=True) == expected + assert normalize(right, allow_unicode=True) == expected class TestDecodeSpecialCharacters: diff --git a/ffun/ffun/tags/tests/test_domain.py b/ffun/ffun/tags/tests/test_domain.py index cf4683c1..101241ca 100644 --- a/ffun/ffun/tags/tests/test_domain.py +++ b/ffun/ffun/tags/tests/test_domain.py @@ -3,12 +3,35 @@ from ffun.domain.entities import TagUid, TagUidPart from ffun.ontology.entities import NormalizedTag, RawTag -from ffun.tags.domain import apply_normalizers, normalize, prepare_for_normalization +from ffun.tags.domain import apply_normalizers, mode_from_categories, normalize, prepare_for_normalization from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization from ffun.tags.normalizers import FakeNormalizer, NormalizerAlwaysError, NormalizerInfo +from ffun.tags.normalizers.base import Normalizer from ffun.tags.utils import uid_to_parts +class TestModeFromCategories: + @pytest.mark.parametrize("category", TagCategory) + def test_each_category_has_mode(self, category: TagCategory) -> None: + assert mode_from_categories({category}) in NormalizationMode + + @pytest.mark.parametrize( + "categories, expected_mode", + [ + ({TagCategory.network_domain}, NormalizationMode.final), + ({TagCategory.special}, NormalizationMode.final), + 
({TagCategory.feed_tag}, NormalizationMode.final), + ({TagCategory.free_form}, NormalizationMode.raw), + ({TagCategory.test_final}, NormalizationMode.final), + ({TagCategory.test_preserve}, NormalizationMode.preserve), + ({TagCategory.test_raw}, NormalizationMode.raw), + ({TagCategory.network_domain, TagCategory.free_form}, NormalizationMode.final), + ], + ) + def test(self, categories: set[TagCategory], expected_mode: NormalizationMode) -> None: + assert mode_from_categories(categories) == expected_mode + + class TestPrepareForNormalization: def test(self) -> None: @@ -25,8 +48,25 @@ def test(self) -> None: parts=[TagUidPart("example"), TagUidPart("tag")], link=raw_tag.link, categories=raw_tag.categories, + mode=NormalizationMode.final, ) + @pytest.mark.parametrize( + "categories, expected_mode, expected_uid", + [ + ({TagCategory.test_raw}, NormalizationMode.raw, TagUid("cafe-au-lait")), + ({TagCategory.test_preserve}, NormalizationMode.preserve, TagUid("café-au-lait")), + ({TagCategory.test_final}, NormalizationMode.final, TagUid("café-au-lait")), + ], + ) + def test_allow_unicode_depends_on_mode( + self, categories: set[TagCategory], expected_mode: NormalizationMode, expected_uid: TagUid + ) -> None: + prepared_tag = prepare_for_normalization(RawTag(raw_uid="Café au lait", link=None, categories=categories)) + + assert prepared_tag.mode == expected_mode + assert prepared_tag.uid == expected_uid + class TestApplyNormalizers: @@ -38,6 +78,7 @@ def tag(self) -> TagInNormalization: parts=uid_to_parts(uid), link=None, categories={TagCategory.test_preserve}, + mode=NormalizationMode.preserve, ) @pytest.fixture # type: ignore @@ -62,7 +103,7 @@ async def test_no_normalizers(self, tag: TagInNormalization) -> None: async def test_single_normalizer__preserve( self, tag_valid: bool, tag: TagInNormalization, raw_tags: list[RawTag] ) -> None: - tag = tag.replace(categories={TagCategory.test_preserve}) + tag = tag.replace(categories={TagCategory.test_preserve}, 
mode=NormalizationMode.preserve) normalizer = FakeNormalizer(tag_valid, raw_tags) info = NormalizerInfo(id=1, name="fake", normalizer=normalizer) @@ -76,7 +117,7 @@ async def test_single_normalizer__preserve( async def test_single_normalizer__raw( self, tag_valid: bool, tag: TagInNormalization, raw_tags: list[RawTag] ) -> None: - tag = tag.replace(categories={TagCategory.test_raw}) + tag = tag.replace(categories={TagCategory.test_raw}, mode=NormalizationMode.raw) normalizer = FakeNormalizer(tag_valid, raw_tags) info = NormalizerInfo(id=1, name="fake", normalizer=normalizer) @@ -90,7 +131,7 @@ async def test_single_normalizer__raw( async def test_single_normalizer__final( self, tag_valid: bool, tag: TagInNormalization, raw_tags: list[RawTag] ) -> None: - tag = tag.replace(categories={TagCategory.test_final}) + tag = tag.replace(categories={TagCategory.test_final}, mode=NormalizationMode.final) normalizer = FakeNormalizer(tag_valid, raw_tags) info = NormalizerInfo(id=1, name="fake", normalizer=normalizer) @@ -101,7 +142,7 @@ async def test_single_normalizer__final( @pytest.mark.asyncio async def test_chain_of_normalizers__preserve(self, tag: TagInNormalization, raw_tags: list[RawTag]) -> None: - tag = tag.replace(categories={TagCategory.test_preserve}) + tag = tag.replace(categories={TagCategory.test_preserve}, mode=NormalizationMode.preserve) normalizers = [ FakeNormalizer(True, [raw_tags[0]]), @@ -119,7 +160,7 @@ async def test_chain_of_normalizers__preserve(self, tag: TagInNormalization, raw @pytest.mark.asyncio async def test_chain_of_normalizers__not_preserve(self, tag: TagInNormalization, raw_tags: list[RawTag]) -> None: - tag = tag.replace(categories={TagCategory.test_raw}) + tag = tag.replace(categories={TagCategory.test_raw}, mode=NormalizationMode.raw) normalizers = [ FakeNormalizer(True, [raw_tags[0]]), @@ -177,6 +218,24 @@ def fake_normalize(_self: object, _tag: TagInNormalization) -> tuple[bool, list[ async def test_no_tags(self) -> None: assert await 
normalize([]) == [] + @pytest.mark.asyncio + async def test_final_tag_skips_normalizers(self) -> None: + calls = [] + + class RecordingNormalizer(Normalizer): + async def normalize(self, tag: TagInNormalization) -> tuple[bool, list[RawTag]]: + calls.append(tag.uid) + return True, [RawTag(raw_uid="unexpected-child", categories={TagCategory.test_raw})] + + info = NormalizerInfo(id=1, name="recording", normalizer=RecordingNormalizer()) + + result = await normalize([RawTag(raw_uid="Café au lait", categories={TagCategory.test_final})], [info]) + + assert calls == [] + assert result == [ + NormalizedTag(uid=TagUid("café-au-lait"), link=None, categories={TagCategory.test_final}), + ] + @pytest.mark.parametrize( "raw_uid,norm_uid", [ @@ -295,7 +354,6 @@ async def test_no_normalizers(self) -> None: assert resulted == expected - # TODO: add test_final somewhere here? @pytest.mark.asyncio async def test_tags_chain(self, mocker: MockerFixture) -> None: # pylint: disable=R0914 tag_1 = RawTag(raw_uid="tag-1", link=None, categories={TagCategory.test_preserve}) diff --git a/ffun/ffun/tags/tests/test_entities.py b/ffun/ffun/tags/tests/test_entities.py deleted file mode 100644 index 8c7dbba1..00000000 --- a/ffun/ffun/tags/tests/test_entities.py +++ /dev/null @@ -1,18 +0,0 @@ -import pytest - -from ffun.domain.entities import TagUid, TagUidPart -from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization - - -class TestTagInNormalization: - - @pytest.mark.parametrize("category", TagCategory) - def test_each_tag_category_has_mode(self, category: TagCategory) -> None: - tag = TagInNormalization( - uid=TagUid("example-tag"), - parts=[TagUidPart("example"), TagUidPart("tag")], - link=None, - categories={category}, - ) - - assert tag.mode in NormalizationMode