Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changes/next_release.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

- ff-680 (gh-507) — Allowed unicode characters in the native feed tags to support non-English tags that are not owned by Feeds Fun.
2 changes: 0 additions & 2 deletions changes/unreleased.md

This file was deleted.

40 changes: 20 additions & 20 deletions ffun/ffun/tags/converters.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
import re

from slugify import slugify

from ffun.domain.entities import TagUid

DISALLOWED_CHARS_PATTERN = re.compile(r"[^-a-zA-Z0-9]+")


_encode_replacements = {
"#": "-sharp-", # c# -> c-sharp
"+": "-plus-", # c++ -> c-plus-plus
Expand Down Expand Up @@ -48,25 +43,30 @@ def _decode_special_characters(tag: str) -> str:
return "".join(result)


def normalize(tag: str, allow_unicode: bool) -> TagUid:
    """Normalize a raw tag string into a canonical TagUid slug.

    Lowercases the tag, encodes special characters (e.g. "#" -> "-sharp-",
    "+" -> "-plus-") via _encode_special_characters, and slugifies the result.

    :param tag: the raw tag text to normalize.
    :param allow_unicode: when True, unicode characters are kept in the slug
        (used for tags we do not control, e.g. native feed tags); when False,
        the result is pure ASCII.
    """
    # NOTE: the diff-page source interleaved the pre- and post-change bodies here;
    # this is the merged (post-change) version reconstructed as valid Python.
    tag = tag.lower()

    tag = _encode_special_characters(tag)

    # Note: with allow_unicode=True slugify normalizes unicode to NFKC.
    # If in the future we decide to change the slugification library,
    # we should either ensure that behavior or renormalize tags in the database.
    return TagUid(
        slugify(
            tag,
            entities=True,
            decimal=True,
            hexadecimal=True,
            max_length=0,
            word_boundary=False,
            save_order=True,
            separator="-",
            stopwords=(),
            regex_pattern=None,
            lowercase=True,
            replacements=(),
            allow_unicode=allow_unicode,
        )
    )


Expand Down
70 changes: 65 additions & 5 deletions ffun/ffun/tags/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,81 @@

from ffun.ontology.entities import NormalizedTag, RawTag
from ffun.tags import converters, utils
from ffun.tags.entities import NormalizationMode, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers import NormalizerInfo, normalizers


# Some context on this admittedly convoluted logic:
# - Normalizers operate on TagInNormalization.
# - TagInNormalization must define how it should be processed by normalizers.
# - There are two options: define the mode explicitly or derive it from categories.
# - Explicit definition looks attractive: a normalizer could say
#   "I produce this new tag, which should be processed as raw/preserve/final".
# - But that approach creates uncertainty when we re-normalize tags in the database,
#   because we do not store the final normalization mode in the database
#   (and it may be wrong in case of re-normalization).
#   So, on re-normalization, we derive the mode from tag categories (again).
#   We also use RawTag, not TagInNormalization, as the result of running a normalizer.
# - That's why it seems more consistent to build the logic of normalizers around categories only,
#   to keep the behavior consistent across the whole system.
# => We expect that a normalizer, if needed, will be able to set new categories for the tags it produces.
#    For example, there may be a normalizer that detects network domains in free-form tags.
def mode_from_categories(categories: set[TagCategory]) -> NormalizationMode:
    """Derive the normalization mode for a tag from its categories.

    The first matching entry of the priority table wins, so the order of
    entries below is significant.
    """
    # We do not normalize native feed tags (feed_tag -> final), because:
    # - We have no control over the logic that assigns them
    # - Sometimes they are (semi-)technical (special terms, domain names, codes)
    # - Sometimes they are very specific, like r-sideproject (for subreddits),
    #   and we don't want to create a duplicated tag like r-sideprojects that actually has no meaning
    priority_table = (
        (TagCategory.network_domain, NormalizationMode.final),
        (TagCategory.special, NormalizationMode.final),
        (TagCategory.feed_tag, NormalizationMode.final),
        (TagCategory.free_form, NormalizationMode.raw),
        (TagCategory.test_final, NormalizationMode.final),
        (TagCategory.test_preserve, NormalizationMode.preserve),
        (TagCategory.test_raw, NormalizationMode.raw),
    )

    for category, mode in priority_table:
        if category in categories:
            return mode

    raise NotImplementedError(f"Tag with unknown categories: {categories}")


def prepare_for_normalization(tag: RawTag) -> TagInNormalization:
    """Convert a RawTag into a TagInNormalization ready for the normalizers pipeline.

    1. We normalize uids even for final tags:
       - if everything works well, they remain unchanged;
       - if something is off, we stop error propagation right here.
    2. Text normalization is kept outside of the normalizers list, since:
       - it is a common step for all tags, and we don't want to repeat it in each normalizer;
       - it is a preparation step for normalizers rather than a normalizer itself —
         for example, the .parts field is filled on the basis of the normalized uid.
    """
    mode = mode_from_categories(tag.categories)

    # Raw tags must stay pure ASCII — no unicode characters allowed there, at least for now.
    # This may change in https://github.com/Tiendil/feeds.fun/issues/348
    unicode_is_allowed = mode != NormalizationMode.raw

    normalized_uid = converters.normalize(tag.raw_uid, allow_unicode=unicode_is_allowed)

    return TagInNormalization(
        uid=normalized_uid,
        parts=utils.uid_to_parts(normalized_uid),
        link=tag.link,
        categories=set(tag.categories),
        mode=mode,
    )


Expand Down
48 changes: 1 addition & 47 deletions ffun/ffun/tags/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,53 +52,7 @@ class TagInNormalization(BaseEntity):

link: str | None
categories: TagCategories

# Here is going some complicated unclear logic:
# - normalizers work with TagInNormalization
# - TagInNormalization should define how it should be processed by normalizers
# - There are two options: define explicitly or implicitly (derive from categories)
# - It may look like a good idea to define it explicitly, so we could have a normalizer
# that could say "I produce this new tag which should be processed as raw/preserve/final"
# - But this approach leads to uncertainty when we do re-normalization of tags in the database
# because we don't store the final normalization mode in the database
# (and it may be wrong in case of re-normalization)
# So, on re-normalization we use tag categories to derive the mode (again)
# We also use RawTag, not TagInNormalization as a result of running a normalizer.
# - That's why it seems more consistent to build the logic of normalizers around categories only,
# to keep the behavior consistent across the whole system
# => We expect that a normalizer, if needed, will be able to set new categories for the tags it produces
# For example, there may be a normalizer that detects network domains in free-form tags
@property
def mode(self) -> NormalizationMode: # noqa: CCR001
# The order of checks is important here

if TagCategory.network_domain in self.categories:
return NormalizationMode.final

if TagCategory.special in self.categories:
return NormalizationMode.final

# We do not normalize native feed tags, because:
# - We have no control over the logic that assigns them
# - Sometimes they are (semi-)technical (special terms, domain names, codes)
# - Sometimes they are very specific, like r-sideproject (for subreddits)
# and we don't want to create a duplicated tag like r-sideprojects that actually has no meaning
if TagCategory.feed_tag in self.categories:
return NormalizationMode.final

if TagCategory.free_form in self.categories:
return NormalizationMode.raw

if TagCategory.test_final in self.categories:
return NormalizationMode.final

if TagCategory.test_preserve in self.categories:
return NormalizationMode.preserve

if TagCategory.test_raw in self.categories:
return NormalizationMode.raw

raise NotImplementedError(f"Tag with unknown categories: {self.categories}")
mode: NormalizationMode


class NormalizerType(enum.StrEnum):
Expand Down
3 changes: 2 additions & 1 deletion ffun/ffun/tags/normalizers/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ffun.domain.entities import TagUid
from ffun.ontology.entities import RawTag
from ffun.tags import utils
from ffun.tags.entities import TagCategory, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers.base import FakeNormalizer, NormalizerAlwaysError, NormalizerInfo


Expand Down Expand Up @@ -62,6 +62,7 @@ def tag(self) -> TagInNormalization:
parts=utils.uid_to_parts(uid),
link=None,
categories={TagCategory.test_preserve},
mode=NormalizationMode.preserve,
)

@pytest.fixture # type: ignore
Expand Down
47 changes: 44 additions & 3 deletions ffun/ffun/tags/normalizers/tests/test_form_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ffun.domain.entities import TagUid
from ffun.ontology.entities import RawTag
from ffun.tags import converters, utils
from ffun.tags.entities import TagCategory, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers import form_normalizer

normalizer = form_normalizer.Normalizer()
Expand Down Expand Up @@ -176,14 +176,15 @@ class TestNormalizer:
)
@pytest.mark.asyncio
async def test(self, input_uid: TagUid, expected_tag_valid: bool, expected_new_uids: list[str]) -> None:
assert converters.normalize(input_uid) == input_uid
assert all(converters.normalize(new_uid) == new_uid for new_uid in expected_new_uids)
assert converters.normalize(input_uid, allow_unicode=False) == input_uid
assert all(converters.normalize(new_uid, allow_unicode=False) == new_uid for new_uid in expected_new_uids)

input_tag = TagInNormalization(
uid=input_uid,
parts=utils.uid_to_parts(input_uid),
link="http://example.com/tag",
categories={TagCategory.test_raw},
mode=NormalizationMode.raw,
)

expected_new_tags = [
Expand All @@ -203,6 +204,45 @@ async def test(self, input_uid: TagUid, expected_tag_valid: bool, expected_new_u
assert tag_valid == expected_tag_valid
assert new_tags == expected_new_tags

@pytest.mark.parametrize(
    "input_uid, expected_tag_valid, expected_new_uids",
    [
        ("café-reviews", True, []),
        ("résumé-services", True, []),
        ("привет-миры", True, []),
        ("данные-аналитики", True, []),
        ("cafés-review", False, ["cafés-reviews"]),
    ],
)
@pytest.mark.asyncio
async def test_unicode_input_is_safe(
    self, input_uid: TagUid, expected_tag_valid: bool, expected_new_uids: list[str]
) -> None:
    # Sanity check on the test data: every uid must already be in normalized (unicode) form.
    assert converters.normalize(input_uid, allow_unicode=True) == input_uid
    for new_uid in expected_new_uids:
        assert converters.normalize(new_uid, allow_unicode=True) == new_uid

    input_tag = TagInNormalization(
        uid=input_uid,
        parts=utils.uid_to_parts(input_uid),
        link="http://example.com/tag",
        categories={TagCategory.test_preserve},
        mode=NormalizationMode.preserve,
    )

    # New tags produced by the normalizer inherit the link and categories of the input tag.
    expected_new_tags = [
        RawTag(raw_uid=new_uid, link=input_tag.link, categories=input_tag.categories)
        for new_uid in expected_new_uids
    ]

    tag_valid, new_tags = await normalizer.normalize(input_tag)

    assert tag_valid == expected_tag_valid
    assert new_tags == expected_new_tags

@pytest.mark.skipif(reason="Performance test disabled by default.")
@pytest.mark.asyncio
async def test_performance(self) -> None:
Expand All @@ -214,6 +254,7 @@ async def test_performance(self) -> None:
parts=utils.uid_to_parts(TagUid(input_uid)),
link="http://example.com/tag",
categories={TagCategory.test_raw},
mode=NormalizationMode.raw,
)
for input_uid in [
"book-cover-review",
Expand Down
68 changes: 39 additions & 29 deletions ffun/ffun/tags/normalizers/tests/test_part_blacklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,60 @@
from ffun.domain.entities import TagUid
from ffun.ontology.entities import RawTag
from ffun.tags import converters, utils
from ffun.tags.entities import TagCategory, TagInNormalization
from ffun.tags.entities import NormalizationMode, TagCategory, TagInNormalization
from ffun.tags.normalizers import part_blacklist

# Module-level normalizer under test. The blacklist includes both English and
# Russian stop-parts so the unicode test cases below are exercised too.
# NOTE: the diff-page source interleaved the pre- and post-change lines here;
# this is the merged (post-change) version.
normalizer = part_blacklist.Normalizer(blacklist={"a", "the", "и", "очень"})


class TestNormalizer:
@pytest.mark.parametrize(
"input_uid, expected_continue, expected_new_uids",
"unicode, input_uid, expected_continue, expected_new_uids",
[
("", False, []),
("a-the", False, []),
("no-removal", True, []),
("noremoval-at-all", True, []),
("the-best-startup", False, ["best-startup"]),
("about-the-best", False, ["about-best"]),
("about-best-the", False, ["about-best"]),
("a-or-the", False, ["or"]),
("a-the-best-of-the-best", False, ["best-of-best"]),
("athe-best", True, []),
("thea-best", True, []),
("best-thea", True, []),
("best-athe", True, []),
("know-thea-best", True, []),
("know-athe-best", True, []),
("the-the-the", False, []),
("a-a-a", False, []),
("the-a-the-a", False, []),
("a-the-a-the", False, []),
("the-a-the-a-the", False, []),
("best-the-a-the-a-the", False, ["best"]),
("math-the-a-the-a-physics", False, ["math-physics"]),
(False, "", False, []),
(False, "a-the", False, []),
(False, "no-removal", True, []),
(False, "noremoval-at-all", True, []),
(False, "the-best-startup", False, ["best-startup"]),
(False, "about-the-best", False, ["about-best"]),
(False, "about-best-the", False, ["about-best"]),
(False, "a-or-the", False, ["or"]),
(False, "a-the-best-of-the-best", False, ["best-of-best"]),
(False, "athe-best", True, []),
(False, "thea-best", True, []),
(False, "best-thea", True, []),
(False, "best-athe", True, []),
(False, "know-thea-best", True, []),
(False, "know-athe-best", True, []),
(False, "the-the-the", False, []),
(False, "a-a-a", False, []),
(False, "the-a-the-a", False, []),
(False, "a-the-a-the", False, []),
(False, "the-a-the-a-the", False, []),
(False, "best-the-a-the-a-the", False, ["best"]),
(False, "math-the-a-the-a-physics", False, ["math-physics"]),
(True, "данные-и-аналитика", False, ["данные-аналитика"]),
(True, "résumé-и-портфолио", False, ["résumé-портфолио"]),
(True, "очень-café-уютно", False, ["café-уютно"]),
(True, "café-и-bistro", False, ["café-bistro"]),
],
)
@pytest.mark.asyncio
async def test(self, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str]) -> None:
assert converters.normalize(input_uid) == input_uid
assert all(converters.normalize(new_uid) == new_uid for new_uid in expected_new_uids)
async def test(
self, unicode: bool, input_uid: TagUid, expected_continue: bool, expected_new_uids: list[str]
) -> None:
assert converters.normalize(input_uid, allow_unicode=unicode) == input_uid
assert all(converters.normalize(new_uid, allow_unicode=unicode) == new_uid for new_uid in expected_new_uids)

categories = {TagCategory.test_preserve} if unicode else {TagCategory.test_raw}
mode = NormalizationMode.preserve if unicode else NormalizationMode.raw

input_tag = TagInNormalization(
uid=input_uid,
parts=utils.uid_to_parts(input_uid),
link="http://example.com/tag",
categories={TagCategory.test_raw},
categories=categories,
mode=mode,
)

expected_new_tags = [
Expand Down
Loading
Loading