From f86a1fe91cf3179201c9c3d525274c805f4191bd Mon Sep 17 00:00:00 2001 From: fatelei Date: Tue, 30 Dec 2025 17:44:26 +0800 Subject: [PATCH 1/3] fix: fix extractor loses hyperlink text and target URL --- api/core/rag/cleaner/clean_processor.py | 44 +++++++++++++------ api/core/tools/utils/text_processing_utils.py | 6 +++ 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/api/core/rag/cleaner/clean_processor.py b/api/core/rag/cleaner/clean_processor.py index 9cb009035bfc9d..e182c35b9990ee 100644 --- a/api/core/rag/cleaner/clean_processor.py +++ b/api/core/rag/cleaner/clean_processor.py @@ -27,26 +27,44 @@ def clean(cls, text: str, process_rule: dict) -> str: pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)" text = re.sub(pattern, "", text) - # Remove URL but keep Markdown image URLs - # First, temporarily replace Markdown image URLs with a placeholder - markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)" - placeholders: list[str] = [] + # Remove URL but keep Markdown image URLs and link URLs + # Replace the ENTIRE markdown link/image with a single placeholder to protect + # the link text (which might also be a URL) from being removed + markdown_link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)" + markdown_image_pattern = r"!\[.*?\]\((https?://[^)]+)\)" + placeholders: list[tuple[str, str, str]] = [] # (type, text, url) - def replace_with_placeholder(match, placeholders=placeholders): + def replace_markdown_with_placeholder(match, placeholders=placeholders): + link_type = "link" + link_text = match.group(1) + url = match.group(2) + placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__" + placeholders.append((link_type, link_text, url)) + return placeholder + + def replace_image_with_placeholder(match, placeholders=placeholders): + link_type = "image" url = match.group(1) - placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__" - placeholders.append(url) - return f"![image]({placeholder})" + placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__" + placeholders.append((link_type, "image", url)) + return placeholder - text = re.sub(markdown_image_pattern, replace_with_placeholder, text) + # Protect markdown links first + text = re.sub(markdown_link_pattern, replace_markdown_with_placeholder, text) + # Then protect markdown images + text = re.sub(markdown_image_pattern, replace_image_with_placeholder, text) # Now remove all remaining URLs - url_pattern = r"https?://[^\s)]+" + url_pattern = r"https?://\S+" text = re.sub(url_pattern, "", text) - # Finally, restore the Markdown image URLs - for i, url in enumerate(placeholders): - text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url) + # Restore the Markdown links and images + for i, (link_type, text_or_alt, url) in enumerate(placeholders): + placeholder = f"__MARKDOWN_PLACEHOLDER_{i}__" + if link_type == "link": + text = text.replace(placeholder, f"[{text_or_alt}]({url})") + else: # image + text = text.replace(placeholder, f"![{text_or_alt}]({url})") return text def filter_string(self, text): diff --git a/api/core/tools/utils/text_processing_utils.py b/api/core/tools/utils/text_processing_utils.py index 0f9a91a111f89d..43425d78a55a2f 100644 --- a/api/core/tools/utils/text_processing_utils.py +++ b/api/core/tools/utils/text_processing_utils.py @@ -4,6 +4,7 @@ def remove_leading_symbols(text: str) -> str: """ Remove leading punctuation or symbols from the given text. + Preserves markdown links like [text](url) at the start. Args: text (str): The input text to process. @@ -11,6 +12,11 @@ def remove_leading_symbols(text: str) -> str: Returns: str: The text with leading punctuation or symbols removed. """ + # Check if text starts with a markdown link - preserve it + markdown_link_pattern = r'^\[([^\]]+)\]\((https?://[^)]+)\)' + if re.match(markdown_link_pattern, text): + return text + # Match Unicode ranges for punctuation and symbols # FIXME this pattern is confused quick fix for #11868 maybe refactor it later pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+' From 5a8635c492972fb14d2171800db04d42479ca391 Mon Sep 17 00:00:00 2001 From: fatelei Date: Tue, 30 Dec 2025 23:10:24 +0800 Subject: [PATCH 2/3] chore: add unittest --- .../unit_tests/core/rag/cleaner/__init__.py | 0 .../core/rag/cleaner/test_clean_processor.py | 268 ++++++++++++++++++ .../unit_tests/utils/test_text_processing.py | 5 + 3 files changed, 273 insertions(+) create mode 100644 api/tests/unit_tests/core/rag/cleaner/__init__.py create mode 100644 api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py diff --git a/api/tests/unit_tests/core/rag/cleaner/__init__.py b/api/tests/unit_tests/core/rag/cleaner/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py new file mode 100644 index 00000000000000..8d172118ebc790 --- /dev/null +++ b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py @@ -0,0 +1,268 @@ + +from core.rag.cleaner.clean_processor import CleanProcessor + + +class TestCleanProcessor: + """Test cases for CleanProcessor.clean method.""" + + def test_clean_default_removal_of_invalid_symbols(self): + """Test default cleaning removes invalid symbols.""" + # Test <| replacement + assert CleanProcessor.clean("text<|with<|invalid", None) == "text replacement + assert CleanProcessor.clean("text|>with|>invalid", None) == "text>with>invalid" + + # Test removal of control characters + text_with_control = "normal\x00text\x1Fwith\x07control\x7Fchars" + expected = "normaltextwithcontrolchars" + assert CleanProcessor.clean(text_with_control, None) == expected + + # Test U+FFFE removal + text_with_ufffe = "normal\ufffepadding" + expected = "normalpadding" + assert CleanProcessor.clean(text_with_ufffe, None) == expected + + def test_clean_with_none_process_rule(self): + """Test cleaning with None process_rule - only default cleaning applied.""" + text = "Hello<|World\x00" + expected = "Hello becomes >, control chars and U+FFFE are removed + assert CleanProcessor.clean(text, None) == "<<>>" + + def test_clean_multiple_markdown_links_preserved(self): + """Test multiple markdown links are all preserved.""" + process_rule = { + "rules": { + "pre_processing_rules": [ + {"id": "remove_urls_emails", "enabled": True} + ] + } + } + + text = "[One](https://one.com) [Two](http://two.org) [Three](https://three.net)" + expected = "[One](https://one.com) [Two](http://two.org) [Three](https://three.net)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_text_as_url(self): + """Test markdown link where the link text itself is a URL.""" + process_rule = { + "rules": { + "pre_processing_rules": [ + {"id": "remove_urls_emails", "enabled": True} + ] + } + } + + # Link text that looks like URL should be preserved + text = "[https://text-url.com](https://actual-url.com)" + expected = "[https://text-url.com](https://actual-url.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + # Text URL without markdown should be removed + text = "https://text-url.com https://actual-url.com" + expected = " " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_complex_markdown_link_content(self): + """Test markdown links with complex content - known limitation with brackets in link text.""" + process_rule = { + "rules": { + "pre_processing_rules": [ + {"id": "remove_urls_emails", "enabled": True} + ] + } + } + + # Note: The regex pattern [^\]]* cannot handle ] within link text + # This is a known limitation - the pattern stops at the first ] + text = "[Text with [brackets] and (parens)](https://example.com)" + # Actual behavior: only matches up to first ], URL gets removed + expected = "[Text with [brackets] and (parens)](" + assert CleanProcessor.clean(text, process_rule) == expected + + # Test that properly formatted markdown links work + text = "[Text with (parens) and symbols](https://example.com)" + expected = "[Text with (parens) and symbols](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected diff --git a/api/tests/unit_tests/utils/test_text_processing.py b/api/tests/unit_tests/utils/test_text_processing.py index 11e017464adc4b..bf61162a66bf8e 100644 --- a/api/tests/unit_tests/utils/test_text_processing.py +++ b/api/tests/unit_tests/utils/test_text_processing.py @@ -15,6 +15,11 @@ ("", ""), (" ", " "), ("【测试】", "【测试】"), + # Markdown link preservation - should be preserved if text starts with a markdown link + ("[Google](https://google.com) is a search engine", "[Google](https://google.com) is a search engine"), + ("[Example](http://example.com) some text", "[Example](http://example.com) some text"), + # Leading symbols before markdown link are removed, including the opening bracket [ + ("@[Test](https://example.com)", "Test](https://example.com)"), ], ) def test_remove_leading_symbols(input_text, expected_output): From aa293bb0c28d8a4d359252c422b787ba71436da0 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Wed, 31 Dec 2025 01:58:12 +0000 Subject: [PATCH 3/3] [autofix.ci] apply automated fixes --- api/core/tools/utils/text_processing_utils.py | 2 +- .../core/rag/cleaner/test_clean_processor.py | 79 +++---------------- 2 files changed, 13 insertions(+), 68 deletions(-) diff --git a/api/core/tools/utils/text_processing_utils.py b/api/core/tools/utils/text_processing_utils.py index 43425d78a55a2f..4bfaa5e49bd345 100644 --- a/api/core/tools/utils/text_processing_utils.py +++ b/api/core/tools/utils/text_processing_utils.py @@ -13,7 +13,7 @@ def remove_leading_symbols(text: str) -> str: str: The text with leading punctuation or symbols removed. """ # Check if text starts with a markdown link - preserve it - markdown_link_pattern = r'^\[([^\]]+)\]\((https?://[^)]+)\)' + markdown_link_pattern = r"^\[([^\]]+)\]\((https?://[^)]+)\)" if re.match(markdown_link_pattern, text): return text diff --git a/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py index 8d172118ebc790..65ee62b8dd2032 100644 --- a/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py +++ b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py @@ -1,4 +1,3 @@ - from core.rag.cleaner.clean_processor import CleanProcessor @@ -14,7 +13,7 @@ def test_clean_default_removal_of_invalid_symbols(self): assert CleanProcessor.clean("text|>with|>invalid", None) == "text>with>invalid" # Test removal of control characters - text_with_control = "normal\x00text\x1Fwith\x07control\x7Fchars" + text_with_control = "normal\x00text\x1fwith\x07control\x7fchars" expected = "normaltextwithcontrolchars" assert CleanProcessor.clean(text_with_control, None) == expected @@ -43,13 +42,7 @@ def test_clean_with_empty_rules(self): def test_clean_remove_extra_spaces_enabled(self): """Test remove_extra_spaces rule when enabled.""" - process_rule = { - "rules": { - "pre_processing_rules": [ - {"id": "remove_extra_spaces", "enabled": True} - ] - } - } + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} # Test multiple newlines reduced to two text = "Line1\n\n\n\n\nLine2" @@ -68,13 +61,7 @@ def test_clean_remove_extra_spaces_enabled(self): def test_clean_remove_extra_spaces_disabled(self): """Test remove_extra_spaces rule when disabled.""" - process_rule = { - "rules": { - "pre_processing_rules": [ - {"id": "remove_extra_spaces", "enabled": False} - ] - } - } + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": False}]}} text = "Line1\n\n\n\n\nLine2 with spaces" # Should only apply default cleaning (no invalid symbols here) @@ -82,13 +69,7 @@ def test_clean_remove_extra_spaces_disabled(self): def test_clean_remove_urls_emails_enabled(self): """Test remove_urls_emails rule when enabled.""" - process_rule = { - "rules": { - "pre_processing_rules": [ - {"id": "remove_urls_emails", "enabled": True} - ] - } - } + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} # Test email removal text = "Contact us at test@example.com for more info" @@ -107,13 +88,7 @@ def test_clean_remove_urls_emails_enabled(self): def test_clean_preserve_markdown_links_and_images(self): """Test that markdown links and images are preserved when removing URLs.""" - process_rule = { - "rules": { - "pre_processing_rules": [ - {"id": "remove_urls_emails", "enabled": True} - ] - } - } + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} # Test markdown link preservation text = "Check [Google](https://google.com) for info" @@ -142,13 +117,7 @@ def test_clean_preserve_markdown_links_and_images(self): def test_clean_remove_urls_emails_disabled(self): """Test remove_urls_emails rule when disabled.""" - process_rule = { - "rules": { - "pre_processing_rules": [ - {"id": "remove_urls_emails", "enabled": False} - ] - } - } + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": False}]}} text = "Email test@example.com visit https://example.com" # Should only apply default cleaning @@ -160,7 +129,7 @@ def test_clean_both_rules_enabled(self): "rules": { "pre_processing_rules": [ {"id": "remove_extra_spaces", "enabled": True}, - {"id": "remove_urls_emails", "enabled": True} + {"id": "remove_urls_emails", "enabled": True}, ] } } @@ -175,7 +144,7 @@ def test_clean_with_markdown_link_and_extra_spaces(self): "rules": { "pre_processing_rules": [ {"id": "remove_extra_spaces", "enabled": True}, - {"id": "remove_urls_emails", "enabled": True} + {"id": "remove_urls_emails", "enabled": True}, ] } } @@ -186,13 +155,7 @@ def test_clean_with_markdown_link_and_extra_spaces(self): def test_clean_unknown_rule_id_ignored(self): """Test that unknown rule IDs are silently ignored.""" - process_rule = { - "rules": { - "pre_processing_rules": [ - {"id": "unknown_rule", "enabled": True} - ] - } - } + process_rule = {"rules": {"pre_processing_rules": [{"id": "unknown_rule", "enabled": True}]}} text = "Hello<|World\x00" expected = "Hello