Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions app/agentic_ai/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Pytest conftest: make the repository root importable for test modules."""

import os
import sys


# The repository root sits two directories above this file
# (app/agentic_ai/conftest.py -> repo root).
ROOT_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)

# Prepend rather than append so in-repo packages shadow any installed copies;
# guard against duplicates when conftest is imported more than once.
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
2 changes: 2 additions & 0 deletions app/agentic_ai/context/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Shared context utilities for agentic_ai."""

Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from .google_factcheck_gatherer import GoogleFactCheckGatherer

__all__ = ["GoogleFactCheckGatherer"]

__all__ = [
"GoogleFactCheckGatherer"
]
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

pytest_plugins = ('pytest_asyncio',)

from app.ai.context.factcheckapi import GoogleFactCheckGatherer
from app.agentic_ai.context.factcheckapi import GoogleFactCheckGatherer
from app.models import ExtractedClaim, ClaimSource


Expand Down Expand Up @@ -358,120 +358,3 @@ async def test_gather_portuguese_claim():

print(f"{'=' * 80}\n")


# ===== INTEGRATION WITH EVIDENCE RETRIEVAL PIPELINE =====

@pytest.mark.asyncio
async def test_compose_with_other_gatherers():
    """should work alongside other evidence gatherers

    Integration test: runs the evidence-retrieval pipeline with only the
    Google Fact Check gatherer plugged in, then verifies that every citation
    attached to the claim came from that gatherer and that any rating was
    mapped to one of the expected Portuguese labels.

    NOTE(review): exercises the live Google Fact Check API through the
    pipeline — results depend on network availability and index contents.
    """
    # Imported lazily so test collection does not require the pipeline package.
    from app.ai.pipeline.evidence_retrieval import gather_evidence_async
    from app.models import EvidenceRetrievalInput

    claim = ExtractedClaim(
        id="claim-compose-001",
        text="The moon landing was faked",
        source=ClaimSource(
            source_type="original_text",
            source_id="msg-compose-001"
        )
    )

    retrieval_input = EvidenceRetrievalInput(claims=[claim])

    # use google fact-check gatherer
    google_gatherer = GoogleFactCheckGatherer(max_results=3)

    result = await gather_evidence_async(
        retrieval_input,
        gatherers=[google_gatherer]
    )

    # validate result: the pipeline must have produced an entry for our claim
    assert claim.id in result.claim_evidence_map
    enriched = result.claim_evidence_map[claim.id]

    # all citations should be from google with proper rating mapping
    print(f"\n{'=' * 80}")
    print(f"TEST: Compose Google Gatherer with Pipeline")
    print(f"{'=' * 80}")
    print(f"Claim: {enriched.text}")
    print(f"Citations from Google: {len(enriched.citations)}")

    for i, citation in enumerate(enriched.citations, 1):
        print(f" Citation {i}: {citation.title[:60]}...")
        print(f" Rating: {citation.rating}")
        # only the Google gatherer ran, so every citation must carry its tag
        assert citation.source == "google_fact_checking_api"
        if citation.rating:
            print(f" ✓ Rating mapped to Portuguese: {citation.rating}")
            # ratings, when present, must already be normalized to Portuguese
            assert citation.rating in ["Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar"]
        else:
            print(f" ⚠ No rating available")

    print(f"{'=' * 80}\n")


@pytest.mark.asyncio
async def test_combine_google_and_web_search():
    """should combine google fact-check with web search results

    Integration test: runs the pipeline with both the Google Fact Check
    gatherer and the web-search gatherer, tallies citations per source, and
    checks that every Google citation's rating (when present) is one of the
    expected Portuguese labels.

    NOTE(review): depends on live Google Fact Check and web-search services;
    citation counts will vary run to run.
    """
    # Imported lazily so test collection does not require the pipeline package.
    from app.ai.pipeline.evidence_retrieval import (
        gather_evidence_async,
        WebSearchGatherer
    )
    from app.models import EvidenceRetrievalInput

    claim = ExtractedClaim(
        id="claim-multi-001",
        text="Drinking lemon water helps weight loss",
        source=ClaimSource(
            source_type="original_text",
            source_id="msg-multi-001"
        )
    )

    retrieval_input = EvidenceRetrievalInput(claims=[claim])

    # use both gatherers so the pipeline merges two evidence sources
    google_gatherer = GoogleFactCheckGatherer(max_results=3)
    web_gatherer = WebSearchGatherer(max_results=3)

    result = await gather_evidence_async(
        retrieval_input,
        gatherers=[google_gatherer, web_gatherer]
    )

    enriched = result.claim_evidence_map[claim.id]

    # should have citations from both sources
    sources = {cit.source for cit in enriched.citations}

    print(f"\n{'=' * 80}")
    print(f"TEST: Combine Google + Web Search")
    print(f"{'=' * 80}")
    print(f"Claim: {enriched.text}")
    print(f"Total citations: {len(enriched.citations)}")
    print(f"Sources used: {sources}")

    # count citations by source and validate google ratings
    google_count = 0
    web_count = 0

    print(f"\nCitation details:")
    for i, cit in enumerate(enriched.citations, 1):
        if cit.source == "google_fact_checking_api":
            google_count += 1
            print(f" {i}. [Google] {cit.title[:50]}...")
            print(f" Rating: {cit.rating}")
            # validate rating mapping for google citations
            if cit.rating:
                print(f" ✓ Rating mapped to Portuguese: {cit.rating}")
                assert cit.rating in ["Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar"]
            else:
                print(f" ⚠ No rating available")
        elif cit.source == "apify_web_search":
            web_count += 1
            print(f" {i}. [Web Search] {cit.title[:50]}...")

    print(f"\nSummary:")
    print(f" Google Fact-Check: {google_count}")
    print(f" Web Search: {web_count}")
    print(f"{'=' * 80}\n")
2 changes: 2 additions & 0 deletions app/agentic_ai/context/web/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""agentic_ai web context utilities."""

Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from bs4 import BeautifulSoup
from apify_client import ApifyClientAsync

from app.ai.context.web.news_scrapers import (
from app.agentic_ai.context.web.news_scrapers import (
scrape_g1_article,
scrape_estadao_article,
scrape_folha_article,
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from unittest.mock import patch, MagicMock
import asyncio

from app.ai.context.web.apify_utils import detectPlatform, PlatformType, scrapeGenericUrl
from app.ai.context.web.news_scrapers import (
from app.agentic_ai.context.web.apify_utils import detectPlatform, PlatformType, scrapeGenericUrl
from app.agentic_ai.context.web.news_scrapers import (
scrape_g1_article,
scrape_estadao_article,
scrape_folha_article,
Expand Down Expand Up @@ -124,8 +124,8 @@ def test_tiktok_still_works(self):

class TestG1Scraper:

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = G1_HTML
Expand All @@ -141,8 +141,8 @@ def test_successful_extraction(self, mock_traf, mock_session):
assert "primeiro parágrafo" in result["content"]
assert result["error"] is None

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_stops_at_nav_marker(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = G1_HTML
Expand All @@ -154,8 +154,8 @@ def test_stops_at_nav_marker(self, mock_traf, mock_session):
# "Veja também" intertitle should stop extraction
assert "Veja também" not in result["content"]

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_falls_back_to_trafilatura(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = EMPTY_BODY_HTML
Expand All @@ -167,7 +167,7 @@ def test_falls_back_to_trafilatura(self, mock_traf, mock_session):
assert result["success"] is True
assert result["content"] == "A" * 60

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
def test_http_error(self, mock_session):
mock_session.get.side_effect = Exception("connection refused")

Expand All @@ -178,8 +178,8 @@ def test_http_error(self, mock_session):

class TestEstadaoScraper:

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = ESTADAO_HTML
Expand All @@ -197,7 +197,7 @@ def test_successful_extraction(self, mock_traf, mock_session):
assert "Manchete de ruído" not in result["content"]
assert result["error"] is None

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
def test_http_error(self, mock_session):
mock_session.get.side_effect = Exception("timeout")

Expand All @@ -208,8 +208,8 @@ def test_http_error(self, mock_session):

class TestFolhaScraper:

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.content = FOLHA_HTML.encode("utf-8")
Expand All @@ -226,8 +226,8 @@ def test_successful_extraction(self, mock_traf, mock_session):
# noise class paragraph should be excluded
assert "deve ser ignorado" not in result["content"]

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_url_normalization(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.content = FOLHA_HTML.encode("utf-8")
Expand All @@ -240,7 +240,7 @@ def test_url_normalization(self, mock_traf, mock_session):
call_url = mock_session.get.call_args[0][0]
assert "www1.folha.uol.com.br" in call_url

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
def test_http_error(self, mock_session):
mock_session.get.side_effect = Exception("ssl error")

Expand All @@ -250,8 +250,8 @@ def test_http_error(self, mock_session):

class TestAosFatosScraper:

@patch("app.ai.context.web.news_scrapers._fetch_aosfatos")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._fetch_aosfatos")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_fetch):
mock_fetch.return_value = (AOSFATOS_HTML, 200)
mock_traf.extract.return_value = ""
Expand All @@ -265,7 +265,7 @@ def test_successful_extraction(self, mock_traf, mock_fetch):
# noise class paragraph should be excluded
assert "deve ser ignorado" not in result["content"]

@patch("app.ai.context.web.news_scrapers._fetch_aosfatos")
@patch("app.agentic_ai.context.web.news_scrapers._fetch_aosfatos")
def test_http_error(self, mock_fetch):
mock_fetch.side_effect = Exception("UNEXPECTED_EOF")

Expand Down Expand Up @@ -307,7 +307,7 @@ def test_fails_when_content_too_short(self):
class TestScrapeGenericUrlRouting:

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_g1_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_g1_article")
async def test_routes_g1(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "g1 content", "metadata": {"extraction_tool": "g1_scraper"}, "error": None
Expand All @@ -320,7 +320,7 @@ async def test_routes_g1(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "g1_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_estadao_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_estadao_article")
async def test_routes_estadao(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "estadao content", "metadata": {"extraction_tool": "estadao_scraper"}, "error": None
Expand All @@ -332,7 +332,7 @@ async def test_routes_estadao(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "estadao_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_folha_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_folha_article")
async def test_routes_folha(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "folha content", "metadata": {"extraction_tool": "folha_scraper"}, "error": None
Expand All @@ -344,7 +344,7 @@ async def test_routes_folha(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "folha_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_aosfatos_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_aosfatos_article")
async def test_routes_aosfatos(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "aosfatos content", "metadata": {"extraction_tool": "aosfatos_scraper"}, "error": None
Expand All @@ -356,8 +356,8 @@ async def test_routes_aosfatos(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "aosfatos_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrapeGenericSimple")
@patch("app.ai.context.web.apify_utils.scrape_g1_article")
@patch("app.agentic_ai.context.web.apify_utils.scrapeGenericSimple")
@patch("app.agentic_ai.context.web.apify_utils.scrape_g1_article")
async def test_fallback_on_scraper_failure(self, mock_g1, mock_generic):
mock_g1.return_value = {
"success": False, "content": "", "metadata": {}, "error": "extraction failed"
Expand Down
31 changes: 30 additions & 1 deletion app/agentic_ai/nodes/format_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from __future__ import annotations

import re
import uuid

from langchain_core.messages import HumanMessage
Expand All @@ -18,10 +19,38 @@
expand_all_links,
fire_link_expansion,
)
from app.ai.pipeline.link_context_expander import extract_links
from app.models.commondata import DataSource


def extract_links(text: str) -> list[str]:
    """
    Extract all URLs from text using regex.

    Supports http and https protocols. Trailing sentence punctuation that the
    regex over-captures (e.g. ``"see https://x.com."``) is stripped, and
    duplicates are removed while preserving first-seen order.

    Args:
        text: Arbitrary text that may contain URLs.

    Returns:
        Unique, cleaned URLs in order of first appearance.
    """
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'

    # str.rstrip removes every trailing char in the set in one pass,
    # equivalent to looping while the last char is punctuation.
    cleaned_urls = [
        url.rstrip('.,:;!?)]}')
        for url in re.findall(url_pattern, text)
    ]

    # dict.fromkeys preserves insertion order (guaranteed since Python 3.7)
    # while dropping duplicates; entries emptied by stripping are skipped.
    return list(dict.fromkeys(url for url in cleaned_urls if url))


def _is_links_only(text: str, urls: list[str]) -> bool:
"""check if original text contains only URLs with no meaningful claim text."""
remaining = text
Expand Down
Loading