Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions app/agentic_ai/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Pytest conftest: make the repository root importable for test modules."""

import os
import sys


# The repository root sits two directories above this file
# (app/agentic_ai/conftest.py -> repo root).
ROOT_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)

# Prepend rather than append so in-repo packages shadow any installed copies;
# guard against duplicates when conftest is imported more than once.
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
2 changes: 2 additions & 0 deletions app/agentic_ai/context/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Shared context utilities for agentic_ai."""

Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from .google_factcheck_gatherer import GoogleFactCheckGatherer

__all__ = ["GoogleFactCheckGatherer"]

__all__ = [
"GoogleFactCheckGatherer"
]
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

pytest_plugins = ('pytest_asyncio',)

from app.ai.context.factcheckapi import GoogleFactCheckGatherer
from app.agentic_ai.context.factcheckapi import GoogleFactCheckGatherer
from app.models import ExtractedClaim, ClaimSource


Expand Down Expand Up @@ -358,120 +358,3 @@ async def test_gather_portuguese_claim():

print(f"{'=' * 80}\n")


# ===== INTEGRATION WITH EVIDENCE RETRIEVAL PIPELINE =====

@pytest.mark.asyncio
async def test_compose_with_other_gatherers():
    """should work alongside other evidence gatherers

    Integration test: runs the evidence-retrieval pipeline with only the
    Google Fact Check gatherer plugged in, then verifies that every citation
    attached to the claim came from that gatherer and that any rating was
    mapped to one of the expected Portuguese labels.

    NOTE(review): exercises the live Google Fact Check API through the
    pipeline — results depend on network availability and index contents.
    """
    # Imported lazily so test collection does not require the pipeline package.
    from app.ai.pipeline.evidence_retrieval import gather_evidence_async
    from app.models import EvidenceRetrievalInput

    claim = ExtractedClaim(
        id="claim-compose-001",
        text="The moon landing was faked",
        source=ClaimSource(
            source_type="original_text",
            source_id="msg-compose-001"
        )
    )

    retrieval_input = EvidenceRetrievalInput(claims=[claim])

    # use google fact-check gatherer
    google_gatherer = GoogleFactCheckGatherer(max_results=3)

    result = await gather_evidence_async(
        retrieval_input,
        gatherers=[google_gatherer]
    )

    # validate result: the pipeline must have produced an entry for our claim
    assert claim.id in result.claim_evidence_map
    enriched = result.claim_evidence_map[claim.id]

    # all citations should be from google with proper rating mapping
    print(f"\n{'=' * 80}")
    print(f"TEST: Compose Google Gatherer with Pipeline")
    print(f"{'=' * 80}")
    print(f"Claim: {enriched.text}")
    print(f"Citations from Google: {len(enriched.citations)}")

    for i, citation in enumerate(enriched.citations, 1):
        print(f" Citation {i}: {citation.title[:60]}...")
        print(f" Rating: {citation.rating}")
        # only the Google gatherer ran, so every citation must carry its tag
        assert citation.source == "google_fact_checking_api"
        if citation.rating:
            print(f" ✓ Rating mapped to Portuguese: {citation.rating}")
            # ratings, when present, must already be normalized to Portuguese
            assert citation.rating in ["Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar"]
        else:
            print(f" ⚠ No rating available")

    print(f"{'=' * 80}\n")


@pytest.mark.asyncio
async def test_combine_google_and_web_search():
    """should combine google fact-check with web search results

    Integration test: runs the pipeline with both the Google Fact Check
    gatherer and the web-search gatherer, tallies citations per source, and
    checks that every Google citation's rating (when present) is one of the
    expected Portuguese labels.

    NOTE(review): depends on live Google Fact Check and web-search services;
    citation counts will vary run to run.
    """
    # Imported lazily so test collection does not require the pipeline package.
    from app.ai.pipeline.evidence_retrieval import (
        gather_evidence_async,
        WebSearchGatherer
    )
    from app.models import EvidenceRetrievalInput

    claim = ExtractedClaim(
        id="claim-multi-001",
        text="Drinking lemon water helps weight loss",
        source=ClaimSource(
            source_type="original_text",
            source_id="msg-multi-001"
        )
    )

    retrieval_input = EvidenceRetrievalInput(claims=[claim])

    # use both gatherers so the pipeline merges two evidence sources
    google_gatherer = GoogleFactCheckGatherer(max_results=3)
    web_gatherer = WebSearchGatherer(max_results=3)

    result = await gather_evidence_async(
        retrieval_input,
        gatherers=[google_gatherer, web_gatherer]
    )

    enriched = result.claim_evidence_map[claim.id]

    # should have citations from both sources
    sources = {cit.source for cit in enriched.citations}

    print(f"\n{'=' * 80}")
    print(f"TEST: Combine Google + Web Search")
    print(f"{'=' * 80}")
    print(f"Claim: {enriched.text}")
    print(f"Total citations: {len(enriched.citations)}")
    print(f"Sources used: {sources}")

    # count citations by source and validate google ratings
    google_count = 0
    web_count = 0

    print(f"\nCitation details:")
    for i, cit in enumerate(enriched.citations, 1):
        if cit.source == "google_fact_checking_api":
            google_count += 1
            print(f" {i}. [Google] {cit.title[:50]}...")
            print(f" Rating: {cit.rating}")
            # validate rating mapping for google citations
            if cit.rating:
                print(f" ✓ Rating mapped to Portuguese: {cit.rating}")
                assert cit.rating in ["Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar"]
            else:
                print(f" ⚠ No rating available")
        elif cit.source == "apify_web_search":
            web_count += 1
            print(f" {i}. [Web Search] {cit.title[:50]}...")

    print(f"\nSummary:")
    print(f" Google Fact-Check: {google_count}")
    print(f" Web Search: {web_count}")
    print(f"{'=' * 80}\n")
2 changes: 2 additions & 0 deletions app/agentic_ai/context/web/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""agentic_ai web context utilities."""

Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from bs4 import BeautifulSoup
from apify_client import ApifyClientAsync

from app.ai.context.web.news_scrapers import (
from app.agentic_ai.context.web.news_scrapers import (
scrape_g1_article,
scrape_estadao_article,
scrape_folha_article,
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from unittest.mock import patch, MagicMock
import asyncio

from app.ai.context.web.apify_utils import detectPlatform, PlatformType, scrapeGenericUrl
from app.ai.context.web.news_scrapers import (
from app.agentic_ai.context.web.apify_utils import detectPlatform, PlatformType, scrapeGenericUrl
from app.agentic_ai.context.web.news_scrapers import (
scrape_g1_article,
scrape_estadao_article,
scrape_folha_article,
Expand Down Expand Up @@ -124,8 +124,8 @@ def test_tiktok_still_works(self):

class TestG1Scraper:

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = G1_HTML
Expand All @@ -141,8 +141,8 @@ def test_successful_extraction(self, mock_traf, mock_session):
assert "primeiro parágrafo" in result["content"]
assert result["error"] is None

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_stops_at_nav_marker(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = G1_HTML
Expand All @@ -154,8 +154,8 @@ def test_stops_at_nav_marker(self, mock_traf, mock_session):
# "Veja também" intertitle should stop extraction
assert "Veja também" not in result["content"]

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_falls_back_to_trafilatura(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = EMPTY_BODY_HTML
Expand All @@ -167,7 +167,7 @@ def test_falls_back_to_trafilatura(self, mock_traf, mock_session):
assert result["success"] is True
assert result["content"] == "A" * 60

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
def test_http_error(self, mock_session):
mock_session.get.side_effect = Exception("connection refused")

Expand All @@ -178,8 +178,8 @@ def test_http_error(self, mock_session):

class TestEstadaoScraper:

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.text = ESTADAO_HTML
Expand All @@ -197,7 +197,7 @@ def test_successful_extraction(self, mock_traf, mock_session):
assert "Manchete de ruído" not in result["content"]
assert result["error"] is None

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
def test_http_error(self, mock_session):
mock_session.get.side_effect = Exception("timeout")

Expand All @@ -208,8 +208,8 @@ def test_http_error(self, mock_session):

class TestFolhaScraper:

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.content = FOLHA_HTML.encode("utf-8")
Expand All @@ -226,8 +226,8 @@ def test_successful_extraction(self, mock_traf, mock_session):
# noise class paragraph should be excluded
assert "deve ser ignorado" not in result["content"]

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_url_normalization(self, mock_traf, mock_session):
mock_resp = MagicMock()
mock_resp.content = FOLHA_HTML.encode("utf-8")
Expand All @@ -240,7 +240,7 @@ def test_url_normalization(self, mock_traf, mock_session):
call_url = mock_session.get.call_args[0][0]
assert "www1.folha.uol.com.br" in call_url

@patch("app.ai.context.web.news_scrapers._SESSION")
@patch("app.agentic_ai.context.web.news_scrapers._SESSION")
def test_http_error(self, mock_session):
mock_session.get.side_effect = Exception("ssl error")

Expand All @@ -250,8 +250,8 @@ def test_http_error(self, mock_session):

class TestAosFatosScraper:

@patch("app.ai.context.web.news_scrapers._fetch_aosfatos")
@patch("app.ai.context.web.news_scrapers.trafilatura")
@patch("app.agentic_ai.context.web.news_scrapers._fetch_aosfatos")
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura")
def test_successful_extraction(self, mock_traf, mock_fetch):
mock_fetch.return_value = (AOSFATOS_HTML, 200)
mock_traf.extract.return_value = ""
Expand All @@ -265,7 +265,7 @@ def test_successful_extraction(self, mock_traf, mock_fetch):
# noise class paragraph should be excluded
assert "deve ser ignorado" not in result["content"]

@patch("app.ai.context.web.news_scrapers._fetch_aosfatos")
@patch("app.agentic_ai.context.web.news_scrapers._fetch_aosfatos")
def test_http_error(self, mock_fetch):
mock_fetch.side_effect = Exception("UNEXPECTED_EOF")

Expand Down Expand Up @@ -307,7 +307,7 @@ def test_fails_when_content_too_short(self):
class TestScrapeGenericUrlRouting:

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_g1_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_g1_article")
async def test_routes_g1(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "g1 content", "metadata": {"extraction_tool": "g1_scraper"}, "error": None
Expand All @@ -320,7 +320,7 @@ async def test_routes_g1(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "g1_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_estadao_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_estadao_article")
async def test_routes_estadao(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "estadao content", "metadata": {"extraction_tool": "estadao_scraper"}, "error": None
Expand All @@ -332,7 +332,7 @@ async def test_routes_estadao(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "estadao_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_folha_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_folha_article")
async def test_routes_folha(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "folha content", "metadata": {"extraction_tool": "folha_scraper"}, "error": None
Expand All @@ -344,7 +344,7 @@ async def test_routes_folha(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "folha_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrape_aosfatos_article")
@patch("app.agentic_ai.context.web.apify_utils.scrape_aosfatos_article")
async def test_routes_aosfatos(self, mock_scraper):
mock_scraper.return_value = {
"success": True, "content": "aosfatos content", "metadata": {"extraction_tool": "aosfatos_scraper"}, "error": None
Expand All @@ -356,8 +356,8 @@ async def test_routes_aosfatos(self, mock_scraper):
assert result["metadata"]["extraction_tool"] == "aosfatos_scraper"

@pytest.mark.asyncio
@patch("app.ai.context.web.apify_utils.scrapeGenericSimple")
@patch("app.ai.context.web.apify_utils.scrape_g1_article")
@patch("app.agentic_ai.context.web.apify_utils.scrapeGenericSimple")
@patch("app.agentic_ai.context.web.apify_utils.scrape_g1_article")
async def test_fallback_on_scraper_failure(self, mock_g1, mock_generic):
mock_g1.return_value = {
"success": False, "content": "", "metadata": {}, "error": "extraction failed"
Expand Down
31 changes: 30 additions & 1 deletion app/agentic_ai/nodes/format_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from __future__ import annotations

import re
import uuid

from langchain_core.messages import HumanMessage
Expand All @@ -18,10 +19,38 @@
expand_all_links,
fire_link_expansion,
)
from app.ai.pipeline.link_context_expander import extract_links
from app.models.commondata import DataSource


def extract_links(text: str) -> list[str]:
    """
    Extract all URLs from text using regex.

    Supports http and https protocols. Trailing sentence punctuation that the
    regex over-captures (e.g. ``"see https://x.com."``) is stripped, and
    duplicates are removed while preserving first-seen order.

    Args:
        text: Arbitrary text that may contain URLs.

    Returns:
        Unique, cleaned URLs in order of first appearance.
    """
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'

    # str.rstrip removes every trailing char in the set in one pass,
    # equivalent to looping while the last char is punctuation.
    cleaned_urls = [
        url.rstrip('.,:;!?)]}')
        for url in re.findall(url_pattern, text)
    ]

    # dict.fromkeys preserves insertion order (guaranteed since Python 3.7)
    # while dropping duplicates; entries emptied by stripping are skipped.
    return list(dict.fromkeys(url for url in cleaned_urls if url))


def _is_links_only(text: str, urls: list[str]) -> bool:
"""check if original text contains only URLs with no meaningful claim text."""
remaining = text
Expand Down
Loading