diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 4da09d6..1231516 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -26,7 +26,9 @@ "WebFetch(domain:www.tavily.com)", "WebFetch(domain:brave.com)", "Bash(pip3 list:*)", - "Bash(./venv/bin/python -m pytest:*)" + "Bash(./venv/bin/python -m pytest:*)", + "Bash(pip install:*)", + "Bash(python -m pip install:*)", ], "deny": [], "ask": [] diff --git a/app/agentic_ai/conftest.py b/app/agentic_ai/conftest.py new file mode 100644 index 0000000..04d2ec7 --- /dev/null +++ b/app/agentic_ai/conftest.py @@ -0,0 +1,7 @@ +import os +import sys + + +ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +if ROOT_DIR not in sys.path: + sys.path.insert(0, ROOT_DIR) diff --git a/app/agentic_ai/context/__init__.py b/app/agentic_ai/context/__init__.py new file mode 100644 index 0000000..20c4ddd --- /dev/null +++ b/app/agentic_ai/context/__init__.py @@ -0,0 +1,2 @@ +"""Shared context utilities for agentic_ai.""" + diff --git a/app/ai/context/factcheckapi/__init__.py b/app/agentic_ai/context/factcheckapi/__init__.py similarity index 60% rename from app/ai/context/factcheckapi/__init__.py rename to app/agentic_ai/context/factcheckapi/__init__.py index 0035b70..163b34c 100644 --- a/app/ai/context/factcheckapi/__init__.py +++ b/app/agentic_ai/context/factcheckapi/__init__.py @@ -1,6 +1,4 @@ from .google_factcheck_gatherer import GoogleFactCheckGatherer +__all__ = ["GoogleFactCheckGatherer"] -__all__ = [ - "GoogleFactCheckGatherer" -] \ No newline at end of file diff --git a/app/ai/context/factcheckapi/google_factcheck_gatherer.py b/app/agentic_ai/context/factcheckapi/google_factcheck_gatherer.py similarity index 100% rename from app/ai/context/factcheckapi/google_factcheck_gatherer.py rename to app/agentic_ai/context/factcheckapi/google_factcheck_gatherer.py diff --git a/app/ai/context/factcheckapi/test/google_factcheck_gatherer_test.py 
b/app/agentic_ai/context/factcheckapi/tests/google_factcheck_gatherer_test.py similarity index 75% rename from app/ai/context/factcheckapi/test/google_factcheck_gatherer_test.py rename to app/agentic_ai/context/factcheckapi/tests/google_factcheck_gatherer_test.py index 62a2284..a2eb8ed 100644 --- a/app/ai/context/factcheckapi/test/google_factcheck_gatherer_test.py +++ b/app/agentic_ai/context/factcheckapi/tests/google_factcheck_gatherer_test.py @@ -13,7 +13,7 @@ pytest_plugins = ('pytest_asyncio',) -from app.ai.context.factcheckapi import GoogleFactCheckGatherer +from app.agentic_ai.context.factcheckapi import GoogleFactCheckGatherer from app.models import ExtractedClaim, ClaimSource @@ -358,120 +358,3 @@ async def test_gather_portuguese_claim(): print(f"{'=' * 80}\n") - -# ===== INTEGRATION WITH EVIDENCE RETRIEVAL PIPELINE ===== - -@pytest.mark.asyncio -async def test_compose_with_other_gatherers(): - """should work alongside other evidence gatherers""" - from app.ai.pipeline.evidence_retrieval import gather_evidence_async - from app.models import EvidenceRetrievalInput - - claim = ExtractedClaim( - id="claim-compose-001", - text="The moon landing was faked", - source=ClaimSource( - source_type="original_text", - source_id="msg-compose-001" - ) - ) - - retrieval_input = EvidenceRetrievalInput(claims=[claim]) - - # use google fact-check gatherer - google_gatherer = GoogleFactCheckGatherer(max_results=3) - - result = await gather_evidence_async( - retrieval_input, - gatherers=[google_gatherer] - ) - - # validate result - assert claim.id in result.claim_evidence_map - enriched = result.claim_evidence_map[claim.id] - - # all citations should be from google with proper rating mapping - print(f"\n{'=' * 80}") - print(f"TEST: Compose Google Gatherer with Pipeline") - print(f"{'=' * 80}") - print(f"Claim: {enriched.text}") - print(f"Citations from Google: {len(enriched.citations)}") - - for i, citation in enumerate(enriched.citations, 1): - print(f" Citation {i}: 
{citation.title[:60]}...") - print(f" Rating: {citation.rating}") - assert citation.source == "google_fact_checking_api" - if citation.rating: - print(f" ✓ Rating mapped to Portuguese: {citation.rating}") - assert citation.rating in ["Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar"] - else: - print(f" ⚠ No rating available") - - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -async def test_combine_google_and_web_search(): - """should combine google fact-check with web search results""" - from app.ai.pipeline.evidence_retrieval import ( - gather_evidence_async, - WebSearchGatherer - ) - from app.models import EvidenceRetrievalInput - - claim = ExtractedClaim( - id="claim-multi-001", - text="Drinking lemon water helps weight loss", - source=ClaimSource( - source_type="original_text", - source_id="msg-multi-001" - ) - ) - - retrieval_input = EvidenceRetrievalInput(claims=[claim]) - - # use both gatherers - google_gatherer = GoogleFactCheckGatherer(max_results=3) - web_gatherer = WebSearchGatherer(max_results=3) - - result = await gather_evidence_async( - retrieval_input, - gatherers=[google_gatherer, web_gatherer] - ) - - enriched = result.claim_evidence_map[claim.id] - - # should have citations from both sources - sources = {cit.source for cit in enriched.citations} - - print(f"\n{'=' * 80}") - print(f"TEST: Combine Google + Web Search") - print(f"{'=' * 80}") - print(f"Claim: {enriched.text}") - print(f"Total citations: {len(enriched.citations)}") - print(f"Sources used: {sources}") - - # count citations by source and validate google ratings - google_count = 0 - web_count = 0 - - print(f"\nCitation details:") - for i, cit in enumerate(enriched.citations, 1): - if cit.source == "google_fact_checking_api": - google_count += 1 - print(f" {i}. 
[Google] {cit.title[:50]}...") - print(f" Rating: {cit.rating}") - # validate rating mapping for google citations - if cit.rating: - print(f" ✓ Rating mapped to Portuguese: {cit.rating}") - assert cit.rating in ["Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar"] - else: - print(f" ⚠ No rating available") - elif cit.source == "apify_web_search": - web_count += 1 - print(f" {i}. [Web Search] {cit.title[:50]}...") - - print(f"\nSummary:") - print(f" Google Fact-Check: {google_count}") - print(f" Web Search: {web_count}") - print(f"{'=' * 80}\n") diff --git a/app/agentic_ai/context/web/__init__.py b/app/agentic_ai/context/web/__init__.py new file mode 100644 index 0000000..70e720f --- /dev/null +++ b/app/agentic_ai/context/web/__init__.py @@ -0,0 +1,2 @@ +"""agentic_ai web context utilities.""" + diff --git a/app/ai/context/web/apify_utils.py b/app/agentic_ai/context/web/apify_utils.py similarity index 99% rename from app/ai/context/web/apify_utils.py rename to app/agentic_ai/context/web/apify_utils.py index ffdb404..72899ba 100644 --- a/app/ai/context/web/apify_utils.py +++ b/app/agentic_ai/context/web/apify_utils.py @@ -14,7 +14,7 @@ from bs4 import BeautifulSoup from apify_client import ApifyClientAsync -from app.ai.context.web.news_scrapers import ( +from app.agentic_ai.context.web.news_scrapers import ( scrape_g1_article, scrape_estadao_article, scrape_folha_article, diff --git a/app/ai/context/web/models.py b/app/agentic_ai/context/web/models.py similarity index 100% rename from app/ai/context/web/models.py rename to app/agentic_ai/context/web/models.py diff --git a/app/ai/context/web/news_scrapers.py b/app/agentic_ai/context/web/news_scrapers.py similarity index 100% rename from app/ai/context/web/news_scrapers.py rename to app/agentic_ai/context/web/news_scrapers.py diff --git a/app/ai/context/web/tests/test_news_scrapers.py b/app/agentic_ai/context/web/tests/test_news_scrapers.py similarity index 88% rename from 
app/ai/context/web/tests/test_news_scrapers.py rename to app/agentic_ai/context/web/tests/test_news_scrapers.py index 9d811cb..2dc21c9 100644 --- a/app/ai/context/web/tests/test_news_scrapers.py +++ b/app/agentic_ai/context/web/tests/test_news_scrapers.py @@ -6,8 +6,8 @@ from unittest.mock import patch, MagicMock import asyncio -from app.ai.context.web.apify_utils import detectPlatform, PlatformType, scrapeGenericUrl -from app.ai.context.web.news_scrapers import ( +from app.agentic_ai.context.web.apify_utils import detectPlatform, PlatformType, scrapeGenericUrl +from app.agentic_ai.context.web.news_scrapers import ( scrape_g1_article, scrape_estadao_article, scrape_folha_article, @@ -124,8 +124,8 @@ def test_tiktok_still_works(self): class TestG1Scraper: - @patch("app.ai.context.web.news_scrapers._SESSION") - @patch("app.ai.context.web.news_scrapers.trafilatura") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers.trafilatura") def test_successful_extraction(self, mock_traf, mock_session): mock_resp = MagicMock() mock_resp.text = G1_HTML @@ -141,8 +141,8 @@ def test_successful_extraction(self, mock_traf, mock_session): assert "primeiro parágrafo" in result["content"] assert result["error"] is None - @patch("app.ai.context.web.news_scrapers._SESSION") - @patch("app.ai.context.web.news_scrapers.trafilatura") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers.trafilatura") def test_stops_at_nav_marker(self, mock_traf, mock_session): mock_resp = MagicMock() mock_resp.text = G1_HTML @@ -154,8 +154,8 @@ def test_stops_at_nav_marker(self, mock_traf, mock_session): # "Veja também" intertitle should stop extraction assert "Veja também" not in result["content"] - @patch("app.ai.context.web.news_scrapers._SESSION") - @patch("app.ai.context.web.news_scrapers.trafilatura") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") + 
@patch("app.agentic_ai.context.web.news_scrapers.trafilatura") def test_falls_back_to_trafilatura(self, mock_traf, mock_session): mock_resp = MagicMock() mock_resp.text = EMPTY_BODY_HTML @@ -167,7 +167,7 @@ def test_falls_back_to_trafilatura(self, mock_traf, mock_session): assert result["success"] is True assert result["content"] == "A" * 60 - @patch("app.ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") def test_http_error(self, mock_session): mock_session.get.side_effect = Exception("connection refused") @@ -178,8 +178,8 @@ def test_http_error(self, mock_session): class TestEstadaoScraper: - @patch("app.ai.context.web.news_scrapers._SESSION") - @patch("app.ai.context.web.news_scrapers.trafilatura") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers.trafilatura") def test_successful_extraction(self, mock_traf, mock_session): mock_resp = MagicMock() mock_resp.text = ESTADAO_HTML @@ -197,7 +197,7 @@ def test_successful_extraction(self, mock_traf, mock_session): assert "Manchete de ruído" not in result["content"] assert result["error"] is None - @patch("app.ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") def test_http_error(self, mock_session): mock_session.get.side_effect = Exception("timeout") @@ -208,8 +208,8 @@ def test_http_error(self, mock_session): class TestFolhaScraper: - @patch("app.ai.context.web.news_scrapers._SESSION") - @patch("app.ai.context.web.news_scrapers.trafilatura") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers.trafilatura") def test_successful_extraction(self, mock_traf, mock_session): mock_resp = MagicMock() mock_resp.content = FOLHA_HTML.encode("utf-8") @@ -226,8 +226,8 @@ def test_successful_extraction(self, mock_traf, mock_session): # noise class paragraph should be excluded assert "deve ser ignorado" not in 
result["content"] - @patch("app.ai.context.web.news_scrapers._SESSION") - @patch("app.ai.context.web.news_scrapers.trafilatura") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers.trafilatura") def test_url_normalization(self, mock_traf, mock_session): mock_resp = MagicMock() mock_resp.content = FOLHA_HTML.encode("utf-8") @@ -240,7 +240,7 @@ def test_url_normalization(self, mock_traf, mock_session): call_url = mock_session.get.call_args[0][0] assert "www1.folha.uol.com.br" in call_url - @patch("app.ai.context.web.news_scrapers._SESSION") + @patch("app.agentic_ai.context.web.news_scrapers._SESSION") def test_http_error(self, mock_session): mock_session.get.side_effect = Exception("ssl error") @@ -250,8 +250,8 @@ def test_http_error(self, mock_session): class TestAosFatosScraper: - @patch("app.ai.context.web.news_scrapers._fetch_aosfatos") - @patch("app.ai.context.web.news_scrapers.trafilatura") + @patch("app.agentic_ai.context.web.news_scrapers._fetch_aosfatos") + @patch("app.agentic_ai.context.web.news_scrapers.trafilatura") def test_successful_extraction(self, mock_traf, mock_fetch): mock_fetch.return_value = (AOSFATOS_HTML, 200) mock_traf.extract.return_value = "" @@ -265,7 +265,7 @@ def test_successful_extraction(self, mock_traf, mock_fetch): # noise class paragraph should be excluded assert "deve ser ignorado" not in result["content"] - @patch("app.ai.context.web.news_scrapers._fetch_aosfatos") + @patch("app.agentic_ai.context.web.news_scrapers._fetch_aosfatos") def test_http_error(self, mock_fetch): mock_fetch.side_effect = Exception("UNEXPECTED_EOF") @@ -307,7 +307,7 @@ def test_fails_when_content_too_short(self): class TestScrapeGenericUrlRouting: @pytest.mark.asyncio - @patch("app.ai.context.web.apify_utils.scrape_g1_article") + @patch("app.agentic_ai.context.web.apify_utils.scrape_g1_article") async def test_routes_g1(self, mock_scraper): mock_scraper.return_value = { "success": True, "content": 
"g1 content", "metadata": {"extraction_tool": "g1_scraper"}, "error": None @@ -320,7 +320,7 @@ async def test_routes_g1(self, mock_scraper): assert result["metadata"]["extraction_tool"] == "g1_scraper" @pytest.mark.asyncio - @patch("app.ai.context.web.apify_utils.scrape_estadao_article") + @patch("app.agentic_ai.context.web.apify_utils.scrape_estadao_article") async def test_routes_estadao(self, mock_scraper): mock_scraper.return_value = { "success": True, "content": "estadao content", "metadata": {"extraction_tool": "estadao_scraper"}, "error": None @@ -332,7 +332,7 @@ async def test_routes_estadao(self, mock_scraper): assert result["metadata"]["extraction_tool"] == "estadao_scraper" @pytest.mark.asyncio - @patch("app.ai.context.web.apify_utils.scrape_folha_article") + @patch("app.agentic_ai.context.web.apify_utils.scrape_folha_article") async def test_routes_folha(self, mock_scraper): mock_scraper.return_value = { "success": True, "content": "folha content", "metadata": {"extraction_tool": "folha_scraper"}, "error": None @@ -344,7 +344,7 @@ async def test_routes_folha(self, mock_scraper): assert result["metadata"]["extraction_tool"] == "folha_scraper" @pytest.mark.asyncio - @patch("app.ai.context.web.apify_utils.scrape_aosfatos_article") + @patch("app.agentic_ai.context.web.apify_utils.scrape_aosfatos_article") async def test_routes_aosfatos(self, mock_scraper): mock_scraper.return_value = { "success": True, "content": "aosfatos content", "metadata": {"extraction_tool": "aosfatos_scraper"}, "error": None @@ -356,8 +356,8 @@ async def test_routes_aosfatos(self, mock_scraper): assert result["metadata"]["extraction_tool"] == "aosfatos_scraper" @pytest.mark.asyncio - @patch("app.ai.context.web.apify_utils.scrapeGenericSimple") - @patch("app.ai.context.web.apify_utils.scrape_g1_article") + @patch("app.agentic_ai.context.web.apify_utils.scrapeGenericSimple") + @patch("app.agentic_ai.context.web.apify_utils.scrape_g1_article") async def 
test_fallback_on_scraper_failure(self, mock_g1, mock_generic): mock_g1.return_value = { "success": False, "content": "", "metadata": {}, "error": "extraction failed" diff --git a/app/agentic_ai/nodes/format_input.py b/app/agentic_ai/nodes/format_input.py index d761aca..29a20cb 100644 --- a/app/agentic_ai/nodes/format_input.py +++ b/app/agentic_ai/nodes/format_input.py @@ -9,6 +9,7 @@ from __future__ import annotations +import re import uuid from langchain_core.messages import HumanMessage @@ -18,10 +19,38 @@ expand_all_links, fire_link_expansion, ) -from app.ai.pipeline.link_context_expander import extract_links from app.models.commondata import DataSource +def extract_links(text: str) -> list[str]: + """ + Extract all URLs from text using regex. + + Supports http, https protocols and common URL patterns. + Returns list of unique URLs found in the text, preserving order. + """ + url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+' + + urls = re.findall(url_pattern, text) + + trailing_punctuation = '.,:;!?)]}' + cleaned_urls: list[str] = [] + for url in urls: + while url and url[-1] in trailing_punctuation: + url = url[:-1] + if url: + cleaned_urls.append(url) + + seen: set[str] = set() + unique_urls: list[str] = [] + for url in cleaned_urls: + if url not in seen: + seen.add(url) + unique_urls.append(url) + + return unique_urls + + def _is_links_only(text: str, urls: list[str]) -> bool: """check if original text contains only URLs with no meaningful claim text.""" remaining = text diff --git a/app/agentic_ai/tests/nodes/test_extract_links.py b/app/agentic_ai/tests/nodes/test_extract_links.py new file mode 100644 index 0000000..6ea2de9 --- /dev/null +++ b/app/agentic_ai/tests/nodes/test_extract_links.py @@ -0,0 +1,124 @@ +import pytest + +from app.agentic_ai.nodes.format_input import extract_links + + +def test_extract_single_https_url(): + """should extract a single https URL from text""" + text = "Check out this article at https://example.com for more info." 
+ result = extract_links(text) + assert result == ["https://example.com"] + + +def test_extract_single_http_url(): + """should extract a single http URL from text""" + text = "Visit http://test.org to learn more." + result = extract_links(text) + assert result == ["http://test.org"] + + +def test_extract_multiple_urls(): + """should extract multiple URLs from text""" + text = "Check https://example.com and http://test.org for details." + result = extract_links(text) + assert result == ["https://example.com", "http://test.org"] + + +def test_extract_urls_with_paths(): + """should extract URLs with paths and query parameters""" + text = "See https://example.com/article/123?ref=social and http://test.org/page" + result = extract_links(text) + assert result == ["https://example.com/article/123?ref=social", "http://test.org/page"] + + +def test_remove_duplicate_urls(): + """should remove duplicate URLs while preserving order""" + text = "Visit https://example.com and also https://example.com again." + result = extract_links(text) + assert result == ["https://example.com"] + assert len(result) == 1 + + +def test_empty_text(): + """should return empty list for empty text""" + result = extract_links("") + assert result == [] + + +def test_text_without_urls(): + """should return empty list when no URLs are present""" + text = "This is just plain text with no links at all." 
+ result = extract_links(text) + assert result == [] + + +def test_url_with_special_characters(): + """should handle URLs with hyphens, underscores and other valid chars""" + text = "Check https://my-site.example.com/path_to/resource-123" + result = extract_links(text) + assert result == ["https://my-site.example.com/path_to/resource-123"] + + +def test_multiple_urls_preserves_order(): + """should preserve the order of URLs as they appear in text""" + text = "First https://first.com then https://second.com and https://third.com" + result = extract_links(text) + assert result == ["https://first.com", "https://second.com", "https://third.com"] + + +def test_url_at_end_of_sentence(): + """should extract URL that ends with punctuation""" + text = "Visit our website at https://example.com." + result = extract_links(text) + assert result == ["https://example.com"] + + +def test_url_in_parentheses(): + """should extract URL surrounded by parentheses""" + text = "See the docs (https://docs.example.com) for details." + result = extract_links(text) + assert result == ["https://docs.example.com"] + + +def test_multiple_protocols_mixed(): + """should handle mix of http and https URLs""" + text = "http://old.example.com and https://secure.example.com" + result = extract_links(text) + assert result == ["http://old.example.com", "https://secure.example.com"] + + +def test_url_with_port(): + """should extract URLs with port numbers""" + text = "Connect to https://localhost:8080/api for testing." + result = extract_links(text) + assert result == ["https://localhost:8080/api"] + + +def test_url_with_fragment(): + """should extract URLs with fragments/anchors""" + text = "Jump to https://example.com/page#section-2 directly." 
+ result = extract_links(text) + assert result == ["https://example.com/page#section-2"] + + +def test_multiline_text_with_urls(): + """should extract URLs from multiline text""" + text = """First line with https://example.com +Second line with http://test.org +Third line with https://another.com""" + result = extract_links(text) + assert result == ["https://example.com", "http://test.org", "https://another.com"] + + +def test_urls_without_protocol_not_extracted(): + """should not extract URLs without http/https protocol""" + text = "Visit www.example.com or example.com for info." + result = extract_links(text) + assert result == [] + + +def test_real_world_whatsapp_message(): + """should handle typical WhatsApp message with URLs""" + text = "Olha essa notícia importante: https://g1.globo.com/economia/noticia.html compartilha aí!" + result = extract_links(text) + assert result == ["https://g1.globo.com/economia/noticia.html"] diff --git a/app/agentic_ai/tests/nodes/test_link_expander.py b/app/agentic_ai/tests/nodes/test_link_expander.py index 07d0dae..5133fcf 100644 --- a/app/agentic_ai/tests/nodes/test_link_expander.py +++ b/app/agentic_ai/tests/nodes/test_link_expander.py @@ -14,7 +14,7 @@ _pending_link_tasks, ) from app.agentic_ai.config import MAX_LINKS_TO_EXPAND -from app.ai.context.web.models import WebContentResult +from app.agentic_ai.context.web.models import WebContentResult from app.models.commondata import DataSource diff --git a/app/agentic_ai/tools/fact_check_search.py b/app/agentic_ai/tools/fact_check_search.py index 041f343..d578645 100644 --- a/app/agentic_ai/tools/fact_check_search.py +++ b/app/agentic_ai/tools/fact_check_search.py @@ -1,7 +1,7 @@ """ fact-check search tool — wraps the Google Fact-Check API. -reuses parsing logic from app.ai.context.factcheckapi.google_factcheck_gatherer. +reuses parsing logic from app.agentic_ai.context.factcheckapi.google_factcheck_gatherer. 
""" import asyncio @@ -13,7 +13,7 @@ import httpx from app.models.agenticai import FactCheckApiContext, SourceReliability -from app.ai.context.factcheckapi.google_factcheck_gatherer import ( +from app.agentic_ai.context.factcheckapi.google_factcheck_gatherer import ( map_english_rating_to_portuguese, ) diff --git a/app/agentic_ai/tools/page_scraper.py b/app/agentic_ai/tools/page_scraper.py index bcca91b..e0f15c7 100644 --- a/app/agentic_ai/tools/page_scraper.py +++ b/app/agentic_ai/tools/page_scraper.py @@ -1,7 +1,7 @@ """ page scraper tool — extracts content from web pages. -reuses scrapeGenericUrl() from app.ai.context.web.apify_utils. +reuses scrapeGenericUrl() from app.agentic_ai.context.web.apify_utils. """ import asyncio @@ -9,7 +9,7 @@ from uuid import uuid4 from app.models.agenticai import ScrapeTarget, WebScrapeContext, SourceReliability -from app.ai.context.web.apify_utils import scrapeGenericUrl +from app.agentic_ai.context.web.apify_utils import scrapeGenericUrl from app.agentic_ai.config import SCRAPE_TIMEOUT_PER_PAGE diff --git a/app/agentic_ai/tools/web_search.py b/app/agentic_ai/tools/web_search.py index 0b66b49..742cab5 100644 --- a/app/agentic_ai/tools/web_search.py +++ b/app/agentic_ai/tools/web_search.py @@ -1,8 +1,7 @@ """ web search tool — runs parallel searches across general + domain-specific sources. -reuses google_search() from app.ai.context.web.google_search -and trusted domains from app.config.trusted_domains. +uses the custom search server for all web searches. 
""" import asyncio @@ -15,6 +14,7 @@ from app.models.agenticai import GoogleSearchContext, SourceReliability from app.config.trusted_domains import get_trusted_domains +from app.clients.web_search_cache import cached_custom_search from app.agentic_ai.config import DOMAIN_SEARCHES, SEARCH_TIMEOUT_PER_QUERY @@ -122,7 +122,7 @@ async def _search_single( effective_query = query # for general search, use trusted domains via server params - items = await _custom_search( + items = await _cached_custom_search( query=effective_query, num=min(max_results, 50), domains=domains, @@ -211,3 +211,20 @@ async def _custom_search( ) return mapped + + +async def _cached_custom_search( + query: str, + *, + num: int, + domains: list[str] | None, + timeout: float, +) -> list[dict]: + """cache-through wrapper that delegates to cached_custom_search.""" + return await cached_custom_search( + query, + num=num, + domains=domains, + timeout=timeout, + original_search_fn=_custom_search, + ) diff --git a/app/agentic_ai/utils/link_expander.py b/app/agentic_ai/utils/link_expander.py index f7064ea..6a56dba 100644 --- a/app/agentic_ai/utils/link_expander.py +++ b/app/agentic_ai/utils/link_expander.py @@ -18,8 +18,8 @@ from uuid import uuid4 from app.agentic_ai.config import LINK_SCRAPE_TIMEOUT_PER_URL, MAX_LINKS_TO_EXPAND -from app.ai.context.web.apify_utils import scrapeGenericUrl -from app.ai.context.web.models import WebContentResult +from app.agentic_ai.context.web.apify_utils import scrapeGenericUrl +from app.agentic_ai.context.web.models import WebContentResult from app.models.commondata import DataSource logger = logging.getLogger(__name__) diff --git a/app/ai/AGENTIC_PIPELINE.md b/app/ai/AGENTIC_PIPELINE.md deleted file mode 100644 index 0fbd107..0000000 --- a/app/ai/AGENTIC_PIPELINE.md +++ /dev/null @@ -1,637 +0,0 @@ -# Agentic AI — Context Search Loop Implementation Plan - -## Context - -The current fact-checking pipeline uses a fixed, linear flow: extract claims → retrieve evidence → 
adjudicate. This works but has limitations — it always runs all retrievers regardless of evidence quality, can't loop back when sources are insufficient, and separates claim understanding from search. - -This plan implements an **agentic state machine** using LangGraph where a context-gathering agent decides what to search, evaluates whether it has enough sources, and only proceeds to adjudication when confident. The agent holds typed context dataclasses, uses tools for search/scraping, and builds its evidence iteratively through a loop. - -**Scope**: Context agent graph only. Adjudication agent is out of scope but we define the output contract for it. - ---- - -## 1. Graph Nodes and Edges - -``` - ┌──────────┐ - │ START │ - └────┬─────┘ - │ - ▼ - ┌────────────────┐ - │ context_agent │◄─────────────────┐ - │ (LLM + tools) │ │ - └───────┬────────┘ │ - │ │ - ▼ │ - ┌────────────────┐ ┌─────────────┴──────┐ - │ check_edges │───►│ wait_for_async │ - │ (router) │ │ (wait + re-enter) │ - └───────┬────────┘ └────────────────────┘ - │ - ▼ - ┌────────────────┐ - │ END │ - └────────────────┘ -``` - -### Nodes - -| Node | Type | Description | -|------|------|-------------| -| `context_agent` | LangGraph `ToolNode` + LLM | The core agent. Receives system prompt with formatted context, calls tools, accumulates `ContextEntry` objects in state. Runs until it makes 0 tool calls (signals "done"). | -| `check_edges` | Python function (router) | Evaluates 3 conditions after `context_agent` returns with no tool calls. | -| `wait_for_async` | Python function | Blocks until pending async DataSources resolve, appends them to state, resets `no_tool_call` flag, routes back to `context_agent`. 
| - -### Edges - -| From | To | Condition | -|------|------|-----------| -| `START` | `context_agent` | Always | -| `context_agent` | `context_agent` | Tool calls pending (LangGraph built-in tool loop) | -| `context_agent` | `check_edges` | No tool calls (agent returned text-only) | -| `check_edges` | `wait_for_async` | `pending_async_count > 0` | -| `check_edges` | `END` | `pending_async_count == 0` OR `iteration_count >= MAX_ITERATIONS` | -| `wait_for_async` | `context_agent` | Always (after async resolves) | - ---- - -## 2. Tools - -### 2.1 Tool: `search_fact_check_api` - -**Interface:** -```python -def search_fact_check_api(queries: list[str]) -> list[FactCheckApiContext] -``` - -**Implementation:** Wraps existing `GoogleFactCheckGatherer._parse_response()` logic. For each query, calls the Fact-Check API endpoint, parses results into `FactCheckApiContext` objects with `reliability = MUITO_CONFIAVEL`. - -**Reuses:** -- `app/ai/context/factcheckapi/google_factcheck_gatherer.py` — API call logic, rating mapping (`map_english_rating_to_portuguese`) -- Base URL and parameter structure already implemented - -**Async:** Uses `httpx.AsyncClient` (already async in existing code). Runs all queries concurrently with `asyncio.gather()`. - -### 2.2 Tool: `search_web` - -**Interface:** -```python -def search_web(queries: list[str], max_results_per_search: int = 5) -> dict[str, list[GoogleSearchContext]] -``` - -**Returns:** Dict keyed by domain group: -```python -{ - "geral": [GoogleSearchContext(...)], # reliability = NEUTRO - "g1": [GoogleSearchContext(...)], # reliability = NEUTRO - "estadao": [GoogleSearchContext(...)], # reliability = NEUTRO - "aosfatos": [GoogleSearchContext(...)], # reliability = MUITO_CONFIAVEL - "folha": [GoogleSearchContext(...)], # reliability = NEUTRO -} -``` - -**Implementation:** For each query, fires 5 parallel searches: -1. 
General (no domain filter) — uses trusted domains from `app/config/trusted_domains.py` via `_build_search_query_with_domains()` pattern from `WebSearchGatherer` -2. `site_search="g1.globo.com", site_search_filter="i"` -3. `site_search="estadao.com.br", site_search_filter="i"` -4. `site_search="aosfatos.org", site_search_filter="i"` -5. `site_search="folha.uol.com.br", site_search_filter="i"` - -Domains are hardcoded inside the tool. `aosfatos` gets `MUITO_CONFIAVEL` (it's a fact-checker), the rest get `NEUTRO`. - -**Reuses:** -- `app/ai/context/web/google_search.py` — `google_search()` function (already async, supports `site_search` param) -- `app/ai/context/web/web_search_gatherer.py` — `_build_search_query_with_domains()` pattern for general search -- `app/config/trusted_domains.py` — `get_trusted_domains()` for allowlisted domains -- `scripts/playground/google/google_search_cli.py` — domain filtering pattern reference - -**Async:** `google_search()` is already async. Uses `asyncio.gather()` to run all 5 searches × N queries concurrently. - -### 2.3 Tool: `scrape_pages` - -**Interface:** -```python -class ScrapeTarget(BaseModel): - url: str - title: str - -def scrape_pages(targets: list[ScrapeTarget]) -> list[WebScrapeContext] -``` - -**Implementation:** For each target, scrapes the page content and returns a `WebScrapeContext`. Sets `parent_id` from the `GoogleSearchContext.id` that produced the URL (tracked via state). - -**Reuses:** -- `app/ai/context/web/apify_utils.py` — `scrapeGenericUrl()` (production-ready, handles platform detection, fallbacks) -- `app/ai/context/web/models.py` — `WebContentResult` for intermediate parsing -- `app/ai/pipeline/link_context_expander.py` — `expand_link_context()` async wrapper pattern -- Domain-specific extractors from `scripts/playground/` (g1, estadao, folha, aosfatos) as reference for field extraction - -**Async:** `scrapeGenericUrl()` is already async. Uses `asyncio.gather()` with per-target timeout. 
- -### 2.4 Tool Protocol (for mocking) - -All tools are wrapped behind a protocol so they can be swapped in tests: - -```python -class FactCheckSearchProtocol(Protocol): - async def search(self, queries: list[str]) -> list[FactCheckApiContext]: ... - -class WebSearchProtocol(Protocol): - async def search(self, queries: list[str], max_results_per_search: int = 5) -> dict[str, list[GoogleSearchContext]]: ... - -class PageScraperProtocol(Protocol): - async def scrape(self, targets: list[ScrapeTarget]) -> list[WebScrapeContext]: ... -``` - ---- - -## 3. Agent State - -### 3.1 Graph State Schema - -```python -from langgraph.graph import MessagesState - -class ContextAgentState(MessagesState): - # inputs - data_sources: list[DataSource] - - # accumulated context (append-only) - fact_check_results: list[FactCheckApiContext] - search_results: dict[str, list[GoogleSearchContext]] - scraped_pages: list[WebScrapeContext] - - # control flow - iteration_count: int # incremented each time context_agent runs - pending_async_count: int # decremented as async DataSources arrive - new_data_sources: list[DataSource] # async DataSources that just arrived -``` - -`MessagesState` gives us the built-in `messages` list for the LLM conversation. The typed context fields sit alongside it. - -### 3.2 Context Data Storage - -Each tool call appends to the corresponding typed list: -- `search_fact_check_api` → appends to `fact_check_results` -- `search_web` → merges into `search_results` dict (extends each domain key's list) -- `scrape_pages` → appends to `scraped_pages` - -Context is **append-only** — the LLM naturally ignores weak entries when reasoning. All entries are preserved for logging/analytics. 
- -### 3.3 Context Dataclasses - -**File:** `app/models/agenticai.py` - -```python -from dataclasses import dataclass, field -from enum import Enum -from typing import Optional - -from pydantic import BaseModel - -class SourceReliability(str, Enum): - MUITO_CONFIAVEL = "Muito confiável" - POUCO_CONFIAVEL = "Pouco confiável" - NEUTRO = "Neutro" - -@dataclass -class BaseContext: - id: str - url: str - parent_id: Optional[str] - reliability: SourceReliability - -@dataclass -class FactCheckApiContext(BaseContext): - title: str = "" - publisher: str = "" - rating: str = "" # VerdictType value - rating_comment: Optional[str] = None - claim_text: str = "" # the claim as seen by the fact-checker - review_date: Optional[str] = None - -@dataclass -class GoogleSearchContext(BaseContext): - title: str = "" - snippet: str = "" - domain: str = "" - position: int = 0 # rank in search results - -@dataclass -class WebScrapeContext(BaseContext): - title: str = "" - content: str = "" - extraction_status: str = "" # success, failed, timeout - extraction_tool: str = "" # apify, beautifulsoup, trafilatura - -class ScrapeTarget(BaseModel): - url: str - title: str -``` - -### 3.4 Output Contract (for future adjudication agent) - -```python -@dataclass -class ContextNodeOutput: - data_sources: list[DataSource] - fact_check_results: list[FactCheckApiContext] - search_results: dict[str, list[GoogleSearchContext]] - scraped_pages: list[WebScrapeContext] -``` - -Built from graph state after the context agent finishes. The adjudication agent receives only structured dataclasses — no prompt artifacts. - ---- - -## 4. Context Agent System Prompt - -### 4.1 Prompt Structure (Portuguese) - -The system prompt is assembled dynamically from a template + formatted context sections: - -``` - -Você é um agente de pesquisa para verificação de fatos. Sua tarefa é reunir -fontes suficientes para que um agente adjudicador possa emitir um veredito -sobre o conteúdo recebido. - - -Ferramentas disponíveis: -1.
search_fact_check_api(queries) — busca em bases de fact-checking (Muito confiável) -2. search_web(queries, max_results_per_search) — busca web geral + domínios específicos (G1, Estadão, Aos Fatos, Folha) -3. scrape_pages(targets) — extrai conteúdo completo de páginas web - - -Critérios para considerar fontes SUFICIENTES: -- Para cada afirmação identificada no conteúdo, deve existir ao menos: - • 1 fonte "Muito confiável" que cubra o tema, OU - • 2+ fontes "Neutro" que corroborem a mesma informação, sem fontes de - confiabilidade igual ou maior dizendo o contrário -- Fontes "Pouco confiável" NUNCA são suficientes sozinhas -- Todas as afirmações verificáveis devem ter cobertura - -Se esses critérios estão atendidos, NÃO chame mais ferramentas. -Se NÃO estão atendidos, faça mais buscas com queries diferentes ou mais específicas. - - -Iteração atual: {iteration_count}/{max_iterations} - - -## Conteúdo recebido para verificação -{formatted_data_sources} - - -{formatted_context} -``` - -### 4.2 Context Formatting Function - -The `formatted_context` is built by a function that reads the typed lists and assembles sections ordered by reliability: - -``` -## Fontes — Muito confiável - -### Fact-Check API -[1] Publisher: Agência Lupa | Rating: Falso - URL: https://lupa.uol.com.br/... - Afirmação verificada: "Vacina X causa infertilidade" - Data da revisão: 2025-01-10 - -[2] ... - -### Busca Web — Aos Fatos -[3] Title: "Verificamos: vacina X não causa infertilidade" - URL: https://aosfatos.org/... | Domain: aosfatos.org - Snippet: "Segundo estudos clínicos..." - -## Fontes — Neutro - -### Busca Web — Geral -[4] Title: "Estudo sobre segurança de vacinas" - URL: https://bbc.com/... | Domain: bbc.com - Snippet: "..." - -### Busca Web — G1 -[5] ... - -### Busca Web — Estadão -[6] ... - -### Busca Web — Folha -[7] ... - -## Fontes — Pouco confiável - -### Conteúdo Extraído de Páginas -[8] Title: "..." | URL: https://... 
- Status: success | Ferramenta: beautifulsoup - Conteúdo (primeiros 500 chars): "..." -``` - -**Global numbering** across all sections so the agent can reference `[N]` unambiguously. Each entry is formatted by a `format_*` method on its dataclass. - -### 4.3 New DataSource Prompt Section - -When `wait_for_async` adds new DataSources and re-enters the agent: - -``` -## ⚠️ NOVO CONTEÚDO RECEBIDO — requer verificação -[N+1] (link_context) URL: https://folha.uol.com.br/... - "Conteúdo completo extraído da página: ..." - -## Contexto já coletado (não buscar novamente) -(existing sections listed normally) -``` - -This section is only present when `new_data_sources` is non-empty in state. After the agent processes it, `new_data_sources` is cleared. - ---- - -## 5. Conditional Edges - -### 5.1 `check_edges` Router Logic - -```python -def check_edges(state: ContextAgentState) -> str: - # safety cap - if state["iteration_count"] >= MAX_ITERATIONS: - return "end" - - # async DataSources still pending - if state["pending_async_count"] > 0: - return "wait_for_async" - - # done - return "end" -``` - -### 5.2 Async DataSource Handling - -The `wait_for_async` node: -1. Blocks on an `asyncio.Event` or polls a shared queue until async DataSources resolve -2. Appends resolved DataSources to `state["data_sources"]` and `state["new_data_sources"]` -3. Decrements `pending_async_count` -4. Increments `iteration_count` -5. Returns to `context_agent` - -The `context_agent` node sees the new DataSources highlighted in its prompt (section 4.3) and decides whether to make additional tool calls. 
- -### 5.3 Iteration Counter - -- `iteration_count` starts at 0, incremented each time `context_agent` runs -- `MAX_ITERATIONS` defaults to 5 (configurable) -- When hit, the graph proceeds to END regardless of source sufficiency -- The iteration count is shown to the agent in the prompt so it can prioritize - -### 5.4 No-Tool-Call Detection - -LangGraph's built-in `tools_condition` handles the tool loop. When the agent returns a message without tool calls, it routes to `check_edges`. This is standard LangGraph behavior — no custom logic needed. - ---- - -## 6. Implementation Plan - -### 6.1 File Structure - -``` -app/agentic_ai/ -├── __init__.py -├── graph.py # LangGraph graph definition (nodes, edges, compilation) -├── state.py # ContextAgentState TypedDict -├── nodes/ -│ ├── __init__.py -│ ├── context_agent.py # context_agent node (builds prompt, invokes LLM) -│ ├── check_edges.py # check_edges router function -│ └── wait_for_async.py # wait_for_async node -├── tools/ -│ ├── __init__.py -│ ├── protocols.py # Tool protocols (FactCheckSearchProtocol, etc.) -│ ├── fact_check_search.py # search_fact_check_api implementation -│ ├── web_search.py # search_web implementation -│ └── page_scraper.py # scrape_pages implementation -├── prompts/ -│ ├── __init__.py -│ ├── system_prompt.py # system prompt template and builder -│ └── context_formatter.py # format context entries into prompt sections -├── config.py # agentic pipeline config (MAX_ITERATIONS, timeouts, etc.) -└── cli.py # CLI for testing the context agent - -app/models/agenticai.py # SourceReliability, BaseContext, FactCheckApiContext, - # GoogleSearchContext, WebScrapeContext, ScrapeTarget, - # ContextNodeOutput -``` - -### 6.2 New Files — What Each Does - -**`app/models/agenticai.py`** — Data models (section 3.3). Pure dataclasses, no IO. - -**`app/agentic_ai/state.py`** — `ContextAgentState(MessagesState)` TypedDict (section 3.1).
- -**`app/agentic_ai/tools/protocols.py`** — Protocol classes for all 3 tools (section 2.4). Used for dependency injection and testing. - -**`app/agentic_ai/tools/fact_check_search.py`** — Implements `FactCheckSearchProtocol`. -- Reuses: `google_factcheck_gatherer.py` API call logic and `map_english_rating_to_portuguese()` -- Creates `FactCheckApiContext` objects instead of `Citation` -- All entries get `reliability = MUITO_CONFIAVEL` - -**`app/agentic_ai/tools/web_search.py`** — Implements `WebSearchProtocol`. -- Reuses: `google_search()` from `app/ai/context/web/google_search.py` -- Reuses: `get_trusted_domains()` from `app/config/trusted_domains.py` for general search -- Reuses: Domain filtering pattern from `scripts/playground/google/google_search_cli.py` (the `site_search` + `site_search_filter="i"` pattern) -- Hardcoded domains: `g1.globo.com`, `estadao.com.br`, `aosfatos.org`, `folha.uol.com.br` -- `aosfatos` → `MUITO_CONFIAVEL`, rest → `NEUTRO` - -**`app/agentic_ai/tools/page_scraper.py`** — Implements `PageScraperProtocol`. -- Reuses: `scrapeGenericUrl()` from `app/ai/context/web/apify_utils.py` -- Reuses: `WebContentResult.from_dict()` from `app/ai/context/web/models.py` -- Reuses: domain-specific extraction patterns from `scripts/playground/` (g1, estadao, folha, aosfatos explorers) -- Sets `parent_id` linking to the search result that provided the URL - -**`app/agentic_ai/prompts/system_prompt.py`** — Template string and `build_system_prompt(state)` function (section 4.1). - -**`app/agentic_ai/prompts/context_formatter.py`** — `format_context(state) -> str` function (section 4.2). Reads typed lists, groups by reliability, applies global numbering. - -**`app/agentic_ai/nodes/context_agent.py`** — The main agent node.
-- Builds system prompt via `build_system_prompt(state)` -- Binds tools to the LLM via `model.bind_tools([...])` -- Increments `iteration_count` -- Returns updated messages + state - -**`app/agentic_ai/nodes/check_edges.py`** — Router (section 5.1). Pure function, no IO. - -**`app/agentic_ai/nodes/wait_for_async.py`** — Waits for async DataSources (section 5.2). Appends to state, sets `new_data_sources` flag. - -**`app/agentic_ai/graph.py`** — Compiles the LangGraph `StateGraph`: -```python -graph = StateGraph(ContextAgentState) -graph.add_node("context_agent", context_agent_node) -graph.add_node("tools", ToolNode(tools)) # executes the agent's tool calls -graph.add_node("check_edges", check_edges_node) -graph.add_node("wait_for_async", wait_for_async_node) - -graph.add_edge(START, "context_agent") -graph.add_conditional_edges("context_agent", tools_condition, { - "tools": "tools", # run requested tools - "__end__": "check_edges", # no tool calls -}) -graph.add_edge("tools", "context_agent") # built-in tool loop -graph.add_conditional_edges("check_edges", check_edges_router, { - "wait_for_async": "wait_for_async", - "end": END, -}) -graph.add_edge("wait_for_async", "context_agent") -``` - -**`app/agentic_ai/config.py`** — Constants and config: -- `MAX_ITERATIONS = 5` -- `SEARCH_TIMEOUT_PER_QUERY = 15.0` -- `SCRAPE_TIMEOUT_PER_PAGE = 30.0` -- Domain mappings and reliability assignments - -**`app/agentic_ai/cli.py`** — Interactive CLI (section 9).
- -### 6.3 Existing Code Reuse Map - -| New Module | Reuses From | What | -|------------|-------------|------| -| `tools/fact_check_search.py` | `app/ai/context/factcheckapi/google_factcheck_gatherer.py` | API call, `_parse_response`, `map_english_rating_to_portuguese` | -| `tools/web_search.py` | `app/ai/context/web/google_search.py` | `google_search()` async function | -| `tools/web_search.py` | `app/config/trusted_domains.py` | `get_trusted_domains()` for general search domain filter | -| `tools/web_search.py` | `app/ai/context/web/web_search_gatherer.py` | `_build_search_query_with_domains()` pattern | -| `tools/page_scraper.py` | `app/ai/context/web/apify_utils.py` | `scrapeGenericUrl()`, platform detection | -| `tools/page_scraper.py` | `app/ai/context/web/models.py` | `WebContentResult` | -| `tools/page_scraper.py` | `app/ai/pipeline/link_context_expander.py` | `extract_links()`, async wrapper pattern | -| `cli.py` | `scripts/playground/common.py` | `Colors`, `print_header`, `print_section`, `with_spinner`, `prompt_input`, `Menu` | -| `models/agenticai.py` | `app/models/factchecking.py` | `VerdictType` for rating field | - ---- - -## 7. Testing - -Every module gets unit tests. 
Test files mirror source structure: - -``` -app/agentic_ai/tests/ -├── __init__.py -├── test_state.py # state initialization, append semantics -├── test_graph.py # graph compilation, edge routing -├── nodes/ -│ ├── test_context_agent.py # prompt building, tool binding, iteration increment -│ ├── test_check_edges.py # all 3 routing conditions -│ └── test_wait_for_async.py # async resolution, state update -├── tools/ -│ ├── test_fact_check_search.py # API parsing, rating mapping, error handling -│ ├── test_web_search.py # 5-way parallel search, domain filtering, result grouping -│ └── test_page_scraper.py # scraping, timeout handling, parent_id linking -└── prompts/ - ├── test_system_prompt.py # prompt assembly, section ordering - └── test_context_formatter.py # formatting, global numbering, reliability grouping -``` - -**Key test scenarios:** -- **Tools**: Mock `httpx` / `scrapeGenericUrl` via protocols. Test happy path, timeouts, empty results, API errors. -- **Router**: Test all 3 edge conditions: `pending_async > 0`, `iteration >= MAX`, normal end. -- **Prompt**: Test that context sections appear in reliability order, global numbering is contiguous, new DataSources get the warning header. -- **Graph integration**: Use mock tool protocols to run the full graph with fake API responses and verify state accumulation. - ---- - -## 8. Code Modelling — Mockable IO - -All IO operations are behind protocols (defined in `tools/protocols.py`): - -```python -class FactCheckSearchProtocol(Protocol): - async def search(self, queries: list[str]) -> list[FactCheckApiContext]: ... - -class WebSearchProtocol(Protocol): - async def search(self, queries: list[str], max_results_per_search: int = 5) -> dict[str, list[GoogleSearchContext]]: ... - -class PageScraperProtocol(Protocol): - async def scrape(self, targets: list[ScrapeTarget]) -> list[WebScrapeContext]: ... - -class LLMProtocol(Protocol): - def bind_tools(self, tools: list) -> Any: ...
- async def ainvoke(self, messages: list) -> Any: ... -``` - -The graph accepts these protocols via dependency injection: - -```python -def build_graph( - llm: LLMProtocol, - fact_check_searcher: FactCheckSearchProtocol, - web_searcher: WebSearchProtocol, - page_scraper: PageScraperProtocol, -) -> CompiledGraph: -``` - -Tests inject mock implementations that return canned data without hitting any external APIs. - ---- - -## 9. CLI - -**File:** `app/agentic_ai/cli.py` - -Interactive CLI for testing the context agent from the command line. Reuses CLI utilities from `scripts/playground/common.py`. - -**Features:** -- Input text directly or paste a URL -- Watch the agent loop in real-time (show tool calls, context accumulation, iteration count) -- Display final `ContextNodeOutput` formatted with context sections -- Configurable: model, max iterations, timeouts - -**Structure:** -```python -# usage: python -m app.agentic_ai.cli - -def main(): - print_header("Context Agent — Interactive CLI") - - # menu: enter text, enter URL, configure, quit - menu = Menu("Context Agent") - menu.add_option("Verificar texto", handle_text_input) - menu.add_option("Verificar URL", handle_url_input) - menu.add_option("Configurações", show_config) - menu.run() - -def handle_text_input(): - text = prompt_multiline("Cole o texto para verificar") - data_sources = [DataSource(id=uuid4(), source_type="original_text", original_text=text)] - result = with_spinner(lambda: asyncio.run(run_context_agent(data_sources)), "Pesquisando...") - display_result(result) - -def display_result(output: ContextNodeOutput): - print_section("Fact-Check API Results") - for entry in output.fact_check_results: - # formatted display - print_section("Web Search Results") - for domain, entries in output.search_results.items(): - # formatted display per domain - print_section("Scraped Pages") - for entry in output.scraped_pages: - # formatted display - ---- - -## Verification - -### How to test end-to-end: - -1.
**Unit tests**: `pytest app/agentic-ai/tests/ -v` -2. **CLI smoke test**: `python -m app.agentic-ai.cli` → paste a claim → verify tools are called, context accumulates, agent stops when satisfied -3. **Integration test with mocks**: Run the compiled graph with mock tool protocols, assert state contains expected context entries after N iterations -4. **Edge case tests**: - - Claim with 0 fact-check results → agent should fall back to web search + scraping - - All API timeouts → agent should stop at MAX_ITERATIONS with whatever it has - - Async DataSource arrives after agent's first pass → verify re-entry with new content highlighted - -### Environment requirements: -```bash -export GOOGLE_SEARCH_API_KEY=... -export GOOGLE_CSE_CX=... -export GOOGLE_API_KEY=... -export APIFY_TOKEN=... -export OPENAI_API_KEY=... -pip install langgraph # if not already installed -``` diff --git a/app/ai/CLAUDE.md b/app/ai/CLAUDE.md deleted file mode 100644 index e8b2944..0000000 --- a/app/ai/CLAUDE.md +++ /dev/null @@ -1,239 +0,0 @@ -# LangChain (Python) — Practical Guide - -> A condensed, deployment-first overview of the LangChain Python stack. Focuses on structure, core concepts, useful primitives, best practices, coding standards, and common pitfalls. Inspired by the “Claude.md” sketch you shared. - ---- - -## 2) Deployment‑First - -- **Expose runnables as REST** with LangServe + FastAPI. Start with a single `/invoke` or `/stream` endpoint per chain, then layer auth/rate limits. citeturn0search7 -- **Trace everything** to LangSmith from day one for debugging and offline evaluation. Store datasets and evaluators early. citeturn0search8turn0search18 -- **Environment isolation:** pin model providers and versions, keep model config in code (temperature, max tokens), and set timeouts/retries. (See callbacks and retry helpers.) 
citeturn0search0 - ---- - -## 3) Core Principles - -- **Runnable interface + LCEL composition.** Build chains declaratively and get streaming, retries, parallelism, and batch for free. -- **Retrieval as a first‑class boundary.** Wrap vector stores as **retrievers**; swap implementations without touching prompts. -- **Structured outputs.** Bind schemas so models return typed objects; prefer `with_structured_output(...)` or Pydantic parsers. -- **Streaming UX.** Stream tokens and tool events via callbacks for responsive UIs. citeturn0search4turn0search22 -- **Agents when needed.** For tool‑using workflows or multi‑step control, use LangGraph‑based agents. - ---- - -## 4) Tool Usage (Primitives & Concepts) - -### 4.1 Models and LCEL -```python -from langchain_openai import ChatOpenAI -from langchain_core.prompts import ChatPromptTemplate -from langchain_core.output_parsers import StrOutputParser - -prompt = ChatPromptTemplate.from_messages([ - ("system", "You are a concise assistant."), - ("user", "{question}") -]) - -model = ChatOpenAI(model="gpt-4o-mini", temperature=0) -chain = prompt | model | StrOutputParser() - -print(chain.invoke({"question": "What is LCEL?"})) -``` - -LCEL composes `Runnable` stages with `|`, and the same chain supports `.invoke`, `.stream`, `.batch`, and `.ainvoke`. citeturn0search1 - -### 4.2 Retrieval -```python -from langchain_community.vectorstores import FAISS -from langchain_openai import OpenAIEmbeddings -from langchain_core.runnables import RunnableParallel, RunnablePassthrough - -emb = OpenAIEmbeddings() -vs = FAISS.from_texts(["alpha", "beta about LCEL", "gamma docs"], emb) -retriever = vs.as_retriever() # standard retriever interface - -rag = ( - {"context": retriever, "question": RunnablePassthrough()} - | ChatPromptTemplate.from_template("Use context: {context}\nQuestion: {question}") - | model - | StrOutputParser() -) -``` - -Use the retriever interface instead of calling the vector store directly for portability. 
citeturn0search2 - -### 4.3 Tools for Agents -```python -from langchain_core.tools import tool - -@tool -def search_docs(q: str) -> str: - "Search internal docs and return a short snippet." - return "stub result" -``` - -Create tools with `@tool` to package a function and schema for agent/tool‑calling models. citeturn0search20 - -### 4.4 Structured Output -```python -from pydantic import BaseModel, Field - -class Answer(BaseModel): - verdict: bool = Field(..., description="true if answer is found in context") - gist: str - -typed_chain = (prompt | model).with_structured_output(Answer) -result = typed_chain.invoke({"question": "Is LCEL declarative?"}) -``` - -Use `with_structured_output` or `PydanticOutputParser` to get validated objects. citeturn0search23turn0search17 - -### 4.5 Streaming and Callbacks -```python -from langchain_core.callbacks import StreamingStdOutCallbackHandler - -streaming_model = ChatOpenAI(streaming=True, callbacks=[StreamingStdOutCallbackHandler()]) -list(streaming_model.stream([("user", "stream please")])) # yields chunks -``` - -Streaming is enabled via model support and callback handlers. citeturn0search4turn0search15 - ---- - -## 5) Best Practices - -- **Keep chains stateless;** pass state explicitly or attach to graph state, not globals. Use retrievers not raw store calls. citeturn0search2 -- **Prefer LCEL over ad‑hoc Python.** You get retry, streaming, parallel, and tracing hooks consistently. citeturn0search1 -- **Validate outputs.** Bind JSON schema or Pydantic models; fail fast on parse errors. citeturn0search23 -- **Stream user‑facing flows.** Surface intermediate events for better UX and lower perceived latency. citeturn0search4 -- **Evaluate continuously.** Use LangSmith datasets and evaluators to prevent regressions. citeturn0search24 -- **Deploy early with LangServe.** Treat chains as APIs; version them and pin configs. 
citeturn0search7 - ---- - -## 6) Coding Standards - -- **Type everything** (tool inputs, outputs, chain configs). Prefer `TypedDict` or `pydantic` models for IO. citeturn0search23 -- **Consistent message handling.** Always construct prompts with `ChatPromptTemplate` and message tuples to avoid format drift. citeturn0search10 -- **Callbacks policy.** Register standard handlers for tracing, streaming, and error logging across all runnables. citeturn0search0 -- **Separation of concerns.** Keep prompt templates, models, and parsers in distinct modules; prefer dependency injection in tests. -- **Async first.** Use `.ainvoke` and `.astream` where IO bound; avoid mixing sync and async in the same call path. - -**References:** Concepts overview; Callbacks; LCEL. citeturn0search10turn0search0turn0search1 - ---- - -## 7) Common Pitfalls and Anti‑Patterns - -- **Using deprecated agent APIs.** Modern agents should be built on **LangGraph**; migrate away from older `AgentExecutor` stacks. citeturn0search3 -- **Incorrect interrupt or state updates** in agent workflows. Model the state machine explicitly with graph nodes and typed state. citeturn0search9 -- **Assuming free‑form text outputs.** Without schema binding, outputs drift and downstream code breaks. Use structured outputs. citeturn0search23 -- **Tight coupling to a single vector store.** Wrap behind the retriever interface to swap FAISS/Milvus/etc. without code churn. citeturn0search19 -- **No streaming or tracing.** Harder debugging and poor UX; enable callbacks and LangSmith from the start. citeturn0search4turn0search8 - ---- - -# Pipeline structure - -# Fact Checking Pipeline - Architecture and Data Model - -This document explains the fact checking pipeline using: - -- The updated architecture diagrams (multi modal intake, claim extraction per modality, tools, final LLM). -- The Pydantic schema that defines the data contracts between each step. 
- -It is meant as a deployment friendly, implementation ready description of how data flows from the user message to the final fact check answer and analytics. - ---- - -## 1. High level architecture - -At a high level the system treats **claims** as the main entity. -All modalities (text, links, images, audio, video) are converted into text segments from which claims are extracted. - -1. **Channel intake and multi modal preprocessing** - - - Input from WhatsApp ("ZAP"), web, etc. - - User can send: - - text - - links (text-only or text + image) - - images - - audio - - video - - A preprocessor outside the LLM pipeline turns this into: - - a canonical "original message text" for the full message - - a list of typed text segments, each with its origin - (original text, link article, image caption/OCR, audio transcript, video transcript) - -2. **LLM pipeline (text only, claim centered)** - - Once we have textual representations, the core pipeline runs: - - 1. **User input step** - - Registers the original message as seen by the pipeline. - - Builds `CommonPipelineData` (message id, text, locale, timestamp). - 2. **Context expansion** - - Detects links in the message. - - Extracts and enriches content for each link (scraping, readers, APIs). - - Produces an expanded context block and `EnrichedLink` objects. - - Produces `ClaimExtractionInput` records for each text source that will be used in claim extraction (original text, link context, image captions, transcripts, etc). - 3. **Claim extraction (LLM)** - - For each `ClaimExtractionInput`, uses an LLM to extract fact checkable claims. - - Each claim is created as an `ExtractedClaim` with a `ClaimSource` that tells which modality and which source produced it. - - All claims from all modalities are merged into a single list. - 4. **Evidence gathering (multi step, per claim)** - - For each claim: - - Hit fact checking APIs. - - Do web searches. - - Call internal tools (for example via MCP). 
- - Aggregate all evidence into an `EnrichedClaim` per claim id. - - Collect all of them in an `EvidenceRetrievalResult`. - 5. **Final adjudication (LLM)** - - A stronger model receives: - - original message text (from `CommonPipelineData`) - - the full set of claims - - evidence per claim - - any additional context - - Produces a `FactCheckResult` with a user friendly explanation and per claim discussion (initially as rich text, can be structured later). - -3. **Output and analytics** - - - The final answer is sent back to the user (for example through WhatsApp). - - A rich analytics snapshot is stored with: - - All step outputs (claims, evidence, adjudication). - - All citations and enriched links. - - Engineering timings and token usage per step. - - Analytics entries are indexed **per claim**, but only written after the final adjudication, so each record has the full context of the decision. - ---- - -## 2. Cross cutting data and engineering analytics - -Two helper models travel across many steps and are not tied to a single stage. 
- -### 2.1 CommonPipelineData - -```python -from typing import Optional -from pydantic import BaseModel, Field, ConfigDict - -class CommonPipelineData(BaseModel): - """Common data that is crucial for the fact-checking pipeline but is used at several different steps""" - message_id: str = Field(..., description="Internal id for the request") - message_text: str = Field(..., description="Original message text") - - locale: str = Field(default="pt-BR", description="Language locale") - timestamp: Optional[str] = Field(None, description="When the message was sent") - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "message_id": "msg-2024-09-20-001", - "message_text": "I heard that vaccine X causes infertility in women, is this true?", - "locale": "pt-BR", - "timestamp": "2024-09-20T15:30:00Z", - } - } - ) -``` \ No newline at end of file diff --git a/app/ai/PIPELINE.md b/app/ai/PIPELINE.md deleted file mode 100644 index cd970eb..0000000 --- a/app/ai/PIPELINE.md +++ /dev/null @@ -1,406 +0,0 @@ -# Fact-Checking Pipeline - -## Overview - -The fact-checking pipeline is a multi-stage, streaming system that receives user content (text, links, images, video transcripts), extracts verifiable claims, gathers evidence from multiple sources, and returns structured verdicts with citations. - -The pipeline is designed for high parallelism using a **fire-and-forget streaming** model: stages do not wait for each other to fully complete before kicking off downstream work. As soon as a claim is extracted, evidence retrieval begins for that claim — without waiting for all other claims to be extracted. 
- ---- - -## Pipeline Stages - -``` -Input: List[DataSource] - │ - ▼ -[Stage 1] Link Context Expansion - – Scrape URLs found in text - – Produce new DataSources with scraped content - │ - ▼ -[Stage 2] Claim Extraction (parallel per DataSource) - – LLM extracts fact-checkable claims from each source - – Produces ExtractedClaim objects - │ - ▼ -[Stage 3] Evidence Retrieval (parallel per claim, per gatherer) - – Google Fact-Check API - – Web Search (Google Custom Search) - – Produces EnrichedClaim (claim + citations) - │ - ├─────────────────────────────────┐ - ▼ ▼ -[Stage 4A] Adjudication [Stage 4B] Adjudication with Search - (evidence-based, primary) (OpenAI real-time search, fallback) - │ │ - └──────────────┬──────────────────┘ - ▼ - Output: FactCheckResult -``` - -If no claims are extracted from any source, a **no-claims fallback** is triggered, returning a user-friendly explanation. - ---- - -## Stage 1 — Link Context Expansion - -**File**: `pipeline/link_context_expander.py` - -### What it does - -Finds URLs in the original message text, scrapes their content, and adds that content as additional `DataSource` objects so downstream stages can also fact-check claims from the linked pages. - -### Input - -`DataSource` objects with `source_type = "original_text"`. - -### Process - -1. Extracts URLs using a regex pattern (`https?://...`), stripping trailing punctuation. -2. Submits one async scraping job per URL via `ThreadPoolManager`. -3. Enforces per-link and total timeouts; skips URLs that time out. - -### Output - -`List[DataSource]` with `source_type = "link_context"`. 
Each source includes: -- Scraped page text -- URL and domain metadata -- Social media platform info (author, likes, shares) when available -- Success/error status flag - -### Configuration - -| Key | Description | -|-----|-------------| -| `max_links_to_expand` | Max number of links processed | -| `link_content_expander_timeout_per_link` | Per-URL timeout | -| `link_content_expander_timeout_total` | Timeout for all link expansion | - ---- - -## Stage 2 — Claim Extraction - -**File**: `pipeline/claim_extractor.py` - -### What it does - -Uses an LLM to identify and normalize fact-checkable claims from a data source. Only concrete, verifiable assertions are extracted — not opinions, greetings, or vague statements. - -### Input - -`ClaimExtractionInput`: -```python -data_source: DataSource # text content + source_type + metadata -``` - -### Process - -1. Selects a prompt based on `source_type`: - - `original_text` → general claim extraction - - `link_context` → web article extraction - - `image` → OCR/visual description extraction - - `video_transcript` → transcript extraction -2. Builds a LangChain LCEL chain: `prompt | model.with_structured_output(...)`. -3. Normalizes claims — e.g., "X is being shared as Y" → extracts "Y happened". -4. Assigns each claim a UUID and tracks its originating source. -5. Deduplicates and removes empty claims. - -**Extraction directives**: prefer fewer, richer claims over many vague ones. Every claim must be self-contained (include WHO, WHAT, WHEN, WHERE) and relate to something independently verifiable. 
- -### Output - -`ClaimExtractionOutput`: -```python -data_source: DataSource -claims: List[ExtractedClaim] - -# ExtractedClaim fields: -id: str # UUID -text: str # Normalized claim text -source: ClaimSource # Reference to originating DataSource -entities: List[str] # Named entities extracted -llm_comment: str # LLM reasoning on why claim is fact-checkable -``` - ---- - -## Stage 3 — Evidence Retrieval - -**File**: `pipeline/evidence_retrieval.py` - -### What it does - -Queries multiple external sources in parallel to gather citations and evidence for each extracted claim. - -### Input - -`EvidenceRetrievalInput`: -```python -claims: List[ExtractedClaim] -``` - -### Process - -For each claim, all configured evidence gatherers run concurrently with a per-gatherer timeout. Errors or timeouts on one gatherer do not block the others. - -**Gatherer 1 — Google Fact-Check API** (`context/google_factcheck_gatherer.py`) -- Queries `https://factchecktools.googleapis.com/v1alpha1/claims:search`. -- Returns structured fact-checks from organizations such as PolitiFact, FactCheck.org, Agência Lupa, etc. -- Translates English ratings to the verdict vocabulary used by the pipeline: - - "True / Correct / Mostly True" → `Verdadeiro` - - "False / Incorrect / Pants on Fire" → `Falso` - - "Misleading / Missing Context" → `Fora de Contexto` - - "Unverifiable / Satire" → `Fontes insuficientes para verificar` - -**Gatherer 2 — Web Search** (`context/web_search_gatherer.py`) -- Queries Google Custom Search API. -- Supports language filtering (`pt-BR`) and trusted domain constraints. -- Returns title, URL, snippet, and domain. - -After all gatherers complete, citations are optionally: -- Deduplicated by URL. -- Filtered by quality (minimum snippet length, required fields). 
- -### Output - -`EvidenceRetrievalResult`: -```python -claim_evidence_map: Dict[str, EnrichedClaim] -# maps claim_id → EnrichedClaim - -# EnrichedClaim adds to ExtractedClaim: -citations: List[Citation] - -# Citation fields: -url: str -title: str -publisher: str -citation_text: str # Relevant excerpt -source: str # Which gatherer produced this -rating: Optional[str] # Fact-check rating label -rating_comment: str # Additional context -date: Optional[str] # Publication/review date -``` - ---- - -## Stage 4A — Adjudication (Primary) - -**File**: `pipeline/judgement.py` - -### What it does - -Uses an LLM to reason over each claim and its gathered citations, then produces a structured verdict with justification. - -### Input - -`AdjudicationInput`: -```python -sources_with_claims: List[DataSourceWithClaims] -# each contains: data_source + list of EnrichedClaims (claim + citations) -additional_context: Optional[str] -``` - -### Process - -1. Formats all claims and their citations into a structured text block. -2. Builds a LangChain LCEL chain with `structured_output` binding. -3. Calls the LLM with the current date, the formatted evidence, and any additional context. -4. Maps LLM output back to original claim IDs (by ID, with positional fallback). - -**Adjudication directives** (from `pipeline/prompts.py`): -- Assess individual claims first, then evaluate overall context. -- Look for temporal/spatial descontextualization (true fact, wrong time or place). -- Prioritize specialized fact-checking organizations. -- Favor more recent sources when evidence conflicts. -- For atemporal facts (math, definitions), internal knowledge may supplement. -- Cite sources using `[1]`, `[2]`, `[3]` inline notation in the justification. 
- -### Verdict categories - -| Verdict | Portuguese | Meaning | -|---------|-----------|---------| -| True | Verdadeiro | Confirmed by reliable evidence | -| False | Falso | Contradicted by reliable sources | -| Out of Context | Fora de Contexto | Factually true but misleadingly framed | -| Insufficient Sources | Fontes insuficientes para verificar | Cannot confirm or refute | - -### Output - -`FactCheckResult`: -```python -results: List[DataSourceResult] -overall_summary: Optional[str] # 3–4 line summary across all verdicts -sources_with_claims: List[...] # Full lineage for traceability - -# DataSourceResult fields: -data_source_id: str -source_type: str -claim_verdicts: List[ClaimVerdict] - -# ClaimVerdict fields: -claim_id: str -claim_text: str -verdict: str # One of the 4 categories above -justification: str # Reasoning with inline [N] citations -citations_used: List[int] # Indices of citations used -``` - ---- - -## Stage 4B — Adjudication with Search (Fallback) - -**File**: `pipeline/adjudication_with_search.py` - -### What it does - -An alternative adjudication path that uses OpenAI's real-time web search API. It is submitted as a fire-and-forget job in parallel with the primary adjudication, and used as the final result only if primary adjudication fails or returns empty results. - -### Input - -`List[DataSourceWithExtractedClaims]` — claims **without** pre-gathered evidence. - -### Process - -1. Formats claims grouped by data source into a prompt. -2. Calls the OpenAI Responses API with `tools=[{"type": "web_search"}]` and structured output. -3. OpenAI performs its own web searches during reasoning. -4. Handles UTF-8 encoding issues and repairs malformed JSON in responses. - -### Trade-offs vs. 
primary adjudication - -| | Primary (4A) | With Search (4B) | -|--|--|--| -| Search control | Full (we choose queries/sources) | Delegated to OpenAI | -| Latency | Depends on evidence retrieval | Single API call | -| Reliability | Depends on our gatherers | Depends on OpenAI | -| Cost | Multiple API calls | One call | - -### When it is used - -The function `_chose_fact_checking_result()` in `main_pipeline.py` selects between the two paths: it uses 4A by default, and falls back to 4B when 4A returns no verdicts. - ---- - -## No-Claims Fallback - -**File**: `pipeline/no_claims_fallback.py` - -### When it is triggered - -After claim extraction, if no valid claims were found across all data sources. - -### What it does - -Calls a lightweight LLM with a friendly prompt that explains why the content does not contain fact-checkable claims. Common reasons include: -- Personal opinions without verifiable assertions. -- Greetings or casual conversation. -- Questions without implicit claims. -- Vague statements lacking specific details. - -### Output - -`NoClaimsFallbackOutput`: -```python -explanation: str # User-friendly message explaining the situation -original_text: str # Echo of the input -``` - -The pipeline wraps this in a `FactCheckResult` with `results=[]` and sets `overall_summary` to the explanation. - ---- - -## Orchestration and Parallelism - -**Files**: `main_pipeline.py`, `async_code.py`, `threads/thread_utils.py` - -### Entry point - -```python -run_fact_check_pipeline( - data_sources: List[DataSource], - config: PipelineConfig, - steps: PipelineSteps, - analytics: ..., - message_id: str, -) -> FactCheckResult -``` - -### Fire-and-forget streaming - -The core async function `fire_and_forget_streaming_pipeline()` uses a `ThreadPoolManager` (singleton, priority queue) to execute work without blocking: - -1. Claim extraction jobs are submitted immediately for all `original_text` sources. -2. Link expansion runs in parallel. -3. 
As each claim extraction completes, evidence retrieval jobs are submitted for those claims without waiting for other extractions. -4. As link expansion completes, claim extraction is triggered for each expanded source. -5. Adjudication with search (4B) is submitted as a background fire-and-forget task. -6. The main loop waits only for all evidence retrieval to finish, then proceeds to adjudication (4A). - -### Job priorities - -| Operation | Priority | -|-----------|----------| -| Claim extraction | 10 (highest) | -| Adjudication with search | 8 | -| Link expansion (pipeline) | 6 | -| Link context expanding | 5 | -| Evidence retrieval | 3 (lowest) | - -### Error handling and resilience - -| Failure | Behavior | -|---------|----------| -| Evidence gatherer timeout | Empty citations for that gatherer; continue | -| Link expansion timeout | Skip that link; continue with others | -| Primary adjudication fails | Fall back to adjudication with search (4B) | -| No claims extracted | Use no-claims fallback | -| Malformed JSON from LLM | Attempt JSON repair before raising | - ---- - -## Extensibility - -### PipelineSteps protocol - -`steps.py` defines a `PipelineSteps` protocol with one method per pipeline stage: - -```python -expand_links_from_sources(...) -extract_claims_from_all_sources(...) -get_evidence_gatherers(...) -gather_evidence(...) -handle_no_claims_fallback(...) -adjudicate_claims(...) -adjudicate_claims_with_search(...) -``` - -A `DefaultPipelineSteps` implementation is provided and used by default. Swapping it out allows custom implementations for testing or alternative integrations. - -### EvidenceGatherer protocol - -```python -class EvidenceGatherer(Protocol): - async def gather(claim: ExtractedClaim) -> List[Citation] - def gather_sync(claim: ExtractedClaim) -> List[Citation] - source_name: str -``` - -Any class implementing this interface can be added as a new evidence source without touching the pipeline logic. 
- ---- - -## Configuration Reference - -`PipelineConfig` controls all pipeline behavior: - -| Key | Controls | -|-----|---------| -| `claim_extraction_llm_config` | LLM used for claim extraction | -| `adjudication_llm_config` | LLM used for adjudication | -| `fallback_llm_config` | LLM used for no-claims fallback | -| `max_links_to_expand` | How many URLs to scrape | -| `link_content_expander_timeout_per_link` | Per-URL scraping timeout | -| `link_content_expander_timeout_total` | Total link expansion timeout | -| `enable_adjudication_with_search` | Toggle fallback adjudication path | diff --git a/app/ai/__init__.py b/app/ai/__init__.py deleted file mode 100644 index 8ea1606..0000000 --- a/app/ai/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -AI Module for Fact-Checking Pipeline - -This module contains all AI/LLM-related functionality for the fact-checking system. - -Submodules: -- main_pipeline: Main pipeline orchestration -- pipeline: Individual pipeline steps (claim extraction, evidence gathering, adjudication) -- claim_extractor: Extract claims from text -- adjudicator: Adjudicate claims with LLM -""" - -from .main_pipeline import run_fact_check_pipeline - - -__all__ = [ - "run_fact_check_pipeline" -] diff --git a/app/ai/async_code.py b/app/ai/async_code.py deleted file mode 100644 index 2816606..0000000 --- a/app/ai/async_code.py +++ /dev/null @@ -1,447 +0,0 @@ -""" -parallel execution utilities for fact-checking pipeline. - -provides utilities to run pipeline steps in parallel using ThreadPoolManager, -with support for streaming results and progress tracking. 
-""" - -import logging -from typing import List, Callable, TypeVar, Dict, Any, Optional -from app.observability.analytics import AnalyticsCollector - -from app.ai.threads.thread_utils import ( - ThreadPoolManager, - OperationType, - map_threaded_async, -) -from app.models import ( - DataSource, - ClaimExtractionInput, - ClaimExtractionOutput, - ExtractedClaim, - EvidenceRetrievalInput, - EnrichedClaim, - EvidenceRetrievalResult, - DataSourceWithExtractedClaims, -) - -logger = logging.getLogger(__name__) - -T = TypeVar("T") -R = TypeVar("R") - -async def parallel_claim_extraction( - data_sources: List[DataSource], - extract_fn: Callable[[ClaimExtractionInput], ClaimExtractionOutput], - manager: Optional[ThreadPoolManager] = None, -) -> List[ClaimExtractionOutput]: - """ - extract claims from all data sources in parallel. - - args: - data_sources: list of data sources to extract claims from - extract_fn: function that extracts claims from a single source - manager: thread pool manager (uses singleton if None) - - returns: - list of claim extraction outputs in same order as input sources - - example: - >>> async def extract_claims(input_data): - ... # your claim extraction logic - ... return ClaimExtractionOutput(...) - >>> - >>> results = await parallel_claim_extraction( - ... data_sources=sources, - ... extract_fn=extract_claims - ... 
) - """ - if manager is None: - manager = ThreadPoolManager.get_instance() - - logger.info(f"starting parallel claim extraction for {len(data_sources)} sources") - - # create extraction inputs - extraction_inputs = [ - ClaimExtractionInput(data_source=source) for source in data_sources - ] - - # run extractions in parallel - results = await map_threaded_async( - items=extraction_inputs, - func=extract_fn, - operation_type=OperationType.CLAIMS_EXTRACTION, - manager=manager, - ) - - logger.info( - f"claim extraction completed: {len(results)} outputs, " - f"{sum(len(r.claims) for r in results)} total claims" - ) - - return results - - -def collect_evidence_results( - manager: ThreadPoolManager, - evidence_jobs_submitted: int, - claim_id_to_claim: Dict[str, ExtractedClaim], - pipeline_id: Optional[str] = None, -) -> Dict[str, List[Any]]: - """ - wait for all evidence gathering jobs to complete and collect results. - - waits for evidence jobs to finish and groups citations by claim id. - - args: - manager: thread pool manager - evidence_jobs_submitted: total number of evidence jobs submitted - claim_id_to_claim: dict of claims by id (to initialize citation storage) - - returns: - dict mapping claim_id to list of citations - """ - # initialize citations dict for all claims - claim_citations: Dict[str, List[Any]] = { - claim_id: [] for claim_id in claim_id_to_claim - } - - evidence_completed = 0 - - while evidence_completed < evidence_jobs_submitted: - try: - # wait for next completed evidence gathering (10 second timeout) - _job_id, result = manager.wait_next_completed( - operation_type=OperationType.LINK_EVIDENCE_RETRIEVER, - timeout=10.0, - raise_on_error=False, # don't raise on individual failures - pipeline_id=pipeline_id, - ) - - evidence_completed += 1 - - # result is a tuple: (claim_id, citations) - if isinstance(result, tuple) and len(result) == 2: - claim_id, citations = result - - # add citations to the appropriate claim - if claim_id in claim_citations and 
isinstance(citations, list): - claim_citations[claim_id].extend(citations) - - logger.info( - f"evidence gathering completed ({evidence_completed}/" - f"{evidence_jobs_submitted}): {len(citations)} citations " - f"for claim {claim_id}" - ) - else: - logger.warning( - f"received citations for unknown claim {claim_id}" - ) - elif isinstance(result, Exception): - logger.error(f"evidence gathering job failed: {result}") - else: - logger.warning(f"unexpected result format: {type(result)}") - - except TimeoutError: - logger.debug("no evidence gathering completed in last 10s, retrying...") - continue - except Exception as e: - logger.error(f"evidence gathering job failed: {e}", exc_info=True) - evidence_completed += 1 - - return claim_citations - - -def fire_evidence_jobs_for_claim( - claim: ExtractedClaim, - evidence_gatherers: List[Any], - manager: ThreadPoolManager, - claim_id_to_claim: Dict[str, ExtractedClaim], - evidence_jobs_by_claim: Dict[str, List[str]], - pipeline_id: Optional[str] = None, -) -> int: - """ - fire individual evidence gathering jobs for a single claim. - - creates one job per evidence gatherer and submits them to the thread pool. 
- - args: - claim: the claim to gather evidence for - evidence_gatherers: list of evidence gatherers to use - manager: thread pool manager - claim_id_to_claim: dict to track claims by id (updated in place) - evidence_jobs_by_claim: dict to track gatherers per claim (updated in place) - pipeline_id: optional pipeline ID for request isolation - - returns: - number of jobs submitted - """ - # track claim for later enrichment - claim_id_to_claim[claim.id] = claim - evidence_jobs_by_claim[claim.id] = [] - - jobs_submitted = 0 - - # fire one job per gatherer (max parallelism) - for gatherer in evidence_gatherers: - gatherer_name = ( - gatherer.source_name - if hasattr(gatherer, 'source_name') - else type(gatherer).__name__ - ) - - # create sync wrapper that returns (claim_id, citations) tuple - def gather_with_gatherer( - gatherer_instance=gatherer, - claim_instance=claim, - c_id=claim.id - ): - citations = gatherer_instance.gather_sync(claim_instance) - return (c_id, citations) # return tuple for tracking - - manager.submit( - OperationType.LINK_EVIDENCE_RETRIEVER, - gather_with_gatherer, - pipeline_id=pipeline_id, - ) - jobs_submitted += 1 - evidence_jobs_by_claim[claim.id].append(gatherer_name) - - logger.info( - f"fired {gatherer_name} evidence job for claim: {claim.text[:50]}..." - ) - - return jobs_submitted - - -def fire_and_forget_streaming_pipeline( - data_sources: List[DataSource], - extract_fn: Callable[[ClaimExtractionInput], ClaimExtractionOutput], - evidence_gatherers: List[Any], - analytics: AnalyticsCollector, - link_expansion_fn: Optional[Callable[[List[DataSource]], List[DataSource]]] = None, - manager: Optional[ThreadPoolManager] = None, - pipeline_steps: Optional[Any] = None, - enable_adjudication_with_search: bool = False, - pipeline_id: Optional[str] = None, -) -> tuple[List[ClaimExtractionOutput], Dict[str, EnrichedClaim]]: - """ - extract claims and gather evidence using fire-and-forget streaming pattern. - - workflow: - 1. 
fire claim extraction jobs for original data sources (don't wait) - 2. if link_expansion_fn provided, fire link expansion job (don't wait) - 3. loop while claim extraction or link expansion jobs are pending: - - if claim extraction completes → fire evidence jobs for each claim - - if link expansion completes → fire claim extraction jobs for each expanded source - 4. (optional) fire adjudication with search job after all claims extracted - 5. wait for all evidence gathering jobs to complete - 6. group citations by claim id and build enriched claims - - args: - data_sources: list of original data sources to extract claims from - extract_fn: function that extracts claims from a single source - evidence_gatherers: list of evidence gatherers (e.g., WebSearchGatherer, GoogleFactCheckGatherer) - analytics: analytics collector for tracking pipeline metrics - link_expansion_fn: optional function that expands links from data sources, - returns list of new DataSource objects - manager: thread pool manager (uses singleton if None) - pipeline_steps: pipeline steps instance for calling adjudication (required if enable_adjudication_with_search=True) - enable_adjudication_with_search: if True, fires adjudication with search job after claim extraction - - returns: - tuple of (claim_outputs, enriched_claims_map) - - example: - >>> claim_outputs, enriched_claims = fire_and_forget_streaming_pipeline( - ... data_sources=sources, - ... extract_fn=extract_claims, - ... evidence_gatherers=[WebSearchGatherer(), GoogleFactCheckGatherer()], - ... link_expansion_fn=expand_links, - ... pipeline_steps=steps, - ... enable_adjudication_with_search=True - ... 
) - """ - if manager is None: - manager = ThreadPoolManager.get_instance() - - logger.info(f"starting fire-and-forget pipeline for {len(data_sources)} sources") - - # step 1: fire claim extraction jobs for original data sources (don't wait) - claim_extraction_jobs_submitted = 0 - for source in data_sources: - extraction_input = ClaimExtractionInput(data_source=source) - manager.submit( - OperationType.CLAIMS_EXTRACTION, - extract_fn, - extraction_input, - pipeline_id=pipeline_id, - ) - claim_extraction_jobs_submitted += 1 - - # step 2: fire link expansion pipeline job if provided (don't wait) - link_expansion_pending = False - if link_expansion_fn is not None: - manager.submit( - OperationType.LINK_EXPANSION_PIPELINE, - link_expansion_fn, - data_sources, - pipeline_id=pipeline_id, - ) - link_expansion_pending = True - logger.info("fired link expansion pipeline job") - - logger.info(f"fired {claim_extraction_jobs_submitted} claim extraction jobs") - - # track results - claim_outputs: List[ClaimExtractionOutput] = [] - claim_id_to_claim: Dict[str, ExtractedClaim] = {} - - # track evidence jobs: map (claim_id, gatherer_name) -> job - evidence_jobs_submitted = 0 - evidence_jobs_by_claim: Dict[str, List[str]] = {} # claim_id -> list of gatherer names - - # step 3: loop while claim extraction or link expansion jobs are pending - claim_extractions_completed = 0 - - while claim_extractions_completed < claim_extraction_jobs_submitted or link_expansion_pending: - # check for completed claim extraction - try: - _job_id, output = manager.wait_next_completed( - operation_type=OperationType.CLAIMS_EXTRACTION, - timeout=0.1, # non-blocking check - raise_on_error=True, - pipeline_id=pipeline_id, - ) - - claim_extractions_completed += 1 - claim_outputs.append(output) - - logger.info( - f"claim extraction completed ({claim_extractions_completed}/" - f"{claim_extraction_jobs_submitted}): {len(output.claims)} claims " - f"from source {output.data_source.id}" - ) - - # immediately fire 
INDIVIDUAL evidence gathering jobs for each claim - for claim in output.claims: - jobs_fired = fire_evidence_jobs_for_claim( - claim=claim, - evidence_gatherers=evidence_gatherers, - manager=manager, - claim_id_to_claim=claim_id_to_claim, - evidence_jobs_by_claim=evidence_jobs_by_claim, - pipeline_id=pipeline_id, - ) - evidence_jobs_submitted += jobs_fired - - except TimeoutError: - # no claim extraction completed, check link expansion - pass - except Exception as e: - logger.error(f"claim extraction job failed: {e}", exc_info=True) - claim_extractions_completed += 1 - - # check for completed link expansion pipeline - if link_expansion_pending: - try: - _job_id, expanded_sources = manager.wait_next_completed( - operation_type=OperationType.LINK_EXPANSION_PIPELINE, - timeout=0.1, # non-blocking check - raise_on_error=True, - pipeline_id=pipeline_id, - ) - - link_expansion_pending = False - - # handle None or non-list results - if expanded_sources is None: - logger.warning("link expansion returned None - no sources expanded") - expanded_sources = [] - elif not isinstance(expanded_sources, list): - logger.error(f"link expansion returned unexpected type: {type(expanded_sources)}") - expanded_sources = [] - - logger.info(f"link expansion pipeline completed: {len(expanded_sources)} sources expanded") - #add new link data sources to Analytics - analytics.populate_from_data_sources(expanded_sources) - - - # fire claim extraction jobs for each expanded source - for source in expanded_sources: - extraction_input = ClaimExtractionInput(data_source=source) - manager.submit( - OperationType.CLAIMS_EXTRACTION, - extract_fn, - extraction_input, - pipeline_id=pipeline_id, - ) - claim_extraction_jobs_submitted += 1 - logger.info(f"fired claim extraction for expanded source: {source.id}") - - except TimeoutError: - # no link expansion completed yet - pass - except Exception as e: - logger.error(f"link expansion pipeline job failed: {e}", exc_info=True) - link_expansion_pending = 
False - - logger.info( - f"all claim extractions completed. waiting for {evidence_jobs_submitted} " - f"evidence gathering jobs" - ) - - # step 3.5 (optional): fire adjudication with search job - adjudication_job_submitted = False - if enable_adjudication_with_search: - if pipeline_steps is None: - logger.warning("adjudication with search enabled but no pipeline_steps provided - skipping") - else: - logger.info("firing adjudication with search job") - - # convert ClaimExtractionOutput to DataSourceWithExtractedClaims - sources_with_claims = [ - DataSourceWithExtractedClaims( - data_source=output.data_source, - extracted_claims=output.claims - ) - for output in claim_outputs - ] - - logger.info(f"converted {len(sources_with_claims)} claim outputs to DataSourceWithExtractedClaims") - - # fire adjudication with search job (fire and forget) - manager.submit( - OperationType.ADJUDICATION_WITH_SEARCH, - pipeline_steps.adjudicate_claims_with_search, - sources_with_claims, - pipeline_id=pipeline_id, - ) - - adjudication_job_submitted = True - logger.info("adjudication with search job submitted (retrieve later with wait_next_completed)") - - # step 4: wait for all evidence gathering jobs and group by claim - claim_citations = collect_evidence_results( - manager, evidence_jobs_submitted, claim_id_to_claim, pipeline_id - ) - - # step 4: build enriched claims from grouped citations - enriched_claims: Dict[str, EnrichedClaim] = {} - for claim_id, claim in claim_id_to_claim.items(): - enriched_claims[claim_id] = EnrichedClaim( - id=claim.id, - text=claim.text, - source=claim.source, - entities=claim.entities, - llm_comment=claim.llm_comment, - citations=claim_citations[claim_id], - ) - - logger.info( - f"fire-and-forget pipeline completed: {len(claim_outputs)} outputs, " - f"{len(enriched_claims)} enriched claims, " - f"{sum(len(c) for c in claim_citations.values())} total citations" - ) - - return claim_outputs, enriched_claims \ No newline at end of file diff --git 
a/app/ai/context/__init__.py b/app/ai/context/__init__.py deleted file mode 100644 index 303db07..0000000 --- a/app/ai/context/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .protocol import EvidenceGatherer \ No newline at end of file diff --git a/app/ai/context/protocol.py b/app/ai/context/protocol.py deleted file mode 100644 index 99bfcc7..0000000 --- a/app/ai/context/protocol.py +++ /dev/null @@ -1,44 +0,0 @@ - -from typing import List, Protocol -from abc import abstractmethod - -from app.models import ( - ExtractedClaim, - Citation, -) - -# ===== EVIDENCE GATHERER PROTOCOL ===== - -class EvidenceGatherer(Protocol): - """ - Protocol defining the interface for evidence gatherers. - - Any evidence source (web search, fact-check API, database, etc.) - must implement this interface to be pluggable into the pipeline. - """ - - @abstractmethod - async def gather(self, claim: ExtractedClaim) -> List[Citation]: - """ - Gather evidence citations for a given claim. - - Args: - claim: The claim to gather evidence for - - Returns: - List of citations found (can be empty if no evidence found) - """ - - @abstractmethod - def gather_sync(self, claim: ExtractedClaim) -> List[Citation]: - """ - Gather evidence citations for a given claim in an sync way - """ - - @property - @abstractmethod - def source_name(self) -> str: - """ - Returns the name of this evidence source. - Used for citation.source field. - """ \ No newline at end of file diff --git a/app/ai/context/web/CLAUDE.MD b/app/ai/context/web/CLAUDE.MD deleted file mode 100644 index 0424053..0000000 --- a/app/ai/context/web/CLAUDE.MD +++ /dev/null @@ -1,397 +0,0 @@ -# Google Custom Search JSON API - -This file explains how the project uses the Google Custom Search JSON API to programmatically search the web using FastAPI in Python. Sensitive keys are stored in `.env` and should never be version-controlled. 
- -## Environment Variables - -```env -GOOGLE_SEARCH_API_KEY=your_google_search_api_key -GOOGLE_CSE_CX=your_cse_cx_key -```` - -* `GOOGLE_SEARCH_API_KEY`: your Google Custom Search API key. -* `GOOGLE_CSE_CX`: Programmable Search Engine ID. - -In the code, we use these variables via `os.environ` or through the project settings system. - -Simple example: - -```python -import os - -GOOGLE_SEARCH_API_KEY = os.environ["GOOGLE_SEARCH_API_KEY"] -GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"] -``` - -## Base Endpoint - -All requests are made to: - -```text -https://www.googleapis.com/customsearch/v1 -``` - -Minimum parameters for a search: - -* `key`: `GOOGLE_SEARCH_API_KEY` -* `cx`: `GOOGLE_CSE_CX` -* `q`: search query string - -Conceptual URL example: - -```text -https://www.googleapis.com/customsearch/v1 - ?key=GOOGLE_SEARCH_API_KEY - &cx=GOOGLE_CSE_CX - &q=vacina+causa+autismo -``` - -## Python Search Function - -Example of a Python wrapper using `httpx`: - -```python -import os -from typing import Any, Dict, List, Optional - -import httpx - -GOOGLE_SEARCH_API_KEY = os.environ["GOOGLE_SEARCH_API_KEY"] -GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"] -GOOGLE_SEARCH_BASE_URL = "https://www.googleapis.com/customsearch/v1" - - -class GoogleSearchError(Exception): - pass - - -async def google_search( - query: str, - *, - num: int = 10, - start: int = 1, - site_search: Optional[str] = None, - site_search_filter: Optional[str] = None, # "i" include, "e" exclude - date_restrict: Optional[str] = None, # e.g., "d7", "m1" - sort: Optional[str] = None, # e.g., "date:r:20240101:20241231" - file_type: Optional[str] = None, # e.g., "pdf" - safe: Optional[str] = None, # "active" or "off" - language: Optional[str] = None, # e.g., "lang_pt" -) -> List[Dict[str, Any]]: - """ - performs a search using google custom search api and returns the list of result items - """ - - params: Dict[str, Any] = { - "key": GOOGLE_SEARCH_API_KEY, - "cx": GOOGLE_CSE_CX, - "q": query, - "num": num, - "start": 
start, - } - - if site_search: - params["siteSearch"] = site_search - if site_search_filter: - params["siteSearchFilter"] = site_search_filter - if date_restrict: - params["dateRestrict"] = date_restrict - if sort: - params["sort"] = sort - if file_type: - params["fileType"] = file_type - if safe: - params["safe"] = safe - if language: - params["lr"] = language - - async with httpx.AsyncClient(timeout=15) as client: - response = await client.get(GOOGLE_SEARCH_BASE_URL, params=params) - - if response.status_code != 200: - raise GoogleSearchError( - f"google search error: {response.status_code} {response.text}" - ) - - data = response.json() - return data.get("items", []) -``` - -all comments start with lowercase letters, as agreed. - -## Integration with FastAPI - -Example of a FastAPI endpoint that exposes the search function: - -```python -from typing import Any, Dict, List, Optional - -from fastapi import APIRouter, HTTPException, Query - -router = APIRouter(prefix="/search", tags=["search"]) - - -@router.get("/") -async def search_endpoint( - q: str = Query(..., description="search text"), - num: int = Query(5, ge=1, le=10), - site: Optional[str] = Query(None, description="domain filter, e.g., who.int"), - last_days: Optional[int] = Query(None, ge=1, description="limit to the last N days"), -) -> Dict[str, Any]: - """ - search endpoint that wraps the google custom search api - """ - - date_restrict = None - if last_days is not None: - date_restrict = f"d{last_days}" - - try: - items = await google_search( - query=q, - num=num, - site_search=site, - site_search_filter="i" if site else None, - date_restrict=date_restrict, - ) - except GoogleSearchError as exc: - raise HTTPException(status_code=502, detail=str(exc)) - - # here you can normalize the fields relevant to your project - results: List[Dict[str, Any]] = [] - for item in items: - results.append( - { - "title": item.get("title"), - "link": item.get("link"), - "snippet": item.get("snippet"), - "displayLink": 
item.get("displayLink"), - } - ) - - return {"query": q, "results": results} -``` - -## Supported Filters and Recommended Usage - -Below is a summary of the most useful filters for the project. - -### 1. Site Filter - -Two main approaches: - -#### 1.1 `siteSearch` and `siteSearchFilter` Parameters - -* `siteSearch`: domain to filter by, e.g., `who.int` -* `siteSearchFilter`: - - * `"i"` to include only that domain - * `"e"` to exclude that domain - -Example: - -```python -items = await google_search( - query="vaccines cause autism", - site_search="who.int", - site_search_filter="i", -) -``` - -#### 1.2 `site:` Operator in Query String - -Another option is to include `site:` directly in the query: - -```python -items = await google_search( - query="vaccines cause autism site:who.int", -) -``` - -For multiple domains: - -```python -items = await google_search( - query="vaccines cause autism (site:who.int OR site:cdc.gov)", -) -``` - -In general, to keep the code cleaner, prefer `siteSearch` and `siteSearchFilter` when the domain filter comes from config or structured input. - -### 2. Date Filters - -#### 2.1 `dateRestrict` (Relative Range) - -Possible formats: - -* `dN` for the last N days, e.g., `d7` -* `wN` for the last N weeks, e.g., `w4` -* `mN` for the last N months, e.g., `m1` -* `yN` for the last N years, e.g., `y1` - -Example, last 7 days: - -```python -items = await google_search( - query="vaccines cause autism", - date_restrict="d7", -) -``` - -#### 2.2 `sort=date:r:YYYYMMDD:YYYYMMDD` (Absolute Range) - -To limit results to an explicit date range: - -```python -items = await google_search( - query="vaccines cause autism", - sort="date:r:20240101:20241231", -) -``` - -Format: - -* `sort = "date:r:START:END"` -* dates in `YYYYMMDD` format - -You can generate these dates from `datetime` objects in Python. - -### 3. Language - -Use the `lr` parameter to restrict by language: - -* `lr="lang_pt"` for Portuguese -* `lr="lang_en"` for English -* etc. 
- -Example: - -```python -items = await google_search( - query="vaccines cause autism", - language="lang_pt", -) -``` - -### 4. Safe Search - -Use the `safe` parameter: - -* `"active"` to enable safe search -* `"off"` to disable it - -Example: - -```python -items = await google_search( - query="vaccines cause autism", - safe="active", -) -``` - -### 5. File Type - -Use `fileType` to filter by file type: - -* example: `fileType="pdf"` - -```python -items = await google_search( - query="vaccines cause autism", - file_type="pdf", -) -``` - -You can also reinforce this in the query string: - -```python -items = await google_search( - query="vaccines cause autism filetype:pdf", -) -``` - -### 6. Pagination - -The API supports pagination via: - -* `num`: number of results per call, max 10 -* `start`: index of the first result, starting at 1 - -Example, get the second page of 10 results: - -```python -items = await google_search( - query="vaccines cause autism", - num=10, - start=11, -) -``` - -You can build an internal pagination layer in the project to translate pages 1, 2, 3 into `start` values. - -## Project Best Practices - -* never log the actual API key (`GOOGLE_SEARCH_API_KEY`) -* ensure the `.env` file is never committed to the repository -* if the key leaks, revoke and generate a new one in the Google Console -* centralize the `google_search` function in this module so that any change to parameters or filtering policy is handled in one place - - -Você pode adicionar esse exemplo de saída JSON bruta ao final do seu arquivo de documentação como uma nova seção. 
Aqui está a versão traduzida e integrada da sugestão, com título e contexto explicando o uso prático para parsing: - -## Example of Raw API Response - -Below is a simplified example of a real response returned by the Google Custom Search API, so that developers can understand the structure and fields for JSON parsing: - -```json -{ - "kind": "customsearch#search", - "searchInformation": { - "searchTime": 0.45, - "formattedSearchTime": "0.45", - "totalResults": "225000", - "formattedTotalResults": "225,000" - }, - "items": [ - { - "kind": "customsearch#result", - "title": "Lauro Jardim: Inaugurada por Michelle Bolsonaro, Casa do Autista ...", - "link": "https://www.facebook.com/jornaloglobo/posts/1177801651049989/", - "displayLink": "www.facebook.com", - "snippet": "Aug 18, 2025 ... preso, e o Bolsonaro seria o Presidente por legitimidade !...", - "pagemap": { - "metatags": [ - { - "og:title": "O Globo", - "og:description": "Casa do Autista em BC é contestada em auditoria" - } - ] - } - }, - { - "kind": "customsearch#result", - "title": "'MARCHA ESTRANHA' | O autismo é uma condição do ...", - "link": "https://www.instagram.com/p/DMNGzWuMVTW/", - "displayLink": "www.instagram.com", - "snippet": "Jul 17, 2025 ... Bolsonaro preso | O presidente dos Estados Unidos, Donald Trump..." - } - // ...more items - ] -} -```` - -### Tip - -To extract the most relevant fields, you can access: - -```python -for item in data["items"]: - title = item["title"] - link = item["link"] - snippet = item.get("snippet", "") - display_link = item.get("displayLink", "") -``` - -This structure should be sufficient for most use cases where the goal is to list and display search results on your frontend or log them for analysis. 
- - diff --git a/app/ai/context/web/__init__.py b/app/ai/context/web/__init__.py deleted file mode 100644 index e98ef62..0000000 --- a/app/ai/context/web/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .google_search import searchGoogleClaim, google_search, GoogleSearchError -from .serper_search import serper_search, SerperSearchError -from .web_search_gatherer import WebSearchGatherer - - -__all__ = [ - "searchGoogleClaim", - "google_search", - "GoogleSearchError", - "serper_search", - "SerperSearchError", - "WebSearchGatherer" -] diff --git a/app/ai/context/web/google_search.py b/app/ai/context/web/google_search.py deleted file mode 100644 index 126be08..0000000 --- a/app/ai/context/web/google_search.py +++ /dev/null @@ -1,348 +0,0 @@ -""" -google custom search api integration for web search. -falls back to serper.dev when google fails and SERPER_API_KEY is configured. -""" - -import os -import logging -from typing import Any, Dict - -import httpx - -from app.ai.context.web.serper_search import ( - serper_search, - SerperSearchError, - _is_serper_configured, -) - -logger = logging.getLogger(__name__) - - -class GoogleSearchError(Exception): - """exception raised when google search api fails""" - pass - - -async def searchGoogleClaim(claim: str, maxResults: int = 10, timeout: float = 45.0) -> dict: - """ - search google for information about a claim using Google Custom Search API. - falls back to serper.dev when google fails and SERPER_API_KEY is configured. 
- - args: - claim: the claim text to search for - maxResults: maximum number of search results to return (max 10) - timeout: timeout in seconds for the search operation (default: 45.0) - - returns: - dict with search results and metadata - """ - result = await _searchGoogleClaimInternal(claim, maxResults, timeout) - - if not result["success"] and _is_serper_configured(): - logger.warning(f"google claim search failed ({result.get('error')}), trying serper fallback") - fallback = await _searchSerperClaimFallback(claim, maxResults, timeout) - if fallback["success"]: - return fallback - - return result - - -async def _searchGoogleClaimInternal(claim: str, maxResults: int = 10, timeout: float = 45.0) -> dict: - """internal google claim search — original logic without fallback.""" - try: - logger.info(f"searching google for claim: {claim[:100]}...") - logger.info(f"search timeout: {timeout}s, max results: {maxResults}") - - api_key = os.environ.get("GOOGLE_SEARCH_API_KEY", "") - cse_cx = os.environ.get("GOOGLE_CSE_CX", "") - - if not api_key or not cse_cx: - logger.error("missing GOOGLE_SEARCH_API_KEY or GOOGLE_CSE_CX environment variables") - return { - "success": False, - "claim": claim, - "results": [], - "total_results": 0, - "error": "missing google search api credentials" - } - - params = { - "key": api_key, - "cx": cse_cx, - "q": claim, - "num": min(maxResults, 10), - "lr": "lang_pt", - } - - base_url = "https://www.googleapis.com/customsearch/v1" - async with httpx.AsyncClient(timeout=timeout) as client: - response = await client.get(base_url, params=params) - - if response.status_code != 200: - error_msg = f"google api returned {response.status_code}: {response.text[:100]}" - logger.error(error_msg) - return { - "success": False, - "claim": claim, - "results": [], - "total_results": 0, - "error": error_msg - } - - data = response.json() - items = data.get("items", []) - - if not items: - logger.info("google search completed: no results found") - return { - 
"success": False, - "claim": claim, - "results": [], - "total_results": 0, - "error": "no search results found" - } - - searchResults = [] - for position, item in enumerate(items, start=1): - searchResults.append({ - "title": item.get("title", ""), - "url": item.get("link", ""), - "description": item.get("snippet", ""), - "position": position, - "domain": item.get("displayLink", "") - }) - - logger.info(f"google search completed: {len(searchResults)} results found") - - return { - "success": True, - "claim": claim, - "results": searchResults, - "total_results": len(searchResults), - "metadata": { - "search_engine": "google", - "language": "pt", - "api": "google-custom-search" - }, - "error": None - } - - except httpx.TimeoutException: - logger.error(f"google search timeout after {timeout}s") - print(f"\n[GOOGLE SEARCH] TIMEOUT after {timeout}s") - print(f"[GOOGLE SEARCH] claim was: {claim[:100]}...") - return { - "success": False, - "claim": claim, - "results": [], - "total_results": 0, - "error": f"timeout after {timeout}s" - } - except Exception as e: - logger.error(f"google search error: {e}") - print(f"\n[GOOGLE SEARCH] ERROR: {type(e).__name__}: {str(e)[:100]}") - return { - "success": False, - "claim": claim, - "results": [], - "total_results": 0, - "error": str(e) - } - - -async def _searchSerperClaimFallback(claim: str, maxResults: int = 10, timeout: float = 45.0) -> dict: - """fallback claim search using serper.dev, returns same dict format as google.""" - try: - logger.info(f"serper fallback: searching for claim: {claim[:100]}...") - items = await serper_search( - query=claim, - num=min(maxResults, 10), - language="lang_pt", - timeout=timeout, - ) - - if not items: - return { - "success": False, - "claim": claim, - "results": [], - "total_results": 0, - "error": "no search results found (serper fallback)" - } - - searchResults = [] - for position, item in enumerate(items, start=1): - searchResults.append({ - "title": item.get("title", ""), - "url": 
item.get("link", ""), - "description": item.get("snippet", ""), - "position": position, - "domain": item.get("displayLink", "") - }) - - logger.info(f"serper fallback completed: {len(searchResults)} results found") - - return { - "success": True, - "claim": claim, - "results": searchResults, - "total_results": len(searchResults), - "metadata": { - "search_engine": "serper", - "language": "pt", - "api": "serper-dev-fallback" - }, - "error": None - } - - except Exception as e: - logger.error(f"serper fallback also failed: {e}") - return { - "success": False, - "claim": claim, - "results": [], - "total_results": 0, - "error": f"serper fallback failed: {e}" - } - - -async def google_search( - query: str, - *, - num: int = 10, - start: int = 1, - site_search: str | None = None, - site_search_filter: str | None = None, # "i" include, "e" exclude - date_restrict: str | None = None, # e.g., "d7", "m1" - sort: str | None = None, # e.g., "date:r:20240101:20241231" - file_type: str | None = None, # e.g., "pdf" - safe: str | None = None, # "active" or "off" - language: str | None = None, # e.g., "lang_pt" - timeout: float = 15.0, -) -> list[Dict[str, Any]]: - """ - performs a search using google custom search api and returns the list of result items. - falls back to serper.dev when google fails and SERPER_API_KEY is configured. 
- - args: - query: search query string - num: number of results to return (1-10) - start: index of first result (for pagination) - site_search: domain to filter by (e.g., "who.int") - site_search_filter: "i" to include only site_search, "e" to exclude - date_restrict: relative date filter (e.g., "d7" for last 7 days) - sort: date sorting (e.g., "date:r:20240101:20241231") - file_type: filter by file type (e.g., "pdf") - safe: safe search setting ("active" or "off") - language: language restriction (e.g., "lang_pt") - timeout: request timeout in seconds - - returns: - list of search result items from google api (or serper fallback) - """ - try: - return await _google_search_internal( - query, - num=num, start=start, - site_search=site_search, site_search_filter=site_search_filter, - date_restrict=date_restrict, sort=sort, file_type=file_type, - safe=safe, language=language, timeout=timeout, - ) - except (GoogleSearchError, httpx.TimeoutException, Exception) as e: - logger.warning(f"google search failed, trying serper fallback: {e}") - return await _serper_fallback( - e, query, - num=num, - site_search=site_search, site_search_filter=site_search_filter, - date_restrict=date_restrict, language=language, timeout=timeout, - ) - - -async def _google_search_internal( - query: str, - *, - num: int = 10, - start: int = 1, - site_search: str | None = None, - site_search_filter: str | None = None, - date_restrict: str | None = None, - sort: str | None = None, - file_type: str | None = None, - safe: str | None = None, - language: str | None = None, - timeout: float = 15.0, -) -> list[Dict[str, Any]]: - """original google search logic without fallback.""" - api_key = os.environ.get("GOOGLE_SEARCH_API_KEY", "") - cse_cx = os.environ.get("GOOGLE_CSE_CX", "") - - if not api_key or not cse_cx: - raise GoogleSearchError("missing GOOGLE_SEARCH_API_KEY or GOOGLE_CSE_CX") - - params: Dict[str, Any] = { - "key": api_key, - "cx": cse_cx, - "q": query, - "num": num, - "start": start, - } 
- - if site_search: - params["siteSearch"] = site_search - if site_search_filter: - params["siteSearchFilter"] = site_search_filter - if date_restrict: - params["dateRestrict"] = date_restrict - if sort: - params["sort"] = sort - if file_type: - params["fileType"] = file_type - if safe: - params["safe"] = safe - if language: - params["lr"] = language - - base_url = "https://www.googleapis.com/customsearch/v1" - async with httpx.AsyncClient(timeout=timeout) as client: - response = await client.get(base_url, params=params) - - if response.status_code != 200: - raise GoogleSearchError( - f"google search error: {response.status_code} {response.text}" - ) - - data = response.json() - return data.get("items", []) - - -async def _serper_fallback( - original_error: Exception, - query: str, - *, - num: int = 10, - site_search: str | None = None, - site_search_filter: str | None = None, - date_restrict: str | None = None, - language: str | None = None, - timeout: float = 15.0, -) -> list[Dict[str, Any]]: - """attempt serper.dev as fallback; re-raise original error if serper is not configured or also fails.""" - if not _is_serper_configured(): - logger.warning("serper not configured, re-raising original error") - raise original_error - - try: - items = await serper_search( - query=query, - num=num, - site_search=site_search, - site_search_filter=site_search_filter, - date_restrict=date_restrict, - language=language, - timeout=timeout, - ) - logger.info(f"serper fallback succeeded: {len(items)} result(s)") - return items - except Exception as serper_err: - logger.error(f"serper fallback also failed: {serper_err}") - raise original_error from serper_err diff --git a/app/ai/context/web/serper_search.py b/app/ai/context/web/serper_search.py deleted file mode 100644 index a7bd853..0000000 --- a/app/ai/context/web/serper_search.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -serper.dev search api integration — used as fallback when google custom search fails. 
-""" - -import os -import logging -from typing import Any, Dict - -import httpx - -logger = logging.getLogger(__name__) - -SERPER_API_URL = "https://google.serper.dev/search" - -# language code mapping: google lr format → serper hl/gl -_LANGUAGE_MAP: Dict[str, Dict[str, str]] = { - "lang_pt": {"hl": "pt", "gl": "br"}, - "lang_en": {"hl": "en", "gl": "us"}, - "lang_es": {"hl": "es", "gl": "es"}, - "lang_fr": {"hl": "fr", "gl": "fr"}, - "lang_de": {"hl": "de", "gl": "de"}, -} - - -class SerperSearchError(Exception): - """exception raised when serper.dev search api fails""" - pass - - -def _is_serper_configured() -> bool: - """check whether serper api key is available in the environment.""" - return bool(os.environ.get("SERPER_API_KEY", "")) - - -def _build_serper_query( - query: str, - site_search: str | None = None, - site_search_filter: str | None = None, -) -> str: - """prepend site: or -site: operator to query based on google-style params.""" - if not site_search: - return query - - if site_search_filter == "e": - return f"-site:{site_search} {query}" - # default to include - return f"site:{site_search} {query}" - - -async def serper_search( - query: str, - *, - num: int = 10, - site_search: str | None = None, - site_search_filter: str | None = None, - date_restrict: str | None = None, - language: str | None = None, - timeout: float = 15.0, -) -> list[Dict[str, Any]]: - """ - performs a search using serper.dev api and returns results in the same - format as google custom search (items with title, link, snippet, displayLink). 
- - args: - query: search query string - num: number of results to return (1-10) - site_search: domain to filter by (e.g., "who.int") - site_search_filter: "i" to include only site_search, "e" to exclude - date_restrict: relative date filter (e.g., "d7" for last 7 days) - language: language restriction in google format (e.g., "lang_pt") - timeout: request timeout in seconds - - returns: - list of search result items matching google cse item format - """ - api_key = os.environ.get("SERPER_API_KEY", "") - if not api_key: - raise SerperSearchError("missing SERPER_API_KEY") - - effective_query = _build_serper_query(query, site_search, site_search_filter) - - payload: Dict[str, Any] = { - "q": effective_query, - "num": min(num, 10), - } - - # map date_restrict (e.g. "d7") → tbs (e.g. "qdr:d7") - if date_restrict: - payload["tbs"] = f"qdr:{date_restrict}" - - # map language - if language and language in _LANGUAGE_MAP: - lang_cfg = _LANGUAGE_MAP[language] - payload["hl"] = lang_cfg["hl"] - payload["gl"] = lang_cfg["gl"] - - headers = { - "X-API-KEY": api_key, - "Content-Type": "application/json", - } - - async with httpx.AsyncClient(timeout=timeout) as client: - response = await client.post(SERPER_API_URL, json=payload, headers=headers) - - if response.status_code != 200: - raise SerperSearchError( - f"serper search error: {response.status_code} {response.text[:200]}" - ) - - data = response.json() - organic = data.get("organic", []) - - # map serper organic results → google cse item format - items: list[Dict[str, Any]] = [] - for result in organic: - items.append({ - "title": result.get("title", ""), - "link": result.get("link", ""), - "snippet": result.get("snippet", ""), - "displayLink": result.get("domain", ""), - }) - - return items diff --git a/app/ai/context/web/tests/__init__.py b/app/ai/context/web/tests/__init__.py deleted file mode 100644 index a612184..0000000 --- a/app/ai/context/web/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# test module for web search 
gatherer diff --git a/app/ai/context/web/tests/test_google_search_fallback.py b/app/ai/context/web/tests/test_google_search_fallback.py deleted file mode 100644 index 903ee51..0000000 --- a/app/ai/context/web/tests/test_google_search_fallback.py +++ /dev/null @@ -1,231 +0,0 @@ -""" -tests for google search → serper.dev fallback behavior. - -validates: -- google succeeds → serper never called -- google fails + serper succeeds → results from serper -- google timeout + serper succeeds → results from serper -- both fail → original error propagated -- serper not configured → original error propagated - -run with: - pytest app/ai/context/web/tests/test_google_search_fallback.py -v -""" - -import pytest -from unittest.mock import AsyncMock, patch, MagicMock - -import httpx - -from app.ai.context.web.google_search import ( - google_search, - searchGoogleClaim, - GoogleSearchError, -) - - -MOCK_GOOGLE_ITEMS = [ - {"title": "Google Result", "link": "https://google.com/1", "snippet": "from google", "displayLink": "google.com"}, -] - -MOCK_SERPER_ITEMS = [ - {"title": "Serper Result", "link": "https://serper.com/1", "snippet": "from serper", "displayLink": "serper.com"}, -] - - -# ===== google_search() fallback tests ===== - -@pytest.mark.asyncio -async def test_google_succeeds_serper_not_called(): - """when google works, serper should never be called.""" - with patch("app.ai.context.web.google_search._google_search_internal", new_callable=AsyncMock) as mock_google: - with patch("app.ai.context.web.google_search.serper_search", new_callable=AsyncMock) as mock_serper: - mock_google.return_value = MOCK_GOOGLE_ITEMS - - result = await google_search("test query") - - assert result == MOCK_GOOGLE_ITEMS - mock_google.assert_called_once() - mock_serper.assert_not_called() - - -@pytest.mark.asyncio -async def test_google_fails_serper_succeeds(): - """when google raises, fallback to serper and return serper results.""" - with 
patch("app.ai.context.web.google_search._google_search_internal", new_callable=AsyncMock) as mock_google: - with patch("app.ai.context.web.google_search.serper_search", new_callable=AsyncMock) as mock_serper: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=True): - mock_google.side_effect = GoogleSearchError("quota exceeded") - mock_serper.return_value = MOCK_SERPER_ITEMS - - result = await google_search("test query") - - assert result == MOCK_SERPER_ITEMS - mock_serper.assert_called_once() - - -@pytest.mark.asyncio -async def test_google_timeout_serper_succeeds(): - """when google times out, fallback to serper.""" - with patch("app.ai.context.web.google_search._google_search_internal", new_callable=AsyncMock) as mock_google: - with patch("app.ai.context.web.google_search.serper_search", new_callable=AsyncMock) as mock_serper: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=True): - mock_google.side_effect = httpx.TimeoutException("timed out") - mock_serper.return_value = MOCK_SERPER_ITEMS - - result = await google_search("test query") - - assert result == MOCK_SERPER_ITEMS - - -@pytest.mark.asyncio -async def test_both_fail_original_error_raised(): - """when both google and serper fail, original google error is raised.""" - with patch("app.ai.context.web.google_search._google_search_internal", new_callable=AsyncMock) as mock_google: - with patch("app.ai.context.web.google_search.serper_search", new_callable=AsyncMock) as mock_serper: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=True): - mock_google.side_effect = GoogleSearchError("google down") - mock_serper.side_effect = Exception("serper also down") - - with pytest.raises(GoogleSearchError, match="google down"): - await google_search("test query") - - -@pytest.mark.asyncio -async def test_serper_not_configured_original_error_raised(): - """when serper is not configured, original google error is 
raised.""" - with patch("app.ai.context.web.google_search._google_search_internal", new_callable=AsyncMock) as mock_google: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=False): - mock_google.side_effect = GoogleSearchError("missing keys") - - with pytest.raises(GoogleSearchError, match="missing keys"): - await google_search("test query") - - -@pytest.mark.asyncio -async def test_fallback_passes_params_to_serper(): - """verify that relevant params are forwarded to serper_search.""" - with patch("app.ai.context.web.google_search._google_search_internal", new_callable=AsyncMock) as mock_google: - with patch("app.ai.context.web.google_search.serper_search", new_callable=AsyncMock) as mock_serper: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=True): - mock_google.side_effect = GoogleSearchError("fail") - mock_serper.return_value = [] - - await google_search( - "test query", - num=5, - site_search="who.int", - site_search_filter="i", - date_restrict="d7", - language="lang_pt", - timeout=30.0, - ) - - mock_serper.assert_called_once_with( - query="test query", - num=5, - site_search="who.int", - site_search_filter="i", - date_restrict="d7", - language="lang_pt", - timeout=30.0, - ) - - -# ===== searchGoogleClaim() fallback tests ===== - -@pytest.mark.asyncio -async def test_claim_google_succeeds_no_fallback(): - """when google claim search succeeds, no fallback needed.""" - with patch("app.ai.context.web.google_search._searchGoogleClaimInternal", new_callable=AsyncMock) as mock_internal: - mock_internal.return_value = { - "success": True, - "claim": "test", - "results": [{"title": "r1"}], - "total_results": 1, - "metadata": {"api": "google-custom-search"}, - "error": None, - } - - result = await searchGoogleClaim("test") - - assert result["success"] is True - assert result["metadata"]["api"] == "google-custom-search" - - -@pytest.mark.asyncio -async def test_claim_google_fails_serper_succeeds(): - 
"""when google claim search fails, serper fallback returns results.""" - with patch("app.ai.context.web.google_search._searchGoogleClaimInternal", new_callable=AsyncMock) as mock_internal: - with patch("app.ai.context.web.google_search._searchSerperClaimFallback", new_callable=AsyncMock) as mock_fallback: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=True): - mock_internal.return_value = { - "success": False, - "claim": "test", - "results": [], - "total_results": 0, - "error": "quota exceeded", - } - mock_fallback.return_value = { - "success": True, - "claim": "test", - "results": [{"title": "serper result"}], - "total_results": 1, - "metadata": {"api": "serper-dev-fallback"}, - "error": None, - } - - result = await searchGoogleClaim("test") - - assert result["success"] is True - assert result["metadata"]["api"] == "serper-dev-fallback" - mock_fallback.assert_called_once() - - -@pytest.mark.asyncio -async def test_claim_both_fail_returns_google_error(): - """when both fail, return the original google failure response.""" - with patch("app.ai.context.web.google_search._searchGoogleClaimInternal", new_callable=AsyncMock) as mock_internal: - with patch("app.ai.context.web.google_search._searchSerperClaimFallback", new_callable=AsyncMock) as mock_fallback: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=True): - google_result = { - "success": False, - "claim": "test", - "results": [], - "total_results": 0, - "error": "google down", - } - mock_internal.return_value = google_result - mock_fallback.return_value = { - "success": False, - "claim": "test", - "results": [], - "total_results": 0, - "error": "serper also failed", - } - - result = await searchGoogleClaim("test") - - # returns original google error when serper also fails - assert result["success"] is False - assert result["error"] == "google down" - - -@pytest.mark.asyncio -async def test_claim_serper_not_configured_returns_google_error(): 
- """when serper is not configured, return google failure directly.""" - with patch("app.ai.context.web.google_search._searchGoogleClaimInternal", new_callable=AsyncMock) as mock_internal: - with patch("app.ai.context.web.google_search._is_serper_configured", return_value=False): - google_result = { - "success": False, - "claim": "test", - "results": [], - "total_results": 0, - "error": "missing credentials", - } - mock_internal.return_value = google_result - - result = await searchGoogleClaim("test") - - assert result["success"] is False - assert result["error"] == "missing credentials" diff --git a/app/ai/context/web/tests/test_serper_search.py b/app/ai/context/web/tests/test_serper_search.py deleted file mode 100644 index 3504a40..0000000 --- a/app/ai/context/web/tests/test_serper_search.py +++ /dev/null @@ -1,328 +0,0 @@ -""" -tests for serper.dev search client. - -validates: -- parameter mapping (language, date, site_search → query) -- response mapping (organic → items format) -- error handling (missing key, non-200, timeout) -- query building helpers - -run with: - pytest app/ai/context/web/tests/test_serper_search.py -v -""" - -import pytest -from unittest.mock import AsyncMock, patch, MagicMock - -import httpx - -from app.ai.context.web.serper_search import ( - serper_search, - SerperSearchError, - _is_serper_configured, - _build_serper_query, - SERPER_API_URL, -) - - -# ===== _is_serper_configured ===== - -def test_is_configured_when_key_set(): - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - assert _is_serper_configured() is True - - -def test_is_not_configured_when_key_missing(): - with patch.dict("os.environ", {}, clear=True): - assert _is_serper_configured() is False - - -def test_is_not_configured_when_key_empty(): - with patch.dict("os.environ", {"SERPER_API_KEY": ""}): - assert _is_serper_configured() is False - - -# ===== _build_serper_query ===== - -def test_build_query_no_site(): - assert _build_serper_query("test query") == 
"test query" - - -def test_build_query_no_site_explicit_none(): - assert _build_serper_query("test query", site_search=None) == "test query" - - -def test_build_query_site_include(): - result = _build_serper_query("vaccines", site_search="who.int", site_search_filter="i") - assert result == "site:who.int vaccines" - - -def test_build_query_site_exclude(): - result = _build_serper_query("vaccines", site_search="fake.com", site_search_filter="e") - assert result == "-site:fake.com vaccines" - - -def test_build_query_site_default_include(): - """when filter is not 'e', default to include.""" - result = _build_serper_query("vaccines", site_search="who.int") - assert result == "site:who.int vaccines" - - -# ===== serper_search — param mapping ===== - -@pytest.mark.asyncio -async def test_serper_search_basic_params(): - """verify basic query and num are sent correctly.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - await serper_search("test query", num=5) - - mock_client.post.assert_called_once() - call_kwargs = mock_client.post.call_args - payload = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") - assert payload["q"] == "test query" - assert payload["num"] == 5 - - -@pytest.mark.asyncio -async def test_serper_search_date_restrict_mapping(): - """verify date_restrict is mapped to tbs with qdr: prefix.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with 
patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - await serper_search("query", date_restrict="d7") - - payload = mock_client.post.call_args.kwargs.get("json") or mock_client.post.call_args[1].get("json") - assert payload["tbs"] == "qdr:d7" - - -@pytest.mark.asyncio -async def test_serper_search_language_mapping(): - """verify language is mapped to hl and gl.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - await serper_search("query", language="lang_pt") - - payload = mock_client.post.call_args.kwargs.get("json") or mock_client.post.call_args[1].get("json") - assert payload["hl"] == "pt" - assert payload["gl"] == "br" - - -@pytest.mark.asyncio -async def test_serper_search_unknown_language_ignored(): - """unknown language codes should not add hl/gl.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - await 
serper_search("query", language="lang_xx") - - payload = mock_client.post.call_args.kwargs.get("json") or mock_client.post.call_args[1].get("json") - assert "hl" not in payload - assert "gl" not in payload - - -@pytest.mark.asyncio -async def test_serper_search_num_capped_at_10(): - """num should be capped at 10.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - await serper_search("query", num=20) - - payload = mock_client.post.call_args.kwargs.get("json") or mock_client.post.call_args[1].get("json") - assert payload["num"] == 10 - - -# ===== serper_search — response mapping ===== - -@pytest.mark.asyncio -async def test_serper_search_response_mapping(): - """verify organic results are mapped to google cse item format.""" - serper_response = { - "organic": [ - { - "title": "Test Title", - "link": "https://example.com/article", - "snippet": "A test snippet", - "domain": "example.com", - "position": 1, - }, - { - "title": "Second Result", - "link": "https://other.com/page", - "snippet": "Another snippet", - "domain": "other.com", - "position": 2, - }, - ] - } - - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = serper_response - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ 
= AsyncMock(return_value=False) - - items = await serper_search("test") - - assert len(items) == 2 - - # verify first item mapping - assert items[0]["title"] == "Test Title" - assert items[0]["link"] == "https://example.com/article" - assert items[0]["snippet"] == "A test snippet" - assert items[0]["displayLink"] == "example.com" - - # verify second item mapping - assert items[1]["title"] == "Second Result" - assert items[1]["link"] == "https://other.com/page" - - -@pytest.mark.asyncio -async def test_serper_search_empty_organic(): - """empty organic list returns empty items.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - items = await serper_search("test") - - assert items == [] - - -# ===== serper_search — error handling ===== - -@pytest.mark.asyncio -async def test_serper_search_missing_key(): - """should raise SerperSearchError when key is missing.""" - with patch.dict("os.environ", {}, clear=True): - with pytest.raises(SerperSearchError, match="missing SERPER_API_KEY"): - await serper_search("test") - - -@pytest.mark.asyncio -async def test_serper_search_non_200(): - """should raise SerperSearchError on non-200 response.""" - mock_response = MagicMock() - mock_response.status_code = 429 - mock_response.text = "rate limit exceeded" - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = 
AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - with pytest.raises(SerperSearchError, match="429"): - await serper_search("test") - - -@pytest.mark.asyncio -async def test_serper_search_timeout(): - """should propagate httpx.TimeoutException.""" - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.side_effect = httpx.TimeoutException("timed out") - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - with pytest.raises(httpx.TimeoutException): - await serper_search("test") - - -@pytest.mark.asyncio -async def test_serper_search_sends_correct_headers(): - """verify X-API-KEY header is sent.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "my-secret-key"}): - with patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - await serper_search("test") - - call_kwargs = mock_client.post.call_args - headers = call_kwargs.kwargs.get("headers") or call_kwargs[1].get("headers") - assert headers["X-API-KEY"] == "my-secret-key" - assert headers["Content-Type"] == "application/json" - - -@pytest.mark.asyncio -async def test_serper_search_posts_to_correct_url(): - """verify request goes to serper api url.""" - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"organic": []} - - with patch.dict("os.environ", {"SERPER_API_KEY": "test-key"}): - with 
patch("app.ai.context.web.serper_search.httpx.AsyncClient") as mock_client_cls: - mock_client = AsyncMock() - mock_client.post.return_value = mock_response - mock_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) - mock_client_cls.return_value.__aexit__ = AsyncMock(return_value=False) - - await serper_search("test") - - call_args = mock_client.post.call_args - assert call_args[0][0] == SERPER_API_URL diff --git a/app/ai/context/web/tests/web_search_gatherer_test.py b/app/ai/context/web/tests/web_search_gatherer_test.py deleted file mode 100644 index b10c616..0000000 --- a/app/ai/context/web/tests/web_search_gatherer_test.py +++ /dev/null @@ -1,846 +0,0 @@ -# -*- coding: utf-8 -*- -""" -tests for the web search gatherer evidence collection step. - -these tests validate: -- the structure of outputs -- citation extraction from search results -- timeout handling -- error handling -- source attribution -- time profiling via decorator - -IMPORTANT: these tests make REAL calls to the Google Custom Search API. -set GOOGLE_SEARCH_API_KEY and GOOGLE_CSE_CX in your environment before running. - -run with: - pytest app/ai/context/web/tests/web_search_gatherer_test.py -v -s - -the -s flag shows stdout so you can see the search results and time profiling logs. 
-""" - -from typing import List - -import pytest - -from app.ai.context.web import WebSearchGatherer -from app.models import Citation, ClaimSource, ExtractedClaim - - -# ===== HELPER FUNCTIONS ===== - -def create_test_claim(text: str, claim_id: str = "test-claim-1") -> ExtractedClaim: - """create a test extracted claim.""" - return ExtractedClaim( - id=claim_id, - text=text, - source=ClaimSource( - source_type="original_text", - source_id="msg-001" - ), - entities=[], - llm_comment="test claim" - ) - - -def print_citations(citations: List[Citation], test_name: str): - """print citations for debugging.""" - print("\n" + "=" * 80) - print(f"TEST: {test_name}") - print("=" * 80) - print(f"\nFound {len(citations)} citation(s):\n") - - for i, citation in enumerate(citations, 1): - print(f" Citation {i}:") - print(f" Title: {citation.title[:60]}...") - print(f" URL: {citation.url}") - print(f" Publisher: {citation.publisher}") - print(f" Source: {citation.source}") - if citation.citation_text: - text_preview = ( - citation.citation_text[:80] - if len(citation.citation_text) > 80 - else citation.citation_text - ) - print(f" Citation Text: {text_preview}...") - print() - - -def validate_citation_structure(citation: Citation): - """validate that a citation has the correct structure.""" - # required fields - assert citation.url, "citation URL should not be empty" - assert citation.title, "citation title should not be empty" - assert citation.publisher is not None, "citation should have a publisher" - assert citation.citation_text is not None, "citation should have citation text" - assert citation.source is not None, "citation should have a source" - - # type checks - assert isinstance(citation.url, str), "URL should be a string" - assert isinstance(citation.title, str), "title should be a string" - assert isinstance(citation.publisher, str), "publisher should be a string" - assert isinstance(citation.citation_text, str), "citation text should be string" - assert 
isinstance(citation.source, str), "source should be a string" - - # source should always be google_web_search - assert citation.source == "google_web_search", "source should be google_web_search" - - # URL should be valid - assert citation.url.startswith("http"), "URL should start with http" - - -def validate_citations_list(citations: List[Citation]): - """validate that a list of citations has the correct structure.""" - assert isinstance(citations, list), "result should be a list" - - for citation in citations: - validate_citation_structure(citation) - - -# ===== ASYNC GATHER TESTS ===== - -@pytest.mark.asyncio -async def test_basic_web_search_gather(): - """test basic web search with real API call.""" - # setup - gatherer = WebSearchGatherer(max_results=3, timeout=45.0) - claim = create_test_claim("COVID-19 vaccine safety studies") - - # execute - real API call - citations = await gatherer.gather(claim) - - # validate - print_citations(citations, "Basic Web Search Gather") - validate_citations_list(citations) - assert len(citations) > 0, "should return at least one citation" - - # verify all citations have proper content - for citation in citations: - assert len(citation.title) > 0, "title should not be empty" - assert citation.url.startswith("http"), "URL should be valid" - - -@pytest.mark.asyncio -async def test_web_search_gather_scientific_claim(): - """test web search with a scientific claim.""" - # setup - gatherer = WebSearchGatherer(max_results=5, timeout=45.0) - claim = create_test_claim("climate change causes global temperature increase") - - # execute - real API call - citations = await gatherer.gather(claim) - - # validate - print_citations(citations, "Scientific Claim Web Search") - validate_citations_list(citations) - assert len(citations) > 0, "should find citations for scientific claim" - - -@pytest.mark.asyncio -async def test_web_search_gather_portuguese_claim(): - """test web search with portuguese text.""" - # setup - gatherer = 
WebSearchGatherer(max_results=3, timeout=45.0) - claim = create_test_claim("vacinas contra COVID-19 são seguras") - - # execute - real API call - citations = await gatherer.gather(claim) - - # validate - print_citations(citations, "Portuguese Claim Web Search") - validate_citations_list(citations) - assert len(citations) > 0, "should find citations for portuguese claim" - - -@pytest.mark.asyncio -async def test_web_search_gather_with_timeout(): - """test web search with very short timeout to trigger timeout handling.""" - # setup - very short timeout - gatherer = WebSearchGatherer(max_results=5, timeout=0.001) - claim = create_test_claim("test claim with short timeout") - - # execute - should timeout - citations = await gatherer.gather(claim) - - # validate - should return empty list on timeout - print("\n" + "=" * 80) - print("TEST: Web Search with Timeout") - print("=" * 80) - print(f"\nTimeout handled gracefully, returned {len(citations)} citations\n") - - assert len(citations) == 0, "should return empty list on timeout" - - -@pytest.mark.asyncio -async def test_web_search_gather_multiple_results(): - """test web search requesting multiple results.""" - # setup - gatherer = WebSearchGatherer(max_results=10, timeout=60.0) - claim = create_test_claim("python programming language features") - - # execute - real API call - citations = await gatherer.gather(claim) - - # validate - print_citations(citations, "Multiple Results Web Search") - validate_citations_list(citations) - assert len(citations) > 0, "should return multiple citations" - - # verify each citation has unique URL - urls = [c.url for c in citations] - assert len(urls) == len(set(urls)), "all URLs should be unique" - - -@pytest.mark.asyncio -async def test_citation_metadata_fields(): - """test that citation metadata fields are set correctly.""" - # setup - gatherer = WebSearchGatherer(max_results=2, timeout=45.0) - claim = create_test_claim("artificial intelligence machine learning") - - # execute - real 
API call - citations = await gatherer.gather(claim) - - # validate - assert len(citations) > 0, "should get at least one citation" - - for citation in citations: - # verify metadata fields - assert citation.rating is None, "web search should have None rating" - assert citation.date is None, "web search should have None date" - assert citation.source == "google_web_search", "source should be google_web_search" - - print("\n" + "=" * 80) - print("TEST: Citation Metadata Fields") - print("=" * 80) - print("\n✓ rating field is None (web search doesn't provide ratings)") - print("✓ date field is None (web search doesn't include publication date)") - print("✓ source field is 'google_web_search'") - print() - - -# ===== SYNC GATHER TESTS ===== - -def test_gather_sync_basic(): - """test synchronous gather method with basic claim.""" - # setup - gatherer = WebSearchGatherer(max_results=3, timeout=45.0) - claim = create_test_claim("renewable energy solar power") - - # execute - real API call via sync method - citations = gatherer.gather_sync(claim) - - # validate - print_citations(citations, "Synchronous Gather - Basic") - validate_citations_list(citations) - assert len(citations) > 0, "should return at least one citation" - - -def test_gather_sync_portuguese(): - """test synchronous gather with portuguese claim.""" - # setup - gatherer = WebSearchGatherer(max_results=3, timeout=45.0) - claim = create_test_claim("energia renovável e sustentabilidade") - - # execute - real API call via sync method - citations = gatherer.gather_sync(claim) - - # validate - print_citations(citations, "Synchronous Gather - Portuguese") - validate_citations_list(citations) - assert len(citations) > 0, "should find citations for portuguese claim" - - -def test_gather_sync_with_custom_config(): - """test synchronous gather with custom max_results and timeout.""" - # setup - gatherer = WebSearchGatherer(max_results=5, timeout=60.0) - claim = create_test_claim("quantum computing algorithms") - - # 
execute - real API call via sync method - citations = gatherer.gather_sync(claim) - - # validate - print_citations(citations, "Synchronous Gather - Custom Config") - validate_citations_list(citations) - assert len(citations) > 0, "should return citations with custom config" - - print(f"✓ Used custom config: max_results=5, timeout=60.0") - print(f"✓ Retrieved {len(citations)} citation(s)") - - -def test_gather_sync_timeout_handling(): - """test synchronous gather with timeout.""" - # setup - very short timeout to trigger timeout - gatherer = WebSearchGatherer(max_results=5, timeout=0.001) - claim = create_test_claim("test sync timeout handling") - - # execute - should timeout - citations = gatherer.gather_sync(claim) - - # validate - should return empty list on timeout - print("\n" + "=" * 80) - print("TEST: Synchronous Gather - Timeout Handling") - print("=" * 80) - print(f"\nTimeout handled gracefully, returned {len(citations)} citations\n") - - assert len(citations) == 0, "should return empty list on timeout" - - -def test_gather_sync_multiple_calls(): - """test multiple sequential synchronous gather calls.""" - # setup - gatherer = WebSearchGatherer(max_results=2, timeout=45.0) - - claims = [ - create_test_claim("blockchain technology", "claim-1"), - create_test_claim("machine learning applications", "claim-2"), - create_test_claim("cloud computing services", "claim-3"), - ] - - all_citations = [] - - # execute - multiple sequential calls - for i, claim in enumerate(claims, 1): - print(f"\n--- Sequential Call {i} ---") - citations = gatherer.gather_sync(claim) - all_citations.append(citations) - - # validate each call - validate_citations_list(citations) - assert len(citations) > 0, f"call {i} should return citations" - - # validate overall - print("\n" + "=" * 80) - print("TEST: Synchronous Gather - Multiple Sequential Calls") - print("=" * 80) - print(f"\n✓ Made {len(claims)} sequential calls") - print(f"✓ Total citations retrieved: {sum(len(c) for c in 
all_citations)}") - print() - - assert len(all_citations) == 3, "should have results for all 3 claims" - - -def test_gather_sync_publisher_extraction(): - """test that publisher field is correctly extracted in sync mode.""" - # setup - gatherer = WebSearchGatherer(max_results=3, timeout=45.0) - claim = create_test_claim("space exploration mars missions") - - # execute - real API call via sync method - citations = gatherer.gather_sync(claim) - - # validate - assert len(citations) > 0, "should get at least one citation" - - for citation in citations: - # publisher should be extracted and not empty - assert citation.publisher, "publisher should not be empty" - assert len(citation.publisher) > 0, "publisher should have content" - - print("\n" + "=" * 80) - print("TEST: Synchronous Gather - Publisher Extraction") - print("=" * 80) - print("\n✓ All citations have valid publisher information") - print("Publishers found:") - for citation in citations[:3]: # show first 3 - print(f" - {citation.publisher}") - print() - - -def test_gather_sync_citation_text_content(): - """test that citation text is properly populated in sync mode.""" - # setup - gatherer = WebSearchGatherer(max_results=3, timeout=45.0) - claim = create_test_claim("electric vehicles battery technology") - - # execute - real API call via sync method - citations = gatherer.gather_sync(claim) - - # validate - assert len(citations) > 0, "should get at least one citation" - - for citation in citations: - # citation text should not be empty - assert citation.citation_text, "citation text should not be empty" - assert len(citation.citation_text) > 0, "citation text should have content" - - print("\n" + "=" * 80) - print("TEST: Synchronous Gather - Citation Text Content") - print("=" * 80) - print("\n✓ All citations have non-empty citation text") - print(f"✓ Average text length: " - f"{sum(len(c.citation_text) for c in citations) / len(citations):.0f} chars") - print() - - -# ===== INITIALIZATION AND CONFIGURATION 
TESTS ===== - -def test_gatherer_initialization(): - """test gatherer initialization with custom parameters.""" - # test with custom parameters - gatherer = WebSearchGatherer(max_results=10, timeout=60.0) - - assert gatherer.max_results == 10, "max_results should be set correctly" - assert gatherer.timeout == 60.0, "timeout should be set correctly" - assert gatherer.source_name == "google_web_search", ( - "source name should be google_web_search" - ) - - # test with default parameters - default_gatherer = WebSearchGatherer() - - assert default_gatherer.max_results == 5, "default max_results should be 5" - assert default_gatherer.timeout == 45.0, "default timeout should be 45.0" - - print("\n" + "=" * 80) - print("TEST: Gatherer Initialization") - print("=" * 80) - print("\n✓ Custom parameters: max_results=10, timeout=60.0") - print("✓ Default parameters: max_results=5, timeout=45.0") - print() - - -def test_source_name_property(): - """test that source_name property returns correct value.""" - gatherer = WebSearchGatherer() - - assert gatherer.source_name == "google_web_search", ( - "source_name should be google_web_search" - ) - - print("\n" + "=" * 80) - print("TEST: Source Name Property") - print("=" * 80) - print("\n✓ source_name property returns 'google_web_search'") - print() - - -# ===== TIME PROFILING TEST ===== - -@pytest.mark.asyncio -async def test_time_profiling_decorator(): - """test that time profiling decorator is working.""" - # setup - gatherer = WebSearchGatherer(max_results=2, timeout=45.0) - claim = create_test_claim("time profiling test claim") - - # execute - decorator should log execution time - citations = await gatherer.gather(claim) - - # if we get here without errors, the decorator is working - assert isinstance(citations, list), "should return a list" - - print("\n" + "=" * 80) - print("TEST: Time Profiling Decorator") - print("=" * 80) - print("\n✓ time_profile decorator is applied and working") - print("✓ check logs above for [TIME 
PROFILE] gather completed in X.XXs") - print() - - -# ===== DOMAIN FILTERING TESTS ===== - -def test_build_search_query_no_domains(): - """test query building without domain filtering.""" - # setup - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=None) - - # execute - query = gatherer._build_search_query_with_domains("vaccines cause autism") - - # validate - assert query == "vaccines cause autism", "should return original query when no domains" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - No Domains") - print("=" * 80) - print(f"\nInput: 'vaccines cause autism'") - print(f"Output: '{query}'") - print("✓ No domain filtering applied (fail-open)") - print() - - -def test_build_search_query_empty_domains(): - """test query building with empty domains list.""" - # setup - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=[]) - - # execute - query = gatherer._build_search_query_with_domains("climate change") - - # validate - assert query == "climate change", "should return original query when domains list is empty" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - Empty Domains List") - print("=" * 80) - print(f"\nInput: 'climate change'") - print(f"Output: '{query}'") - print("✓ No domain filtering applied for empty list") - print() - - -def test_build_search_query_single_domain(): - """test query building with single domain.""" - # setup - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=["who.int"]) - - # execute - query = gatherer._build_search_query_with_domains("COVID-19 vaccines") - - # validate - expected = "COVID-19 vaccines (site:who.int)" - assert query == expected, f"should add single domain filter: expected '{expected}', got '{query}'" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - Single Domain") - print("=" * 80) - print(f"\nInput: 'COVID-19 vaccines'") - print(f"Domains: ['who.int']") - print(f"Output: '{query}'") - print("✓ Single 
domain filter applied correctly") - print() - - -def test_build_search_query_multiple_domains(): - """test query building with multiple domains.""" - # setup - domains = ["who.int", "cdc.gov", "gov.br"] - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - - # execute - query = gatherer._build_search_query_with_domains("vaccine safety") - - # validate - expected = "vaccine safety (site:who.int OR site:cdc.gov OR site:gov.br)" - assert query == expected, f"should add multiple domain filters: expected '{expected}', got '{query}'" - - # verify structure - assert "site:who.int" in query, "should contain who.int" - assert "site:cdc.gov" in query, "should contain cdc.gov" - assert "site:gov.br" in query, "should contain gov.br" - assert " OR " in query, "should use OR operator" - assert query.startswith("vaccine safety ("), "should start with original query" - assert query.endswith(")"), "should end with closing parenthesis" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - Multiple Domains") - print("=" * 80) - print(f"\nInput: 'vaccine safety'") - print(f"Domains: {domains}") - print(f"Output: '{query}'") - print("✓ Multiple domain filters with OR operator") - print() - - -def test_build_search_query_domains_with_whitespace(): - """test query building handles domains with whitespace.""" - # setup - domains = [" who.int ", "cdc.gov", " gov.br"] - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - - # execute - query = gatherer._build_search_query_with_domains("test claim") - - # validate - whitespace should be stripped - assert "site:who.int" in query, "should contain trimmed who.int" - assert "site: who.int " not in query, "should not contain whitespace around domain" - - expected = "test claim (site:who.int OR site:cdc.gov OR site:gov.br)" - assert query == expected, f"should strip whitespace: expected '{expected}', got '{query}'" - - print("\n" + "=" * 80) - print("TEST: Build Search 
Query - Domains with Whitespace") - print("=" * 80) - print(f"\nInput: 'test claim'") - print(f"Domains: {domains}") - print(f"Output: '{query}'") - print("✓ Whitespace stripped from domains") - print() - - -def test_build_search_query_domains_with_empty_strings(): - """test query building filters out empty domain strings.""" - # setup - domains = ["who.int", "", " ", "cdc.gov"] - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - - # execute - query = gatherer._build_search_query_with_domains("health data") - - # validate - empty strings should be filtered out - expected = "health data (site:who.int OR site:cdc.gov)" - assert query == expected, f"should filter empty strings: expected '{expected}', got '{query}'" - assert query.count("site:") == 2, "should only have 2 site: operators" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - Filter Empty Strings") - print("=" * 80) - print(f"\nInput: 'health data'") - print(f"Domains: {domains}") - print(f"Output: '{query}'") - print("✓ Empty strings filtered out") - print() - - -def test_build_search_query_all_empty_domains(): - """test query building when all domains are empty/whitespace.""" - # setup - domains = ["", " ", " "] - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - - # execute - query = gatherer._build_search_query_with_domains("test query") - - # validate - should fail-open and return original query - assert query == "test query", "should return original query when all domains are empty" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - All Empty Domains") - print("=" * 80) - print(f"\nInput: 'test query'") - print(f"Domains: {domains}") - print(f"Output: '{query}'") - print("✓ Fail-open behavior when all domains empty") - print() - - -def test_build_search_query_special_characters_in_claim(): - """test query building with special characters in claim text.""" - # setup - domains = ["who.int", "cdc.gov"] - 
gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - - # execute - claim with quotes and special chars - claim_text = 'vaccines "mRNA technology" & safety (2024)' - query = gatherer._build_search_query_with_domains(claim_text) - - # validate - expected = 'vaccines "mRNA technology" & safety (2024) (site:who.int OR site:cdc.gov)' - assert query == expected, f"should preserve special chars: expected '{expected}', got '{query}'" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - Special Characters") - print("=" * 80) - print(f"\nInput: '{claim_text}'") - print(f"Domains: {domains}") - print(f"Output: '{query}'") - print("✓ Special characters preserved in query") - print() - - -def test_build_search_query_subdomain_support(): - """test that root domains will match subdomains.""" - # setup - domains = ["gov.br"] # should match saude.gov.br, anvisa.gov.br, etc. - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - - # execute - query = gatherer._build_search_query_with_domains("Brazilian health policy") - - # validate - expected = "Brazilian health policy (site:gov.br)" - assert query == expected, f"should use root domain: expected '{expected}', got '{query}'" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - Subdomain Support") - print("=" * 80) - print(f"\nInput: 'Brazilian health policy'") - print(f"Domains: {domains}") - print(f"Output: '{query}'") - print("✓ Root domain 'gov.br' will match all *.gov.br subdomains") - print(" (e.g., saude.gov.br, anvisa.gov.br, planalto.gov.br)") - print() - - -def test_build_search_query_many_domains(): - """test query building with many domains.""" - # setup - domains = [ - "who.int", - "cdc.gov", - "gov.br", - "fiocruz.br", - "fapesp.br", - "scielo.br", - "ebc.com.br" - ] - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - - # execute - query = gatherer._build_search_query_with_domains("health research") - - 
# validate - assert query.startswith("health research ("), "should start with original query" - assert query.count("site:") == len(domains), f"should have {len(domains)} site: operators" - assert query.count(" OR ") == len(domains) - 1, f"should have {len(domains) - 1} OR operators" - - for domain in domains: - assert f"site:{domain}" in query, f"should contain site:{domain}" - - print("\n" + "=" * 80) - print("TEST: Build Search Query - Many Domains") - print("=" * 80) - print(f"\nInput: 'health research'") - print(f"Domains: {len(domains)} domains") - print(f"Output length: {len(query)} characters") - print(f"✓ All {len(domains)} domains included with OR operators") - print() - - -@pytest.mark.asyncio -async def test_gather_with_domain_filtering(): - """test actual gather with domain filtering (integration test).""" - # setup - limit to trusted health domains - domains = ["who.int", "cdc.gov"] - gatherer = WebSearchGatherer(max_results=3, timeout=45.0, allowed_domains=domains) - claim = create_test_claim("COVID-19 vaccine effectiveness") - - # execute - real API call with domain filtering - citations = await gatherer.gather(claim) - - # validate - print_citations(citations, "Gather with Domain Filtering") - - # assert we got results - assert len(citations) > 0, "should return at least one citation with domain filtering" - - # verify all results are from allowed domains - for citation in citations: - url = citation.url.lower() - url_domain = citation.publisher.lower() - - # check if citation matches any allowed domain - is_allowed = any( - allowed_domain in url or allowed_domain in url_domain - for allowed_domain in domains - ) - - print(f" URL: {citation.url}") - print(f" Publisher: {citation.publisher}") - print(f" Matches allowed domains: {is_allowed}") - print() - - # assert each citation is from an allowed domain - assert is_allowed, ( - f"citation from {citation.publisher} ({citation.url}) " - f"should match one of the allowed domains: {domains}" - ) - - 
print("✓ Domain filtering applied to search query") - print(f"✓ Allowed domains: {domains}") - print(f"✓ All {len(citations)} citation(s) are from allowed domains") - print() - - -@pytest.mark.asyncio -async def test_gather_with_gov_br_domain_filtering(): - """test gather with gov.br domain to verify subdomain matching (integration test).""" - # setup - restrict to Brazilian government sites - domains = ["gov.br"] - gatherer = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=domains) - claim = create_test_claim("vacinas COVID-19 Brasil") - - # execute - real API call - citations = await gatherer.gather(claim) - - # validate - print_citations(citations, "Gather with gov.br Domain Filtering") - - # assert we got results - assert len(citations) > 0, "should return citations from gov.br domains" - - # verify all results are from gov.br or subdomains - gov_br_count = 0 - for citation in citations: - url = citation.url.lower() - publisher = citation.publisher.lower() - - # check if it's a gov.br domain or subdomain - is_gov_br = "gov.br" in url or "gov.br" in publisher - - print(f" Publisher: {citation.publisher}") - print(f" URL: {citation.url}") - print(f" Is gov.br domain: {is_gov_br}") - - if is_gov_br: - gov_br_count += 1 - # extract subdomain for informational purposes - if "gov.br" in publisher: - subdomain = publisher.split(".gov.br")[0].split(".")[-1] if "." 
in publisher else "root" - print(f" Subdomain detected: {subdomain}") - print() - - # assert it matches gov.br - assert is_gov_br, ( - f"citation from {citation.publisher} should be from gov.br or its subdomains" - ) - - print(f"✓ All {len(citations)} citation(s) are from gov.br or subdomains") - print(f"✓ Examples: saude.gov.br, anvisa.gov.br, planalto.gov.br, etc.") - print() - - -@pytest.mark.asyncio -async def test_gather_without_vs_with_domain_filtering(): - """test comparing results with and without domain filtering (integration test).""" - claim = create_test_claim("climate change global warming") - - # first: gather without filtering (open web) - gatherer_no_filter = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=None) - citations_no_filter = await gatherer_no_filter.gather(claim) - - # second: gather with strict domain filtering - trusted_domains = ["who.int", "nasa.gov", "noaa.gov"] - gatherer_with_filter = WebSearchGatherer(max_results=5, timeout=45.0, allowed_domains=trusted_domains) - citations_with_filter = await gatherer_with_filter.gather(claim) - - print("\n" + "=" * 80) - print("TEST: Gather Without vs With Domain Filtering") - print("=" * 80) - - # validate both returned results - print(f"\nResults without filtering: {len(citations_no_filter)} citation(s)") - print(f"Results with filtering: {len(citations_with_filter)} citation(s)") - - # assert we got results from both - assert len(citations_no_filter) > 0, "should get results without filtering" - assert len(citations_with_filter) > 0, "should get results with filtering" - - # show domains from unfiltered results - print("\nDomains from UNFILTERED search:") - unfiltered_domains = set() - for citation in citations_no_filter[:5]: - domain = citation.publisher - unfiltered_domains.add(domain) - print(f" - {domain}") - - # show domains from filtered results - print(f"\nDomains from FILTERED search (allowed: {trusted_domains}):") - filtered_domains = set() - for citation in 
citations_with_filter: - domain = citation.publisher - filtered_domains.add(domain) - print(f" - {domain}") - - # assert each result matches allowed domains - is_allowed = any( - allowed in citation.url.lower() or allowed in domain.lower() - for allowed in trusted_domains - ) - assert is_allowed, ( - f"filtered result {domain} should match one of {trusted_domains}" - ) - - # compare diversity - print(f"\n✓ Unfiltered search returned {len(unfiltered_domains)} unique domains") - print(f"✓ Filtered search returned {len(filtered_domains)} unique domains") - print(f"✓ All filtered results match allowed domains: {trusted_domains}") - print() - - -# ===== PYTEST CONFIGURATION ===== - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) diff --git a/app/ai/context/web/web_search_gatherer.py b/app/ai/context/web/web_search_gatherer.py deleted file mode 100644 index ddfb3a5..0000000 --- a/app/ai/context/web/web_search_gatherer.py +++ /dev/null @@ -1,175 +0,0 @@ -from typing import List, Optional, Any, Dict -import asyncio -import logging -import os - -import httpx - -from app.models import ( - ExtractedClaim, - Citation, -) - -from app.observability.logger import time_profile, PipelineStep -from app.ai.context.web.serper_search import ( - serper_search, - _is_serper_configured, -) - -logger = logging.getLogger(__name__) - -# ===== WEB SEARCH EVIDENCE GATHERER ===== - -class WebSearchGatherer: - """ - Evidence gatherer that uses Google Custom Search API to find relevant information. - - Searches Google for the claim text and converts top results into citations. - """ - - def __init__(self, max_results: int = 5, timeout: float = 45.0, allowed_domains:list[str] | None = None): - """ - Initialize web search gatherer. 
- - Args: - max_results: Maximum number of search results to retrieve per claim - timeout: Timeout in seconds for web search operations (default: 45.0) - """ - self.max_results = min(max_results, 10) # google api max is 10 - self.timeout = timeout - self.api_key = os.environ.get("GOOGLE_SEARCH_API_KEY", "") - self.cse_cx = os.environ.get("GOOGLE_CSE_CX", "") - self.base_url = "https://www.googleapis.com/customsearch/v1" - if allowed_domains is not None: - self.allowed_domains = allowed_domains - else: - self.allowed_domains = [] - - @property - def source_name(self) -> str: - return "google_web_search" - - @time_profile(PipelineStep.EVIDENCE_RETRIEVAL) - async def gather(self, claim: ExtractedClaim) -> List[Citation]: - """ - Search the web for information about the claim using Google Custom Search API. - Falls back to serper.dev when google fails. - - Args: - claim: The claim to search for - - Returns: - List of citations from search results - """ - try: - print(f"\n[WEB SEARCH] searching for: {claim.text[:80]}...") - print(f"[WEB SEARCH] timeout: {self.timeout}s, max results: {self.max_results}") - - # validate api credentials - if not self.api_key or not self.cse_cx: - print("[WEB SEARCH ERROR] missing GOOGLE_SEARCH_API_KEY or GOOGLE_CSE_CX") - return await self._serper_fallback_gather(claim) - - query_with_domains = self._build_search_query_with_domains(claim.text) - # build request parameters - params: Dict[str, Any] = { - "key": self.api_key, - "cx": self.cse_cx, - "q": query_with_domains, - "num": self.max_results, - } - - # perform search with timeout - async with httpx.AsyncClient(timeout=self.timeout) as client: - response = await client.get(self.base_url, params=params) - - # check response status - if response.status_code != 200: - print(f"[WEB SEARCH ERROR] api returned {response.status_code}: {response.text[:100]}") - return await self._serper_fallback_gather(claim) - - # parse response - data = response.json() - items = data.get("items", []) - - # 
convert search results to citations - citations = self._items_to_citations(items, source="google_web_search") - - print(f"[WEB SEARCH] found {len(citations)} citation(s)") - return citations - - except httpx.TimeoutException: - print(f"\n[WEB SEARCH ERROR] TIMEOUT after {self.timeout}s") - print(f"[WEB SEARCH ERROR] claim was: {claim.text[:100]}...") - return await self._serper_fallback_gather(claim) - except Exception as e: - print(f"\n[WEB SEARCH ERROR] unexpected error: {type(e).__name__}: {str(e)[:100]}") - return await self._serper_fallback_gather(claim) - - @time_profile(PipelineStep.EVIDENCE_RETRIEVAL) - def gather_sync(self, claim: ExtractedClaim) -> List[Citation]: - """synchronous version - creates new event loop and runs async gather""" - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - return loop.run_until_complete(self.gather(claim)) - finally: - loop.close() - asyncio.set_event_loop(None) - - def _items_to_citations(self, items: list, source: str = "google_web_search") -> List[Citation]: - """convert search result items (google or serper format) to Citation objects.""" - citations: List[Citation] = [] - for item in items: - url = item.get("link", "") - title = item.get("title", "") - snippet = item.get("snippet", "") - display_link = item.get("displayLink", "") - - if not url or not title: - continue - - citation = Citation( - url=url, - title=title, - publisher=display_link if display_link else url.split("/")[2] if len(url.split("/")) > 2 else "unknown", - citation_text=snippet if snippet else title, - source=source, - rating=None, - date=None, - ) - citations.append(citation) - return citations - - async def _serper_fallback_gather(self, claim: ExtractedClaim) -> List[Citation]: - """fallback to serper.dev when google search fails.""" - if not _is_serper_configured(): - logger.warning("serper not configured, returning empty results") - return [] - - try: - print("[WEB SEARCH] trying serper.dev fallback...") - 
query_with_domains = self._build_search_query_with_domains(claim.text) - items = await serper_search( - query=query_with_domains, - num=self.max_results, - timeout=self.timeout, - ) - citations = self._items_to_citations(items, source="serper_web_search_fallback") - print(f"[WEB SEARCH] serper fallback found {len(citations)} citation(s)") - return citations - except Exception as e: - logger.error(f"serper fallback also failed: {e}") - print(f"[WEB SEARCH ERROR] serper fallback failed: {type(e).__name__}: {str(e)[:100]}") - return [] - - def _build_search_query_with_domains(self,original_query:str)->str: - if not self.allowed_domains: - return original_query - - valid_domains = [d.strip() for d in self.allowed_domains if d and d.strip()] - if not valid_domains: - return original_query - - domain_filters = " OR ".join([f"site:{domain}" for domain in valid_domains]) - return f"{original_query} ({domain_filters})" \ No newline at end of file diff --git a/app/ai/log_utils.py b/app/ai/log_utils.py deleted file mode 100644 index d350df3..0000000 --- a/app/ai/log_utils.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Logging utilities for the fact-checking pipeline. - -provides structured logging functions for complex pipeline steps, -particularly for adjudication input and output. -""" - -from typing import TYPE_CHECKING - -from app.observability.logger import get_logger, PipelineStep - -if TYPE_CHECKING: - from app.models import AdjudicationInput, FactCheckResult, LLMConfig - - -def log_adjudication_input( - adjudication_input: "AdjudicationInput", - llm_config: "LLMConfig" -) -> None: - """ - log detailed information about adjudication input. - - logs the structure of the adjudication input including data sources, - enriched claims, citations, and LLM configuration. useful for debugging - and understanding what data is being passed to the adjudication step. 
- - args: - adjudication_input: the input to the adjudication step - llm_config: LLM configuration for adjudication - """ - logger = get_logger(__name__, PipelineStep.ADJUDICATION) - logger.info( - f"adjudication input created successfully: " - f"{len(adjudication_input.sources_with_claims)} data sources" - ) - - # log summary of each data source with claims - for i, ds_with_claims in enumerate(adjudication_input.sources_with_claims, 1): - ds = ds_with_claims.data_source - claims = ds_with_claims.enriched_claims - - logger.info( - f"{i}. data source: {ds.id} ({ds.source_type}) " - f"with {len(claims)} enriched claims" - ) - - # log details of each claim (debug level) - for j, claim in enumerate(claims, 1): - citations_count = len(claim.citations) - text_preview = claim.text[:80] if len(claim.text) > 80 else claim.text - - logger.debug(f" {j}) claim ID: {claim.id}") - logger.debug(f" text: {text_preview}...") - logger.debug(f" citations: {citations_count}") - logger.debug( - f" source: {claim.source.source_type} ({claim.source.source_id})" - ) - - # debug validation logging - logger.debug("adjudication input validation:") - logger.debug( - f" sources_with_claims type: {type(adjudication_input.sources_with_claims)}" - ) - logger.debug( - f" sources_with_claims length: {len(adjudication_input.sources_with_claims)}" - ) - - # detailed debug logging for first source - for i, swc in enumerate(adjudication_input.sources_with_claims): - logger.debug(f"source {i+1} detailed inspection:") - logger.debug(f" data_source.id: {swc.data_source.id}") - logger.debug(f" data_source.source_type: {swc.data_source.source_type}") - logger.debug(f" enriched_claims length: {len(swc.enriched_claims)}") - - if swc.enriched_claims: - first_claim = swc.enriched_claims[0] - logger.debug(f" first claim ID: {first_claim.id}") - logger.debug(f" first claim has 'text' attr: {hasattr(first_claim, 'text')}") - logger.debug( - f" first claim has 'citations' attr: {hasattr(first_claim, 'citations')}" 
- ) - - if hasattr(first_claim, 'text'): - text_preview = first_claim.text[:50] if len(first_claim.text) > 50 else first_claim.text - logger.debug(f" first claim text preview: {text_preview}...") - - if hasattr(first_claim, 'citations'): - logger.debug(f" first claim citations count: {len(first_claim.citations)}") - if first_claim.citations: - first_citation = first_claim.citations[0] - logger.debug(f" first citation type: {type(first_citation)}") - logger.debug( - f" first citation has 'citation_text': " - f"{hasattr(first_citation, 'citation_text')}" - ) - logger.debug( - f" first citation has 'date': " - f"{hasattr(first_citation, 'date')}" - ) - - # log LLM configuration - logger.debug("LLM config:") - - # handle both ChatOpenAI (has model_name) and AzureChatOpenAI (has azure_deployment) - model_identifier = getattr(llm_config.llm, 'model_name', None) or \ - getattr(llm_config.llm, 'azure_deployment', 'unknown') or \ - getattr(llm_config.llm, 'model', 'unknown') - - logger.debug(f" model: {model_identifier}") - - # temperature might be None for o3 models - temperature = getattr(llm_config.llm, 'temperature', 'N/A') - logger.debug(f" temperature: {temperature}") - - # timeout should always be present - timeout = getattr(llm_config.llm, 'timeout', 'N/A') - logger.debug(f" timeout: {timeout}") - - logger.info("=" * 80) - logger.info("adjudication - making final verdicts") - logger.info("=" * 80) - - -def log_adjudication_output(fact_check_result: "FactCheckResult") -> None: - """ - log detailed information about adjudication output. - - logs the results of the adjudication step including verdicts, - justifications, and overall summary. useful for understanding - the final decisions made by the adjudication LLM. 
- - args: - fact_check_result: the output from the adjudication step - """ - logger = get_logger(__name__, PipelineStep.ADJUDICATION) - - logger.info( - f"adjudication completed: " - f"{len(fact_check_result.results)} data source results" - ) - - # log summary of each data source result - for i, ds_result in enumerate(fact_check_result.results, 1): - logger.info( - f"{i}. data source: {ds_result.data_source_id} ({ds_result.source_type}) " - f"with {len(ds_result.claim_verdicts)} verdicts" - ) - - # log details of each verdict (debug level) - for j, verdict in enumerate(ds_result.claim_verdicts, 1): - claim_preview = ( - verdict.claim_text[:60] if len(verdict.claim_text) > 60 - else verdict.claim_text - ) - justification_preview = ( - verdict.justification[:100] if len(verdict.justification) > 100 - else verdict.justification - ) - - logger.debug(f" {j}) claim: {claim_preview}...") - logger.debug(f" verdict: {verdict.verdict}") - logger.debug(f" justification: {justification_preview}...") - - # log overall summary if present - if fact_check_result.overall_summary: - logger.info(f"overall summary: {fact_check_result.overall_summary}") diff --git a/app/ai/main_pipeline.py b/app/ai/main_pipeline.py deleted file mode 100644 index 00cb66a..0000000 --- a/app/ai/main_pipeline.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Main fact-checking pipeline orchestration. - -This module coordinates the full fact-checking flow: -1. Link context expansion - extract and expand URLs from original text -2. Claim extraction - extract claims from all data sources -3. Evidence retrieval - gather supporting/refuting evidence -4. 
Adjudication - make final verdicts - -Architecture: -- Async-first design for efficient IO operations -- Type-safe with Pydantic models throughout -- Stateless functions with explicit dependencies -- Dependency injection for pipeline steps (enables testing and customization) -- Parallel execution using ThreadPoolManager for IO-bound operations -""" - -from typing import List - -from app.models import ( - DataSource, - ClaimExtractionOutput, - PipelineConfig, - EvidenceRetrievalResult, - ClaimExtractionInput, - AdjudicationInput, - DataSourceWithClaims, - EnrichedClaim, - FactCheckResult, - VerdictTypeEnum -) -from app.ai.pipeline.steps import PipelineSteps -from app.ai.threads.thread_utils import ThreadPoolManager -from app.ai.async_code import fire_and_forget_streaming_pipeline -from app.ai.pipeline.claim_extractor import extract_claims -from app.observability.analytics import AnalyticsCollector -from app.observability.logger import get_logger, PipelineStep -from app.ai.log_utils import log_adjudication_input, log_adjudication_output -from .utils import _chose_fact_checking_result - - -def build_adjudication_input( - claim_outputs: List[ClaimExtractionOutput], - evidence_result: EvidenceRetrievalResult, -) -> AdjudicationInput: - """ - build adjudication input by grouping enriched claims with their original data sources. - - this function reconstructs the data lineage by: - 1. taking each data source from claim extraction outputs - 2. finding all enriched claims that were extracted from that source - 3. grouping them together into DataSourceWithClaims objects - - args: - claim_outputs: list of claim extraction outputs (each has a data source + extracted claims) - evidence_result: evidence retrieval result mapping claim IDs to enriched claims - - returns: - AdjudicationInput ready for the adjudication step - - example: - >>> claim_outputs = [...] 
# from claim extraction step - >>> evidence_result = EvidenceRetrievalResult(claim_evidence_map={...}) - >>> adj_input = build_adjudication_input(claim_outputs, evidence_result) - >>> print(len(adj_input.sources_with_claims)) - 2 # number of data sources - """ - sources_with_claims: List[DataSourceWithClaims] = [] - - for output in claim_outputs: - data_source = output.data_source - enriched_claims_for_this_source: List[EnrichedClaim] = [] - - # for each claim extracted from this data source - for extracted_claim in output.claims: - claim_id = extracted_claim.id - - # find the corresponding enriched claim in evidence result - if claim_id in evidence_result.claim_evidence_map: - enriched_claim = evidence_result.claim_evidence_map[claim_id] - enriched_claims_for_this_source.append(enriched_claim) - - # create DataSourceWithClaims object - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=enriched_claims_for_this_source - ) - sources_with_claims.append(source_with_claims) - - return AdjudicationInput( - sources_with_claims=sources_with_claims, - additional_context=None - ) - -async def run_fact_check_pipeline( - data_sources: List[DataSource], - config: PipelineConfig, - steps: PipelineSteps, - analytics: AnalyticsCollector, - message_id: str -) -> FactCheckResult: - """ - run the complete fact-checking pipeline on a list of data sources. - - pipeline steps: - 1. identify original_text sources and extract links - 2. expand links to create new link_context data sources - 3. extract claims from all data sources (original + expanded) - 4. gather evidence for each claim (citations, fact-check APIs, web search) - 5. adjudicate claims and make final verdicts based on evidence - - args: - data_sources: list of data sources to fact-check - config: pipeline configuration with timeout and LLM settings (required) - steps: pipeline steps implementation. If None, uses DefaultPipelineSteps. 
- analytics: analytics collector for tracking metrics - message_id: unique identifier for this request, used for pipeline isolation - - returns: - FactCheckResult with final verdicts for all claims, grouped by data source - - example: - >>> from app.models import DataSource - >>> from app.config.default import get_default_pipeline_config - >>> from app.ai.pipeline.steps import DefaultPipelineSteps - >>> sources = [ - ... DataSource( - ... id="msg-001", - ... source_type="original_text", - ... original_text="Check this claim about vaccines" - ... ) - ... ] - >>> config = get_default_pipeline_config() - >>> steps = DefaultPipelineSteps() - >>> result = await run_fact_check_pipeline(sources, config, steps, analytics, "msg-001") - >>> print(result.results[0].claim_verdicts[0].verdict) - "Falso" - """ - - # get logger for main pipeline orchestration - pipeline_logger = get_logger(__name__, PipelineStep.SYSTEM) - pipeline_logger.info(f"[{message_id}] pipeline isolation enabled with pipeline_id={message_id}") - - # initialize thread pool manager - manager = ThreadPoolManager.get_instance(max_workers=25) - manager.initialize() - - try: - # step 1 & 2 & 3: fire-and-forget claim extraction + link expansion + evidence gathering - - # create wrapper function that binds the config for regular claim extraction - def extract_claims_with_config( - extraction_input: ClaimExtractionInput - ) -> ClaimExtractionOutput: - """calls extract_claims with bound config""" - return extract_claims( - extraction_input=extraction_input, - llm_config=config.claim_extraction_llm_config - ) - - # create wrapper function for link expansion that binds the config - def expand_links_with_config( - sources: List[DataSource] - ) -> List[DataSource]: - """calls steps.expand_links_from_sources with bound config""" - pipeline_logger.info(f"expand_links_with_config wrapper called with {len(sources)} sources") - result = steps.expand_links_from_sources(sources, config) - 
pipeline_logger.info(f"expand_links_with_config completed: {len(result) if result else 0} sources expanded") - return result - - # get evidence gatherers from pipeline steps - evidence_gatherers = steps.get_evidence_gatherers() - pipeline_logger.info( - f"using {len(evidence_gatherers)} evidence gatherers: " - f"{', '.join(g.source_name for g in evidence_gatherers)}" - ) - - # run fire-and-forget streaming pipeline (sync call) - # enable adjudication_with_search to fire in parallel with normal evidence gathering - claim_outputs, enriched_claims = fire_and_forget_streaming_pipeline( - data_sources, - extract_claims_with_config, - evidence_gatherers, - analytics, - link_expansion_fn=expand_links_with_config, - manager=manager, - pipeline_steps=steps, - enable_adjudication_with_search=True, - pipeline_id=message_id, - ) - - if not any(claim_out.has_valid_claims() for claim_out in claim_outputs): - # no valid claims found, use fallback - no_claims_fallaback = await steps.handle_no_claims_fallback(data_sources,config) # TODO: Pretty sure the data sources here DO NOT include the expanded links so fix that later - return FactCheckResult( - results= [], - sources_with_claims = [], - overall_summary=no_claims_fallaback.explanation - ) - - # build final evidence retrieval result - result = EvidenceRetrievalResult(claim_evidence_map=enriched_claims) - analytics.populate_claims_from_evidence(result) - - # build adjudication input by grouping enriched claims with data sources - adjudication_input = build_adjudication_input(claim_outputs, result) - - # log adjudication input details - log_adjudication_input(adjudication_input, config.adjudication_llm_config) - - # adjudicate claims - adjudication_logger = get_logger(__name__, PipelineStep.ADJUDICATION) - adjudication_logger.debug("calling steps.adjudicate_claims...") - try: - fact_check_result = steps.adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=config.adjudication_llm_config - ) - 
adjudication_logger.debug("steps.adjudicate_claims completed successfully") - except Exception as e: - adjudication_logger.error(f"exception in adjudicate_claims: {type(e).__name__}") - adjudication_logger.error(f"error message: {str(e)}") - adjudication_logger.error("traceback:", exc_info=True) - - # create empty result to allow fallback to adjudication_with_search - adjudication_logger.warning( - "normal adjudication failed - creating empty result to trigger adjudication_with_search fallback" - ) - fact_check_result = FactCheckResult( - results=[], - overall_summary="", - sources_with_claims=adjudication_input.sources_with_claims - ) - - # log adjudication output (only if we have results) - if fact_check_result.results: - log_adjudication_output(fact_check_result) - - # choose final result: use adjudication_with_search fallback if normal adjudication failed/insufficient - fact_check_result = _chose_fact_checking_result(fact_check_result, manager, message_id) - - # summary with prefix - pipeline_logger.set_prefix("[SUMMARY]") - pipeline_logger.info(f"{'=' * 80}") - - total_claims = sum(len(output.claims) for output in claim_outputs) - total_verdicts = sum(len(r.claim_verdicts) for r in fact_check_result.results) - - pipeline_logger.info(f"total claim extraction outputs: {len(claim_outputs)}") - pipeline_logger.info(f"total claims extracted: {total_claims}") - pipeline_logger.info(f"total enriched claims: {len(enriched_claims)}") - pipeline_logger.info( - f"evidence gathering results: {len(result.claim_evidence_map)} claims with evidence" - ) - pipeline_logger.info(f"final verdicts: {total_verdicts} verdicts") - pipeline_logger.clear_prefix() - - analytics.populate_from_adjudication(fact_check_result) - return fact_check_result - - except Exception as e: - # log pipeline failure - pipeline_logger.error(f"pipeline failed: {type(e).__name__}: {str(e)}") - pipeline_logger.error("full traceback:", exc_info=True) - raise - finally: - # cleanup completed jobs for this 
pipeline (non-blocking background cleanup) - pipeline_logger.info(f"[{message_id}] starting background cleanup of completed jobs") - manager.clear_completed_jobs_async(pipeline_id=message_id) \ No newline at end of file diff --git a/app/ai/pipeline/README.md b/app/ai/pipeline/README.md deleted file mode 100644 index 75d1d37..0000000 --- a/app/ai/pipeline/README.md +++ /dev/null @@ -1,276 +0,0 @@ -# Claim Extraction Pipeline Step - -This module implements the **Claim Extraction** step of the fact-checking pipeline using LangChain best practices. - -## Overview - -The claim extractor takes a user message (optionally enriched with context from linked articles) and extracts fact-checkable claims using an LLM with structured outputs. - -## Architecture - -``` -Input (ExpandedUserInput) - ↓ -Format Context - ↓ -LCEL Chain: Prompt | Model | Structured Output - ↓ -Post-Process (UUID generation, validation) - ↓ -Output (List[ExtractedClaim]) -``` - -## LangChain Best Practices Applied - -### ✅ 1. LCEL Composition -The chain uses declarative LCEL syntax for clean, composable pipelines: - -```python -chain = prompt | model.with_structured_output(schema) -``` - -This automatically provides: -- `.invoke()` for sync execution -- `.ainvoke()` for async execution -- `.stream()` and `.batch()` capabilities -- Retry and timeout handling - -### ✅ 2. Structured Outputs -Uses Pydantic models with `with_structured_output()` for type-safe LLM responses: - -```python -class ClaimExtractionOutput(BaseModel): - claims: List[ExtractedClaim] - -structured_model = model.with_structured_output(ClaimExtractionOutput) -``` - -This ensures: -- Validated JSON output -- Type safety throughout the pipeline -- Clear schema for the LLM - -### ✅ 3. Stateless Design -All functions accept explicit parameters - no global state: - -```python -def extract_claims( - expanded_context_by_source: dict[str, str], - user_message: str, - common_data: CommonPipelineData, - ... 
-) -> List[ExtractedClaim]: -``` - -### ✅ 4. Type Annotations -Every function, parameter, and return value is typed: - -```python -def format_expanded_context( - expanded_context_by_source: dict[str, str] -) -> str: -``` - -### ✅ 5. Async Support -Both sync and async versions provided for all operations: - -```python -claims = extract_claims(...) # Sync -claims = await extract_claims_async(...) # Async -``` - -### ✅ 6. ChatPromptTemplate -Consistent message handling using LangChain's prompt templates: - -```python -prompt = ChatPromptTemplate.from_messages([ - ("system", SYSTEM_PROMPT), - ("user", USER_PROMPT) -]) -``` - -### ✅ 7. Separation of Concerns -Clean module structure: -- `prompts.py` - Prompt templates -- `claim_extractor.py` - Chain logic and extraction -- `example_claim_extraction.py` - Usage examples - -### ✅ 8. Configuration Over Hardcoding -Model parameters are configurable: - -```python -def build_claim_extraction_chain( - model_name: str = "gpt-4o-mini", - temperature: float = 0.0, - timeout: Optional[float] = None -) -> Runnable: -``` - -## Usage - -### Basic Usage - -```python -from app.ai.pipeline.claim_extractor import extract_and_validate_claims -from app.models.commondata import CommonPipelineData - -# Prepare input -expanded_context = { - "https://example.com": "Article text here..." 
-} -user_message = "I heard vaccine X causes infertility" -common_data = CommonPipelineData( - message_id="msg-123", - message_text=user_message -) - -# Extract claims -claims = extract_and_validate_claims( - expanded_context_by_source=expanded_context, - user_message=user_message, - common_data=common_data -) - -# Use the extracted claims -for claim in claims: - print(f"Claim: {claim.text}") - print(f"Entities: {claim.entities}") -``` - -### Async Usage - -```python -import asyncio - -async def process_message(): - claims = await extract_claims_async( - expanded_context_by_source=expanded_context, - user_message=user_message, - common_data=common_data - ) - return claims - -claims = asyncio.run(process_message()) -``` - -### Custom Model Configuration - -```python -# Use a different model -claims = extract_claims( - ..., - model_name="gpt-4o", # More powerful model - temperature=0.1, # Slightly more creative - timeout=60.0 # Longer timeout -) -``` - -## Input Format - -The claim extractor expects input in this format: - -``` -=== Context from https://example.com/article1 === -Title: Article Title -Content: Article content here... - -=== Context from https://example.com/article2 === -Title: Another Article -Content: More content... - -====Original User Message Below==== -User's original message text here -``` - -This is automatically formatted by the `format_expanded_context()` function. - -## Output Format - -Returns a list of `ExtractedClaim` objects: - -```python -ExtractedClaim( - id="msg-123-claim-uuid-...", - text="Vaccine X causes infertility in women", - links=["https://example.com/article1"], - llm_comment="This is a medical claim that requires scientific verification", - entities=["Vaccine X", "infertility", "women"] -) -``` - -## Data Flow - -1. **Input**: User message + expanded context from links (from previous pipeline step) -2. **Format**: Context is formatted into a structured prompt -3. **Extract**: LLM extracts claims with structured output -4. 
**Post-process**: - - Generate unique IDs (prefixed with message_id) - - Validate claims (remove empty, duplicates, too short) -5. **Output**: List of validated `ExtractedClaim` objects - -## Error Handling - -The module handles common errors: - -- **Timeout**: Set `timeout` parameter for long-running requests -- **Invalid API key**: Raises clear error if `OPENAI_API_KEY` not set -- **Empty claims**: `validate_claims()` filters out empty results -- **Duplicate claims**: Validation removes duplicates - -## Testing - -Run the example file to test: - -```bash -python -m app.ai.pipeline.example_claim_extraction -``` - -Make sure `OPENAI_API_KEY` is set in your environment. - -## Configuration - -Model configuration can be set via: - -1. **Function parameters** (preferred for runtime config): - ```python - extract_claims(..., model_name="gpt-4o", timeout=30.0) - ``` - -2. **Environment variables** (for API keys): - ```bash - export OPENAI_API_KEY="sk-..." - ``` - -## Performance Considerations - -- **Model choice**: `gpt-4o-mini` is faster and cheaper, good for most cases -- **Temperature**: `0.0` for deterministic extraction (recommended) -- **Timeout**: Default 30s, increase for complex messages -- **Async**: Use `extract_claims_async()` for concurrent processing - -## Integration with Pipeline - -This step fits into the larger pipeline: - -``` -1. User Input -2. Context Expansion ← enriches links -3. **Claim Extraction** ← YOU ARE HERE -4. Evidence Gathering ← uses extracted claims -5. 
Final Adjudication -``` - -Input from previous step: `ExpandedUserInput` -Output to next step: `List[ExtractedClaim]` - -## Next Steps - -After claim extraction, the pipeline proceeds to: -- **Evidence Gathering**: Search for sources supporting/refuting each claim -- **Adjudication**: Final verdict based on evidence - -## References - -- [LangChain LCEL Documentation](https://python.langchain.com/docs/expression_language/) -- [Structured Outputs Guide](https://python.langchain.com/docs/how_to/structured_output/) -- [ChatPromptTemplate](https://python.langchain.com/docs/modules/model_io/prompts/quick_start/) diff --git a/app/ai/pipeline/__init__.py b/app/ai/pipeline/__init__.py deleted file mode 100644 index 4716d5d..0000000 --- a/app/ai/pipeline/__init__.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Fact-Checking Pipeline Module - -This module contains the individual steps of the fact-checking pipeline: -- Claim Extraction: Extract verifiable claims from user messages -- Evidence Gathering: Search for supporting/refuting evidence -- Final Adjudication: Produce verdict based on claims and evidence - -Each step follows LangChain best practices with LCEL chains, structured outputs, -and stateless design. -""" - -# Note: Main pipeline orchestration functions are in app.ai.pipeline module (file) -# not in this directory. This directory contains step-by-step implementations. 
- -from .claim_extractor import ( - extract_claims, - extract_claims_async, - extract_and_validate_claims, - validate_claims, - build_claim_extraction_chain, -) - -from .link_context_expander import ( - expand_link_contexts, - expand_link_context, - extract_links, -) - -from .judgement import ( - adjudicate_claims, - adjudicate_claims_async, - build_adjudication_chain, - format_adjudication_input, -) - -from .adjudication_with_search import ( - adjudicate_claims_with_search, - adjudicate_claims_with_search_async, - get_claims_from_sources, -) - -from .prompts import ( - get_claim_extraction_prompt, - CLAIM_EXTRACTION_SYSTEM_PROMPT, - CLAIM_EXTRACTION_USER_PROMPT, - get_adjudication_prompt, - ADJUDICATION_SYSTEM_PROMPT, - ADJUDICATION_USER_PROMPT, - ADJUDICATION_WITH_SEARCH_SYSTEM_PROMPT, -) - -# Re-export data models from app.models for convenience -from app.models import ClaimExtractionOutput, FactCheckResult - -__all__ = [ - # Claim extraction functions - "extract_claims", - "extract_claims_async", - "extract_and_validate_claims", - "validate_claims", - "build_claim_extraction_chain", - - # Link context expansion functions - "expand_link_contexts", - "expand_link_context", - "extract_links", - - # Adjudication functions - "adjudicate_claims", - "adjudicate_claims_async", - "build_adjudication_chain", - "format_adjudication_input", - - # Adjudication with Google Search functions - "adjudicate_claims_with_search", - "adjudicate_claims_with_search_async", - "get_claims_from_sources", - - # Data models - "ClaimExtractionOutput", - "FactCheckResult", - - # Claim extraction prompts - "get_claim_extraction_prompt", - "CLAIM_EXTRACTION_SYSTEM_PROMPT", - "CLAIM_EXTRACTION_USER_PROMPT", - - # Adjudication prompts - "get_adjudication_prompt", - "ADJUDICATION_SYSTEM_PROMPT", - "ADJUDICATION_USER_PROMPT", - - # Adjudication with search prompts - "ADJUDICATION_WITH_SEARCH_SYSTEM_PROMPT", -] diff --git a/app/ai/pipeline/adjudication_with_search.py 
b/app/ai/pipeline/adjudication_with_search.py deleted file mode 100644 index 46f1c0f..0000000 --- a/app/ai/pipeline/adjudication_with_search.py +++ /dev/null @@ -1,406 +0,0 @@ -""" -This is a fallback/experimental step of the pipeline and it defines a mix of the Evidence Retrieval + Adjundication step -Adjudication with Web Search - Pipeline Step. - -This module provides an alternative adjudication step that uses OpenAI's Responses API -with web search to fact-check claims in real-time. - -Key differences from standard adjudication: -- Uses OpenAI web search instead of pre-gathered citations -- Combines evidence gathering and adjudication in one step -- Single API call with structured output - -Architecture: -- Uses OpenAI SDK with Responses API -- Single API call with web search tool that returns structured Pydantic output -- Type-safe with Pydantic models throughout -""" - -import os -import re -import json -from typing import List -from openai import OpenAI -from pydantic import BaseModel, Field, ValidationError - -from app.models import ( - ExtractedClaim, - FactCheckResult, - LLMAdjudicationOutput, - DataSource, - DataSourceWithExtractedClaims, -) - -from .prompts import ADJUDICATION_WITH_SEARCH_SYSTEM_PROMPT -from .utils import get_current_date, convert_llm_output_to_data_source_results - -# ===== JSON REPAIR UTILITIES ===== - -def _repair_json_urls(json_text: str) -> str: - """ - repair common JSON formatting issues in URLs. - - fixes: - - unescaped backslashes in URLs - - unescaped quotes in URLs - - newlines and tabs in string values - - malformed URL strings - - args: - json_text: raw JSON string that may contain malformed URLs - - returns: - repaired JSON string - """ - # ensure text is properly encoded as UTF-8 - if isinstance(json_text, bytes): - json_text = json_text.decode('utf-8', errors='replace') - - # fix unescaped newlines and tabs in strings - # use regex to avoid corrupting UTF-8 multi-byte sequences - json_text = re.sub(r'(? 
LLMAdjudicationOutput: - """ - parse JSON response with fallback repair logic. - - tries to parse the response as-is first, then applies JSON repair - if initial parsing fails. - - args: - raw_response: raw JSON string from the API - - returns: - parsed LLMAdjudicationOutput - - raises: - ValueError: if parsing fails even after repair attempts - """ - # ensure raw_response is a UTF-8 string - if isinstance(raw_response, bytes): - raw_response = raw_response.decode('utf-8', errors='replace') - - # try parsing as-is first - try: - parsed_data = json.loads(raw_response, strict=False) - return LLMAdjudicationOutput(**parsed_data) - except (json.JSONDecodeError, ValidationError) as e: - print(f"[DEBUG] Initial JSON parsing failed: {type(e).__name__}: {str(e)}") - print("[DEBUG] Attempting JSON repair...") - - # apply repairs - repaired_json = _repair_json_urls(raw_response) - - try: - parsed_data = json.loads(repaired_json, strict=False) - result = LLMAdjudicationOutput(**parsed_data) - print("[DEBUG] JSON repair successful!") - return result - except (json.JSONDecodeError, ValidationError) as repair_error: - print(f"[ERROR] JSON repair failed: {type(repair_error).__name__}: {str(repair_error)}") - print(f"[DEBUG] Original JSON (first 500 chars): {raw_response[:500]}") - print(f"[DEBUG] Repaired JSON (first 500 chars): {repaired_json[:500]}") - raise ValueError( - f"Failed to parse JSON response even after repair. " - f"Original error: {str(e)}. " - f"Repair error: {str(repair_error)}" - ) from repair_error - - -# ===== CLIENT INITIALIZATION ===== - -def _get_openai_client() -> OpenAI: - """ - get or create OpenAI client. 
- - returns: - configured OpenAI instance - """ - api_key = os.getenv("OPENAI_API_KEY") - if not api_key: - raise ValueError("OPENAI_API_KEY environment variable not set") - - return OpenAI(api_key=api_key) - - -# ===== PROMPTS ===== - -def _build_adjudication_prompt(sources_with_claims: List[DataSourceWithExtractedClaims], current_date: str) -> str: - """ - build the prompt for adjudication with search. - - args: - sources_with_claims: list of data sources with their extracted claims - current_date: current date in DD-MM-YYYY format - - returns: - formatted prompt string - """ - prompt_parts = [] - prompt_parts.append("\n\n## Alegações para Verificar:\n") - - # group claims by data source - for source_idx, source_with_claims in enumerate(sources_with_claims, 1): - data_source_id = source_with_claims.data_source.id - claims = source_with_claims.extracted_claims - - prompt_parts.append(f"\n### Fonte de Dados {source_idx}: {data_source_id}\n") - - for claim_idx, claim in enumerate(claims, 1): - prompt_parts.append(f"\n**Alegação {claim_idx}**:") - prompt_parts.append(f"- ID: {claim.id}") - prompt_parts.append(f"- Texto: {claim.text}") - prompt_parts.append("") - - prompt_parts.append("\nPor favor, verifique cada alegação usando a busca do Google e forneça vereditos fundamentados.") - prompt_parts.append("\nIMPORTANTE: Agrupe os vereditos por fonte de dados. Para cada fonte, retorne UM resultado com todos os vereditos daquela fonte.") - - return "\n".join(prompt_parts) - - -# ===== MAIN ADJUDICATION FUNCTION ===== - -def adjudicate_claims_with_search( - sources_with_claims: List[DataSourceWithExtractedClaims], - model: str = "gpt-5-nano" -) -> FactCheckResult: - """ - adjudicate claims using OpenAI web search in a single API call. - - this is the main entry point for adjudication with search. - - the function uses OpenAI Responses API with web search tool to find evidence - and returns structured Pydantic output in one call. 
- - args: - sources_with_claims: list of DataSourceWithExtractedClaims to fact-check - model: OpenAI model to use (default: gpt-4o-mini) - - returns: - FactCheckResult with verdicts for all claims - - raises: - ValueError: if OPENAI_API_KEY is not set - Exception: if API calls fail - - example: - >>> from app.models import ExtractedClaim, ClaimSource, DataSource, DataSourceWithExtractedClaims - >>> claim = ExtractedClaim( - ... id="claim-1", - ... text="A vacina X causa infertilidade", - ... source=ClaimSource(source_type="original_text", source_id="msg-1") - ... ) - >>> data_source = DataSource(id="msg-1", source_type="original_text", original_text="test") - >>> source_with_claims = DataSourceWithExtractedClaims(data_source=data_source, extracted_claims=[claim]) - >>> result = adjudicate_claims_with_search([source_with_claims]) - >>> print(result.results[0].claim_verdicts[0].verdict) - """ - print("\n" + "="*80) - print("[DEBUG] Starting adjudicate_claims_with_search (OpenAI)") - print("="*80) - - # Input validation and logging - print(f"[DEBUG] Input: {len(sources_with_claims)} sources with claims") - for idx, source in enumerate(sources_with_claims, 1): - print(f" Source {idx}: {source.data_source.id} with {len(source.extracted_claims)} claims") - - client = _get_openai_client() - print("[DEBUG] OpenAI client initialized successfully") - - # Get current date - current_date = get_current_date() - print(f"[DEBUG] Current date: {current_date}") - - # Count total claims for logging - total_claims = sum(len(s.extracted_claims) for s in sources_with_claims) - print(f"[DEBUG] Total claims to adjudicate: {total_claims}") - - for source in sources_with_claims: - print(f" Source {source.data_source.id}:") - for claim in source.extracted_claims: - print(f" - [{claim.id}] {claim.text[:80]}...") - - # Build the user message with claims grouped by data source - user_message = _build_adjudication_prompt(sources_with_claims, current_date) - print(f"[DEBUG] Prompt built, length: 
{len(user_message)} characters") - print(f"[DEBUG] Prompt preview (first 200 chars):\n{user_message[:200]}...") - - # Prepare messages for OpenAI - messages = [ - { - "role": "system", - "content": ADJUDICATION_WITH_SEARCH_SYSTEM_PROMPT.format(current_date=current_date) - }, - { - "role": "user", - "content": user_message - } - ] - - # Make single API call with web search and structured output - print(f"[DEBUG] Calling OpenAI Responses API with model: {model}") - - # Force UTF-8 encoding in messages - for message in messages: - if isinstance(message.get("content"), str): - message["content"] = message["content"].encode('utf-8').decode('utf-8') - - llm_output = None - try: - response = client.responses.parse( - model=model, - input=messages, - tools=[{"type": "web_search"}], - text_format=LLMAdjudicationOutput, - ) - print("[DEBUG] API call successful") - - # Debug: check encoding of response - for res in response.output_parsed.results: - for v in res.claim_verdicts: - print("[RAW repr claim_text]:", repr(v.claim_text)) - print("[RAW repr justification]:", repr(v.justification)) - - # Force re-encode to UTF-8 if needed - if v.claim_text: - v.claim_text = v.claim_text.encode('utf-8', errors='ignore').decode('utf-8') - if v.justification: - v.justification = v.justification.encode('utf-8', errors='ignore').decode('utf-8') - - # Fix encoding in overall_summary too - if response.output_parsed and response.output_parsed.overall_summary: - response.output_parsed.overall_summary = ( - response.output_parsed.overall_summary - .encode('utf-8', errors='ignore') - .decode('utf-8') - ) - - if not hasattr(response, 'output_parsed') or response.output_parsed is None: - # fallback: try to get raw output and parse manually - if hasattr(response, 'output') and response.output: - print("[DEBUG] output_parsed is None, attempting manual parsing from raw output") - llm_output = _parse_with_fallback(response.output) - else: - print("[ERROR] response.output_parsed is None and no raw 
output available!") - print(f"[DEBUG] Response object: {response}") - raise ValueError("API response output_parsed is None - no content returned") - else: - llm_output = response.output_parsed - - except (json.JSONDecodeError, ValidationError) as parse_error: - # catch JSON parsing errors and attempt repair - print(f"[ERROR] JSON parsing failed: {type(parse_error).__name__}: {str(parse_error)}") - - # try to get raw response text - if 'response' in locals() and hasattr(response, 'output') and response.output: - print("[DEBUG] Attempting to repair malformed JSON from raw output") - llm_output = _parse_with_fallback(response.output) - else: - print("[ERROR] Cannot access raw response for repair") - raise ValueError( - f"Failed to parse structured output and cannot access raw response for repair. " - f"Error: {str(parse_error)}" - ) from parse_error - - except Exception as e: - print(f"[ERROR] API call failed: {type(e).__name__}: {str(e)}") - raise - - if llm_output is None: - raise ValueError("Failed to obtain parsed output from API response") - print(f"[DEBUG] Successfully parsed {len(llm_output.results)} result(s)") - - # Print detailed verdict information - for idx, result in enumerate(llm_output.results, 1): - print(f"\n[DEBUG] Result {idx}:") - print(f" - data_source_id: {result.data_source_id}") - print(f" - Number of verdicts: {len(result.claim_verdicts)}") - for v_idx, verdict in enumerate(result.claim_verdicts, 1): - print(f" - Verdict {v_idx}:") - print(f" claim_id: {verdict.claim_id}") - print(f" claim_text: {verdict.claim_text[:60]}...") - print(f" verdict: {verdict.verdict}") - print(f" justification: {verdict.justification[:100]}...") - print(f" citations used : {verdict.citations_used}") - - # Convert LLM output to DataSourceResult using utils - print("\n[DEBUG] Converting to DataSourceResult...") - data_source_results = convert_llm_output_to_data_source_results( - llm_results=llm_output.results, - sources_with_claims=sources_with_claims - ) - - 
print(f"[DEBUG] Created {len(data_source_results)} DataSourceResult(s)") - for idx, ds_result in enumerate(data_source_results, 1): - print(f" Result {idx}: {ds_result.data_source_id} - {len(ds_result.claim_verdicts)} verdict(s)") - - # Build final FactCheckResult - return FactCheckResult( - results=data_source_results, - overall_summary=llm_output.overall_summary if llm_output.overall_summary else "", - sources_with_claims=sources_with_claims - ) - - -async def adjudicate_claims_with_search_async( - sources_with_claims: List[DataSourceWithExtractedClaims], - model: str = "gpt-4o-mini" -) -> FactCheckResult: - """ - async version of adjudicate_claims_with_search. - - note: uses sync calls in an async wrapper. For true async, consider - using asyncio.to_thread or similar. - - args: - sources_with_claims: list of DataSourceWithExtractedClaims to fact-check - model: OpenAI model to use - - returns: - FactCheckResult with verdicts for all claims - """ - # For now, just call the sync version - # In production, consider using asyncio.to_thread for true async - return adjudicate_claims_with_search(sources_with_claims, model) - - -# ===== HELPER FUNCTIONS ===== - -def get_claims_from_sources(sources_with_claims: list) -> List[ExtractedClaim]: - """ - extract all claims from DataSourceWithClaims objects. - - args: - sources_with_claims: list of DataSourceWithClaims objects - - returns: - flat list of all ExtractedClaim objects - """ - all_claims = [] - for source_with_claims in sources_with_claims: - for enriched_claim in source_with_claims.enriched_claims: - # EnrichedClaim extends ExtractedClaim, so we can use it directly - all_claims.append(enriched_claim) - - return all_claims diff --git a/app/ai/pipeline/claim_extractor.py b/app/ai/pipeline/claim_extractor.py deleted file mode 100644 index d18f140..0000000 --- a/app/ai/pipeline/claim_extractor.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -Claim Extraction Step for the Fact-Checking Pipeline. 
- -Follows LangChain best practices: -- LCEL composition for declarative chains -- Structured outputs with Pydantic -- Stateless design with explicit state passing -- Type annotations throughout -- Support for both sync and async operations - -Architecture: -- Receives a ClaimExtractionInput wrapping a DataSource -- Extracts all fact-checkable claims from the text -- Returns claims with proper source tracking -- Source-agnostic: works for any text (user message, link, image OCR, etc.) -""" - -from typing import List, Optional -import uuid - -from pydantic import BaseModel, Field -from langchain_core.runnables import Runnable - -from app.models import ( - ClaimExtractionInput, - ExtractedClaim, - ClaimExtractionOutput, - ClaimSource, - LLMConfig, -) -from .prompts import get_claim_extraction_prompt_for_source_type - - -# ===== INTERNAL LLM SCHEMAS ===== -# These are what the LLM returns - simple claim data without ID or source - -class _LLMExtractedClaim(BaseModel): - """Internal schema for what the LLM returns - just the claim content.""" - text: str = Field(..., description="The normalized claim text") - entities: List[str] = Field(default_factory=list, description="Named entities in the claim") - llm_comment: Optional[str] = Field(None, description="LLM's analysis of why this is fact-checkable") - - -class _LLMClaimOutput(BaseModel): - """Internal schema for LLM output - wrapper containing list of claims.""" - claims: List[_LLMExtractedClaim] = Field( - default_factory=list, - description="List of extracted claims" - ) - - -# ===== CHAIN CONSTRUCTION ===== - -def build_claim_extraction_chain( - llm_config: LLMConfig, - source_type: str -) -> Runnable: - """ - builds the LCEL chain for claim extraction. - - the chain follows this structure: - prompt | model.with_structured_output() -> ClaimExtractionOutput - - args: - llm_config: LLM configuration with BaseChatOpenAI instance. - source_type: type of data source to select appropriate prompt. 
- - returns: - a Runnable chain that takes dict input and returns ClaimExtractionOutput - - best practices applied: - - source-type-specific prompts for optimized extraction - - structured output binding for type safety - - low temperature for consistent extractions - - stateless design - no global state - """ - # get the appropriate prompt template for this source type - prompt = get_claim_extraction_prompt_for_source_type(source_type) - - # use the llm from config directly - model = llm_config.llm - - # bind the structured output schema to enforce JSON format - # use internal schema - LLM only returns claim content, not ID or source - structured_model = model.with_structured_output( - _LLMClaimOutput, - method="json_mode" # use JSON mode for reliable parsing - ) - - # compose the chain using LCEL - chain = prompt | structured_model - - return chain - - -# ===== MAIN EXTRACTION FUNCTIONS ===== - -def extract_claims( - extraction_input: ClaimExtractionInput, - llm_config: LLMConfig -) -> ClaimExtractionOutput: - """ - Extracts fact-checkable claims from a text chunk. - - This is the main synchronous entry point for claim extraction. - Source-agnostic: works for user messages, link content, OCR text, transcripts, etc. - - Args: - extraction_input: Input wrapping a DataSource with id, source_type, original_text, and metadata - llm_config: LLM configuration (model name, temperature, timeout). - - Returns: - ClaimExtractionOutput containing list of ExtractedClaim objects with unique IDs and source tracking - - Example: - >>> from app.models import ClaimExtractionInput, DataSource, LLMConfig - >>> data_source = DataSource( - ... id="msg-123", - ... source_type="original_text", - ... original_text="I heard vaccine X causes infertility in women." - ... 
) - >>> input_data = ClaimExtractionInput(data_source=data_source) - >>> config = LLMConfig(model_name="gpt-4o-mini", temperature=0.0) - >>> result = extract_claims(input_data, llm_config=config) - >>> print(len(result.claims)) - 1 - >>> print(result.claims[0].text) - "Vaccine X causes infertility in women" - >>> print(result.claims[0].source.source_type) - "original_text" - """ - # Build the chain with source-type-specific prompt - source_type = extraction_input.data_source.source_type - chain = build_claim_extraction_chain( - llm_config=llm_config, - source_type=source_type - ) - - # Prepare input for the prompt template - chain_input = { - "text": extraction_input.data_source.original_text - } - - # Invoke the chain - gets LLM output (just claim content) - result: _LLMClaimOutput = chain.invoke(chain_input) - - # Convert LLM output to full ExtractedClaim objects with ID and source - claims: List[ExtractedClaim] = [] - for llm_claim in result.claims: - # Generate unique ID with source prefix - claim_id = f"{uuid.uuid4()}" - - # Build the ClaimSource object - source = ClaimSource( - source_type=extraction_input.data_source.source_type, - source_id=extraction_input.data_source.id - ) - - # Create the full ExtractedClaim with all fields - claim = ExtractedClaim( - id=claim_id, - text=llm_claim.text, - source=source, - llm_comment=llm_claim.llm_comment, - entities=llm_claim.entities - ) - claims.append(claim) - - return ClaimExtractionOutput( - data_source=extraction_input.data_source, - claims=claims - ) - - -async def extract_claims_async( - extraction_input: ClaimExtractionInput, - llm_config: LLMConfig -) -> ClaimExtractionOutput: - """ - Async version of extract_claims. - - Follows LangChain best practice: provide async methods for IO-bound operations. - - Args: - extraction_input: Input wrapping a DataSource with id, source_type, original_text, and metadata - llm_config: LLM configuration (model name, temperature, timeout). 
- - Returns: - ClaimExtractionOutput containing list of ExtractedClaim objects with unique IDs and source tracking - """ - # Build the chain with source-type-specific prompt - source_type = extraction_input.data_source.source_type - chain = build_claim_extraction_chain( - llm_config=llm_config, - source_type=source_type - ) - - # Prepare input for the prompt template - chain_input = { - "text": extraction_input.data_source.original_text - } - - # Invoke the chain asynchronously - gets LLM output (just claim content) - result: _LLMClaimOutput = await chain.ainvoke(chain_input) - - # Convert LLM output to full ExtractedClaim objects with ID and source - claims: List[ExtractedClaim] = [] - for llm_claim in result.claims: - # Generate unique ID - claim_id = f"{uuid.uuid4()}" - - # Build the ClaimSource object - source = ClaimSource( - source_type=extraction_input.data_source.source_type, - source_id=extraction_input.data_source.id - ) - - # Create the full ExtractedClaim with all fields - claim = ExtractedClaim( - id=claim_id, - text=llm_claim.text, - source=source, - llm_comment=llm_claim.llm_comment, - entities=llm_claim.entities - ) - claims.append(claim) - - return ClaimExtractionOutput( - data_source=extraction_input.data_source, - claims=claims - ) - - -# ===== HELPER FUNCTIONS ===== - -def validate_claims(claims: List[ExtractedClaim]) -> List[ExtractedClaim]: - """ - Validates and filters extracted claims. 
- - Filters out: - - Claims with empty text - - Duplicate claims (same text) - - Args: - claims: List of extracted claims - - Returns: - Filtered and validated list of claims - """ - if not claims: - return [] - - validated = [] - seen_texts: set[str] = set() - - for claim in claims: - # Skip empty or very short claims - if not claim.text or len(claim.text.strip()) < 3: - continue - - # Skip duplicates - normalized_text = claim.text.strip().lower() - if normalized_text in seen_texts: - continue - - seen_texts.add(normalized_text) - validated.append(claim) - - return validated - - -# ===== CONVENIENCE FUNCTION ===== - -def extract_and_validate_claims( - extraction_input: ClaimExtractionInput, - llm_config: LLMConfig -) -> ClaimExtractionOutput: - """ - Extracts claims and validates them in one call. - - This is the recommended entry point for most use cases. - - Args: - extraction_input: Input wrapping a DataSource with id, source_type, original_text, and metadata - llm_config: LLM configuration (model name, temperature, timeout). - - Returns: - ClaimExtractionOutput containing validated list of ExtractedClaim objects with source tracking - """ - result = extract_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - validated_claims = validate_claims(result.claims) - return ClaimExtractionOutput(claims=validated_claims) diff --git a/app/ai/pipeline/evidence_retrieval.py b/app/ai/pipeline/evidence_retrieval.py deleted file mode 100644 index 2187b8e..0000000 --- a/app/ai/pipeline/evidence_retrieval.py +++ /dev/null @@ -1,240 +0,0 @@ -""" -Evidence Retrieval Step for the Fact-Checking Pipeline. - -This module gathers evidence from multiple sources to support or refute claims. -Designed to be composable - new evidence sources can be easily added. - -Architecture: -- Receives an EvidenceRetrievalInput containing a list of ExtractedClaims -- For each claim, runs it through multiple evidence gatherers (web search, fact-check APIs, etc.) 
-- Each gatherer returns zero or more Citations -- Returns an EvidenceRetrievalResult mapping claim IDs to EnrichedClaims with citations - -Key Design Principles: -- No LLM calls in this step (pure retrieval) -- Composable architecture via EvidenceGatherer protocol -- Stateless design with explicit state passing -- Type annotations throughout -- Support for both sync and async operations -""" - -import asyncio -from typing import List, Dict - -from app.models import ( - EvidenceRetrievalInput, - EvidenceRetrievalResult, - EnrichedClaim, - Citation, -) -from app.ai.context import ( - EvidenceGatherer -) -from app.ai.context.web import ( - WebSearchGatherer -) -from app.observability.logger.logger import get_logger -from app.config import get_trusted_domains - -logger = get_logger(__name__) - - - -# ===== MAIN EVIDENCE RETRIEVAL FUNCTIONS ===== - -async def gather_evidence_async( - retrieval_input: EvidenceRetrievalInput, - gatherers: List[EvidenceGatherer] | None = None, - timeout_per_gatherer: float = 45.0 -) -> EvidenceRetrievalResult: - """ - Main async function to gather evidence for all claims. - - For each claim, runs it through all evidence gatherers and accumulates - citations. Returns a mapping of claim IDs to enriched claims with evidence. - - Args: - retrieval_input: Input containing list of claims to gather evidence for - gatherers: List of evidence gatherers to use. If None, uses default (web search only). - - Returns: - EvidenceRetrievalResult with claim_evidence_map containing enriched claims - - Example: - >>> from app.models import EvidenceRetrievalInput, ExtractedClaim, ClaimSource - >>> claim = ExtractedClaim( - ... id="claim-123", - ... text="Vaccine X causes infertility", - ... source=ClaimSource(source_type="original_text", source_id="msg-001") - ... 
) - >>> input_data = EvidenceRetrievalInput(claims=[claim]) - >>> result = await gather_evidence_async(input_data) - >>> enriched = result.claim_evidence_map["claim-123"] - >>> len(enriched.citations) > 0 - True - """ - # use default gatherers if none provided - if gatherers is None: - allowed_domains = get_trusted_domains() - gatherers = [WebSearchGatherer(max_results=5,allowed_domains=allowed_domains)] - - # initialize result map (maps claim id to its enriched claim) - claim_evidence_map: Dict[str, EnrichedClaim] = {} - - # process each claim - for claim in retrieval_input.claims: - logger.info(f"processing claim: {claim.text[:80]}...") - - # gather citations from all sources - all_citations: List[Citation] = [] - - for gatherer in gatherers: - gatherer_name = gatherer.source_name if hasattr(gatherer, 'source_name') else type(gatherer).__name__ - - try: - logger.debug(f"querying {gatherer_name}...") - citations = await gatherer.gather(claim) - - if citations: - logger.info(f"{gatherer_name}: {len(citations)} citation(s) found") - else: - logger.warning(f"{gatherer_name}: no citations found") - - all_citations.extend(citations) - - except asyncio.TimeoutError: - logger.error(f"{gatherer_name}: TIMEOUT - operation exceeded time limit") - except Exception as e: - logger.error(f"{gatherer_name}: ERROR - {type(e).__name__}: {str(e)[:100]}") - - logger.info(f"total citations for this claim: {len(all_citations)}") - - # create enriched claim with citations - # EnrichedClaim extends ExtractedClaim, so we copy all fields - enriched_claim = EnrichedClaim( - id=claim.id, - text=claim.text, - source=claim.source, - llm_comment=claim.llm_comment, - entities=claim.entities, - citations=all_citations - ) - - # add to result map - claim_evidence_map[claim.id] = enriched_claim - - return EvidenceRetrievalResult(claim_evidence_map=claim_evidence_map) - - -# ===== HELPER FUNCTIONS ===== - -def deduplicate_citations(citations: List[Citation]) -> List[Citation]: - """ - Remove 
duplicate citations based on URL. - - If multiple citations have the same URL, keeps the first one. - - Args: - citations: List of citations that may contain duplicates - - Returns: - Deduplicated list of citations - """ - if not citations: - return [] - - seen_urls: set[str] = set() - deduplicated: List[Citation] = [] - - for citation in citations: - # normalize URL for comparison - normalized_url = citation.url.lower().strip() - - if normalized_url not in seen_urls: - seen_urls.add(normalized_url) - deduplicated.append(citation) - - return deduplicated - - -def filter_low_quality_citations( - citations: List[Citation], - min_text_length: int = 10 -) -> List[Citation]: - """ - Filter out low-quality citations. - - Removes citations with: - - Very short citation text - - Missing critical fields - - Args: - citations: List of citations to filter - min_text_length: Minimum length for citation_text - - Returns: - Filtered list of citations - """ - if not citations: - return [] - - filtered: List[Citation] = [] - - for citation in citations: - # skip if citation text is too short - if len(citation.citation_text.strip()) < min_text_length: - continue - - # skip if critical fields are missing - if not citation.url or not citation.title: - continue - - filtered.append(citation) - - return filtered - - -# ===== CONVENIENCE FUNCTION ===== - -async def gather_and_filter_evidence( - retrieval_input: EvidenceRetrievalInput, - gatherers: List[EvidenceGatherer] | None = None, - deduplicate: bool = True, - filter_quality: bool = True -) -> EvidenceRetrievalResult: - """ - Gathers evidence and applies quality filters in one call. - - This is the recommended entry point for most use cases. - Automatically deduplicates and filters low-quality citations. - - Args: - retrieval_input: Input containing list of claims to gather evidence for - gatherers: List of evidence gatherers to use. If None, uses default. 
- deduplicate: Whether to remove duplicate citations - filter_quality: Whether to filter out low-quality citations - - Returns: - EvidenceRetrievalResult with filtered and deduplicated citations - """ - # gather evidence - result = await gather_evidence_async( - retrieval_input=retrieval_input, - gatherers=gatherers - ) - - # apply filters if requested - if deduplicate or filter_quality: - for enriched_claim in result.claim_evidence_map.values(): - citations = enriched_claim.citations - - if deduplicate: - citations = deduplicate_citations(citations) - - if filter_quality: - citations = filter_low_quality_citations(citations) - - # update the enriched claim with filtered citations - enriched_claim.citations = citations - - return result diff --git a/app/ai/pipeline/judgement.py b/app/ai/pipeline/judgement.py deleted file mode 100644 index 26bd5d6..0000000 --- a/app/ai/pipeline/judgement.py +++ /dev/null @@ -1,514 +0,0 @@ -""" -Adjudication/Judgment Step for the Fact-Checking Pipeline. - -Follows LangChain best practices: -- LCEL composition for declarative chains -- Structured outputs with Pydantic -- Stateless design with explicit state passing -- Type annotations throughout -- Support for both sync and async operations - -Architecture: -- Receives AdjudicationInput with DataSources paired with their EnrichedClaims -- Analyzes evidence and generates verdicts for each claim -- Returns structured FactCheckResult with verdicts grouped by data source -""" - -from typing import List, Optional -from datetime import datetime, timezone -from langchain_core.runnables import Runnable - -from app.models import ( - AdjudicationInput, - FactCheckResult, - DataSourceResult, - ClaimVerdict, - LLMConfig, - DataSourceWithClaims, - EnrichedClaim, - LLMDataSourceResult, - LLMAdjudicationOutput, -) -from .prompts import get_adjudication_prompt -from app.observability.logger import time_profile, PipelineStep, get_logger - - -# ===== CONSTANTS ===== - -# date format for fact-checking 
context: DD-MM-YYYY -DATE_FORMAT = "%d-%m-%Y" - - -# ===== HELPER FUNCTIONS FOR DATE HANDLING ===== - -def get_current_date() -> str: - """ - Returns the current date in DD-MM-YYYY format using UTC timezone. - - Returns: - Formatted date string (e.g., "08-12-2024") - """ - now = datetime.now(timezone.utc) - return now.strftime(DATE_FORMAT) - - -# ===== HELPER FUNCTIONS FOR INPUT FORMATTING ===== - -def format_enriched_claim(claim: EnrichedClaim) -> str: - """ - Formats an EnrichedClaim into a string representation for the LLM. - - Args: - claim: EnrichedClaim with citations and evidence - - Returns: - Formatted string with claim text, citations, and search queries - """ - lines = [] - lines.append(f" Afirmação ID: {claim.id}") - lines.append(f" Texto: {claim.text}") - - if claim.citations: - lines.append(f"\n Citações e Evidências ({len(claim.citations)} fonte(s)):") - for i, citation in enumerate(claim.citations, 1): - lines.append(f"\n [{i}] {citation.title}") - lines.append(f" Fonte: {citation.publisher}") - lines.append(f" URL: {citation.url}") - lines.append(f" Trecho: \"{citation.citation_text}\"") - if citation.rating: - lines.append(f" Avaliação prévia: {citation.rating}") - if citation.date: - lines.append(f" Data da revisão: {citation.date}") - else: - lines.append("\n Citações e Evidências: Nenhuma fonte encontrada") - - return "\n".join(lines) - - -def format_data_source_with_claims(source_with_claims: DataSourceWithClaims) -> str: - """ - Formats a DataSource with its EnrichedClaims for LLM input. 
- - Args: - source_with_claims: DataSourceWithClaims object - - Returns: - Formatted string combining DataSource metadata and all claims with evidence - """ - lines = [] - - # Format the data source using its to_llm_string method - lines.append(source_with_claims.data_source.to_llm_string()) - lines.append("\nAfirmações extraídas da fonte e as evidências de cada uma\n") - - if source_with_claims.enriched_claims: - for i, claim in enumerate(source_with_claims.enriched_claims, 1): - lines.append(f"Afirmação {i}:") - lines.append(format_enriched_claim(claim)) - lines.append("\n" + "-" * 80 + "\n") - else: - lines.append("Nenhuma alegação extraída desta fonte.\n") - - return "\n".join(lines) - - -def format_adjudication_input(adjudication_input: AdjudicationInput) -> str: - """ - Formats the complete AdjudicationInput for LLM consumption. - - Args: - adjudication_input: AdjudicationInput with all sources and claims - - Returns: - Complete formatted string for the LLM prompt - """ - lines = [] - - for source_with_claims in adjudication_input.sources_with_claims: - lines.append(f"\n{'=' * 80}") - lines.append("NOVA FONTE DE DADOS \n") - lines.append(format_data_source_with_claims(source_with_claims)) - - return "\n".join(lines) - - -# ===== Helper functions for LLM output parsing ===== - -def get_data_source_with_claims( - llm_source_result: LLMDataSourceResult, - adjudication_input: AdjudicationInput, - result_index: int -) -> Optional[DataSourceWithClaims]: - """ - Matches an LLM data source result back to the original input. - - Uses hybrid matching strategy: - 1. Try to match by data_source_id (if provided by LLM) - 2. 
Fall back to matching by position/order - - Args: - llm_source_result: LLM output for one data source - adjudication_input: Original input with all sources - result_index: Position of this result in the LLM output list - - Returns: - Matched DataSourceWithClaims or None if no match found - """ - # Create mapping of data_source_id to original source_with_claims - source_map = { - source_with_claims.data_source.id: source_with_claims - for source_with_claims in adjudication_input.sources_with_claims - } - - # TODO change the prints to logs - - # Try to match by data_source_id first - if llm_source_result.data_source_id: - source_with_claims = source_map.get(llm_source_result.data_source_id) - if source_with_claims: - return source_with_claims - - print(f"[WARNING] LLM returned unknown data_source_id: {llm_source_result.data_source_id}") - - - # Fallback: match by order (position in list) - print(f"[INFO] data_source_id missing for result {result_index}, matching by order") - if result_index < len(adjudication_input.sources_with_claims): - return adjudication_input.sources_with_claims[result_index] - - - print(f"[WARNING] No source at index {result_index}") - return None - - -def get_claim_verdicts( - llm_source_result: LLMDataSourceResult, - source_with_claims: DataSourceWithClaims -) -> List[ClaimVerdict]: - """ - Converts LLM claim verdicts to ClaimVerdict objects with proper IDs. - - Uses hybrid matching strategy for claim IDs: - 1. Try to use claim_id from LLM output (if provided and valid) - 2. 
Fall back to matching by claim_text - - Args: - llm_source_result: LLM output for one data source - source_with_claims: Original input for this data source - - Returns: - List of ClaimVerdict objects with proper claim_id populated - """ - # Create mappings for claim matching - claim_id_by_id = {claim.id: claim for claim in source_with_claims.enriched_claims} - claim_id_by_text = {claim.text: claim.id for claim in source_with_claims.enriched_claims} - - # Convert LLM verdicts to ClaimVerdict objects - claim_verdicts: List[ClaimVerdict] = [] - for llm_verdict in llm_source_result.claim_verdicts: - # Try to get claim_id: first from LLM output, then from claim_text matching - if llm_verdict.claim_id and llm_verdict.claim_id in claim_id_by_id: - # Use claim_id from LLM (most reliable) - claim_id = llm_verdict.claim_id - else: - # Fallback: match by claim_text - claim_id = claim_id_by_text.get(llm_verdict.claim_text, "unknown") - if llm_verdict.claim_id: - print(f"[WARNING] LLM returned unknown claim_id: {llm_verdict.claim_id}, matched by text instead") - - verdict = ClaimVerdict( - claim_id=claim_id, - claim_text=llm_verdict.claim_text, - verdict=llm_verdict.verdict, - justification=llm_verdict.justification, - citations_used=llm_verdict.citations_used - ) - claim_verdicts.append(verdict) - - return claim_verdicts - - -# ===== CHAIN CONSTRUCTION ===== - -def build_adjudication_chain(llm_config: LLMConfig) -> Runnable: - """ - builds the LCEL chain for fact-check adjudication. - - the chain follows this structure: - prompt | model.with_structured_output() -> LLMAdjudicationOutput - - args: - llm_config: LLM configuration with BaseChatOpenAI instance. 
- - returns: - a Runnable chain that takes dict input and returns LLMAdjudicationOutput - - best practices applied: - - structured output binding for type safety - - uses configured LLM model for advanced reasoning - - stateless design - no global state - """ - # get the prompt template - prompt = get_adjudication_prompt() - - # use the llm from config directly - model = llm_config.llm - - # bind the structured output schema - # note: using default method instead of json_mode for better reliability - structured_model = model.with_structured_output( - LLMAdjudicationOutput - ) - - # compose the chain using LCEL - chain = prompt | structured_model - - return chain - - -def _log_input_metrics( - adjudication_input: AdjudicationInput, - formatted_sources: str, - additional_context_str: str -) -> None: - """ - log input size metrics for adjudication performance analysis. - - logs metrics about the input complexity to help correlate with execution time: - - number of data sources and claims - - total evidence citations - - estimated prompt size in characters and tokens - - args: - adjudication_input: the adjudication input with sources and claims - formatted_sources: the formatted sources string for the prompt - additional_context_str: additional context string (if any) - """ - logger = get_logger(__name__, PipelineStep.ADJUDICATION) - - # calculate input metrics - total_claims = sum( - len(source.enriched_claims) - for source in adjudication_input.sources_with_claims - ) - total_sources = len(adjudication_input.sources_with_claims) - total_citations = sum( - len(claim.citations) - for source in adjudication_input.sources_with_claims - for claim in source.enriched_claims - ) - - # log metrics with prefix - logger.set_prefix("[ADJUNDICATOR INPUT METRICS]") - logger.info(f"total data sources: {total_sources}") - logger.info(f"total claims to adjudicate: {total_claims}") - logger.info(f"total evidence citations: {total_citations}") - - if total_claims > 0: - 
avg_citations_per_claim = total_citations / total_claims - logger.info(f"average citations per claim: {avg_citations_per_claim:.1f}") - - # log prompt size estimate - total_prompt_chars = len(formatted_sources) + len(additional_context_str) - estimated_tokens = total_prompt_chars // 4 # rough estimate: 1 token ≈ 4 chars - logger.info(f"total prompt size: {total_prompt_chars} chars (~{estimated_tokens} tokens estimated)") - - logger.clear_prefix() - - -# ===== MAIN ADJUDICATION FUNCTIONS ===== - -@time_profile(PipelineStep.ADJUDICATION) -def adjudicate_claims( - adjudication_input: AdjudicationInput, - llm_config: LLMConfig -) -> FactCheckResult: - """ - Adjudicates fact-checkable claims with evidence-based verdicts. - - This is the main synchronous entry point for claim adjudication. - Analyzes each claim with its evidence and generates structured verdicts. - - Args: - adjudication_input: AdjudicationInput with data sources and enriched claims - llm_config: LLM configuration (model name, temperature, timeout). - - Returns: - FactCheckResult with structured verdicts grouped by data source - - Example: - >>> from app.models import AdjudicationInput, DataSourceWithClaims, LLMConfig - >>> # ... create adjudication_input ... 
- >>> config = LLMConfig(model_name="o3-mini", temperature=0.0) - >>> result = adjudicate_claims(adjudication_input, llm_config=config) - >>> print(len(result.results)) - 2 - >>> print(result.results[0].claim_verdicts[0].verdict) - "Falso" - """ - logger = get_logger(__name__, PipelineStep.ADJUDICATION) - logger.debug("starting adjudicate_claims function") - - # Build the chain - try: - chain = build_adjudication_chain(llm_config=llm_config) - except Exception as e: - print(f"[ADJUDICATOR ERROR] Failed to build chain: {e}") - raise - - # Format the input for the LLM - try: - formatted_sources = format_adjudication_input(adjudication_input) - except Exception as e: - print(f"[ADJUDICATOR ERROR] Failed to format input: {e}") - import traceback - traceback.print_exc() - raise - - # Get current date - current_date = get_current_date() - - # Prepare additional context - additional_context_str = "" - if adjudication_input.additional_context: - additional_context_str = f"\n**Contexto Adicional**: {adjudication_input.additional_context}\n" - - # Prepare input for the prompt template - chain_input = { - "current_date": current_date, - "formatted_sources_and_claims": formatted_sources, - "additional_context": additional_context_str - } - - # Log input metrics for performance analysis - _log_input_metrics(adjudication_input, formatted_sources, additional_context_str) - - # Invoke the chain - gets LLM output - try: - result: LLMAdjudicationOutput = chain.invoke(chain_input) - except Exception as e: - print(f"[ADJUDICATOR ERROR] LLM invocation failed: {e}") - import traceback - traceback.print_exc() - raise - - # Convert LLM output to FactCheckResult using helper functions - data_source_results: List[DataSourceResult] = [] - - # Process each LLM result - for idx, llm_source_result in enumerate(result.results): - # Match LLM result to original input data source - source_with_claims = get_data_source_with_claims( - llm_source_result=llm_source_result, - 
adjudication_input=adjudication_input, - result_index=idx - ) - if not source_with_claims: - print(f"[ADJUDICATOR WARNING] No source_with_claims match found for result {idx}") - continue # Skip if no match found - - # Convert LLM verdicts to ClaimVerdict objects with proper IDs - claim_verdicts = get_claim_verdicts( - llm_source_result=llm_source_result, - source_with_claims=source_with_claims - ) - - # Create DataSourceResult with info from original input - source_result = DataSourceResult( - data_source_id=source_with_claims.data_source.id, - source_type=source_with_claims.data_source.source_type, - claim_verdicts=claim_verdicts - ) - data_source_results.append(source_result) - - return FactCheckResult( - results=data_source_results, - overall_summary=result.overall_summary if result.overall_summary else None, - sources_with_claims=adjudication_input.sources_with_claims - ) - - -async def adjudicate_claims_async( - adjudication_input: AdjudicationInput, - llm_config: LLMConfig -) -> FactCheckResult: - """ - Async version of adjudicate_claims. - - Follows LangChain best practice: provide async methods for IO-bound operations. - - Args: - adjudication_input: AdjudicationInput with data sources and enriched claims - llm_config: LLM configuration (model name, temperature, timeout). 
- - Returns: - FactCheckResult with structured verdicts grouped by data source - """ - # Build the chain - chain = build_adjudication_chain(llm_config=llm_config) - - # Format the input for the LLM - formatted_sources = format_adjudication_input(adjudication_input) - - # Get current date - current_date = get_current_date() - - # Prepare additional context - additional_context_str = "" - if adjudication_input.additional_context: - additional_context_str = f"\n**Contexto Adicional**: {adjudication_input.additional_context}\n" - - # Prepare input for the prompt template - chain_input = { - "current_date": current_date, - "formatted_sources_and_claims": formatted_sources, - "additional_context": additional_context_str - } - - # Invoke the chain asynchronously - gets LLM output - result: LLMAdjudicationOutput = await chain.ainvoke(chain_input) - - # Debug: Print what LLM returned - print("\n[DEBUG] LLM returned (async):") - print(f" - Number of data source results: {len(result.results)}") - print(f" - Overall summary present: {bool(result.overall_summary)}") - if result.results: - print(f" - First result has {len(result.results[0].claim_verdicts)} verdict(s)") - else: - print(" - WARNING: results list is empty!") - print(f" - Raw result object: {result}") - - # Convert LLM output to FactCheckResult using helper functions - data_source_results: List[DataSourceResult] = [] - - # Process each LLM result - for idx, llm_source_result in enumerate(result.results): - # Match LLM result to original input data source - source_with_claims = get_data_source_with_claims( - llm_source_result=llm_source_result, - adjudication_input=adjudication_input, - result_index=idx - ) - if not source_with_claims: - continue # Skip if no match found - - # Convert LLM verdicts to ClaimVerdict objects with proper IDs - claim_verdicts = get_claim_verdicts( - llm_source_result=llm_source_result, - source_with_claims=source_with_claims - ) - - # Create DataSourceResult with info from original input - 
source_result = DataSourceResult( - data_source_id=source_with_claims.data_source.id, - source_type=source_with_claims.data_source.source_type, - claim_verdicts=claim_verdicts - ) - data_source_results.append(source_result) - - return FactCheckResult( - results=data_source_results, - overall_summary=result.overall_summary if result.overall_summary else None, - sources_with_claims=adjudication_input.sources_with_claims - ) - diff --git a/app/ai/pipeline/link_context_expander.py b/app/ai/pipeline/link_context_expander.py deleted file mode 100644 index f0edd51..0000000 --- a/app/ai/pipeline/link_context_expander.py +++ /dev/null @@ -1,313 +0,0 @@ -""" -Link Context Expander Step for the Fact-Checking Pipeline. - -This module is responsible for extracting links from an 'original_text' DataSource -and transforming each one into a 'link_context' DataSource with expanded content. - -Architecture: -- Receives a DataSource of type 'original_text' -- Extracts all URLs from the text -- Expands each link to get its content using web scraping (in parallel using ThreadPool) -- Returns a list of new DataSources of type 'link_context' -- Enforces timeouts from PipelineConfig -""" -import re -import uuid -import asyncio -import logging -from typing import List, Optional -from app.ai.threads.thread_utils import wait_all -from app.models import DataSource, PipelineConfig -from app.ai.context.web.apify_utils import scrapeGenericUrl -from app.ai.context.web.models import WebContentResult -from app.ai.threads.thread_utils import ThreadPoolManager, OperationType - -logger = logging.getLogger(__name__) - - -def extract_links(text: str) -> List[str]: - """ - Extract all URLs from text using regex. - - Supports http, https protocols and common URL patterns. - Returns list of unique URLs found in the text. 
- - Args: - text: The text to extract links from - - Returns: - List of unique URLs found in the text, preserving order - - Example: - >>> extract_links("Check this out: https://example.com and https://test.com") - ['https://example.com', 'https://test.com'] - """ - # regex pattern for URLs with http/https - url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+' - - # find all matches - urls = re.findall(url_pattern, text) - - # strip common trailing punctuation that's not part of URLs - # common punctuation: . , ; : ! ? ) ] } that often follow URLs in text - trailing_punctuation = '.,:;!?)]}' - cleaned_urls = [] - for url in urls: - # strip trailing punctuation - while url and url[-1] in trailing_punctuation: - url = url[:-1] - if url: # only add non-empty URLs - cleaned_urls.append(url) - - # remove duplicates while preserving order - seen = set() - unique_urls = [] - for url in cleaned_urls: - if url not in seen: - seen.add(url) - unique_urls.append(url) - - return unique_urls - - -async def expand_link_context(url: str) -> WebContentResult: - """ - Expand a link and extract its content using web scraping. - - Uses the scrapeGenericUrl function to fetch and parse content from the URL. - Handles different platforms (social media, generic websites) automatically. - - Args: - url: The URL to expand and extract content from - - Returns: - WebContentResult with the scraped content and metadata - - Note: - This function: - - Detects platform automatically (Facebook, Instagram, Twitter, TikTok, generic) - - Tries simple HTTP scraping first for generic sites (no browser, faster) - - Falls back to Apify actor with browser if simple scraping fails - - Handles errors (404, timeouts, etc.) gracefully - - Supports different content types (articles, social media posts, etc.) 
- - Processing time is measured by the scraping functions and included in result - """ - # call the scraping function (processing time is measured internally) - result_dict = await scrapeGenericUrl(url) - - # parse the result dict into WebContentResult schema - result = WebContentResult.from_dict(data=result_dict, url=url) - - return result - - -def expand_link_context_sync(url: str, timeout_per_link: float) -> Optional[WebContentResult]: - """ - Synchronous wrapper for expand_link_context to be used in thread pool. - - This function runs the async expand_link_context in a new event loop, - making it suitable for execution in worker threads. - - Args: - url: The URL to expand and extract content from - timeout_per_link: Timeout in seconds for this link expansion - - Returns: - WebContentResult if successful, None if timeout or error occurs - """ - import time - start_time = time.time() - - logger.info(f"[SYNC] Starting scrape: {url[:80]}...") - - try: - # create new event loop for this thread - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - # run async function with timeout - result = loop.run_until_complete( - asyncio.wait_for( - expand_link_context(url), - timeout=timeout_per_link - ) - ) - - elapsed = time.time() - start_time - logger.info( - f"[SYNC] ✅ Success: {url[:60]}... | " - f"time={elapsed:.2f}s | content={result.content_length} chars | " - f"success={result.success}" - ) - return result - finally: - loop.close() - - except asyncio.TimeoutError: - elapsed = time.time() - start_time - logger.warning( - f"[SYNC] ⏱️ TIMEOUT: {url[:60]}... | " - f"limit={timeout_per_link}s | elapsed={elapsed:.2f}s" - ) - return None - except Exception as e: - elapsed = time.time() - start_time - logger.error( - f"[SYNC] ❌ ERROR: {url[:60]}... 
| " - f"elapsed={elapsed:.2f}s | error={type(e).__name__}: {str(e)[:100]}", - exc_info=True - ) - return None - - -def expand_link_contexts( - data_source: DataSource, - config: PipelineConfig -) -> List[DataSource]: - """ - Main function to expand link contexts from an original_text DataSource in parallel. - - Takes a DataSource of type 'original_text', extracts all links from it, - expands each link to get its content using web scraping IN PARALLEL using ThreadPool, - and returns a list of new DataSources of type 'link_context'. - - Uses ThreadPoolManager with OperationType.LINK_CONTEXT_EXPANDING for priority-based - parallel execution. Results are collected as they complete for maximum throughput. - - Args: - data_source: Input DataSource that must be of type 'original_text' - config: Pipeline configuration with timeout and limit settings - - Returns: - List of DataSources, one for each successfully expanded link - - Raises: - ValueError: If the input DataSource is not of type 'original_text' - - Example: - >>> from app.models import DataSource - >>> from app.config.default import get_default_pipeline_config - >>> original = DataSource( - ... id="msg-001", - ... source_type="original_text", - ... original_text="Check out https://example.com for more info" - ... 
) - >>> config = get_default_pipeline_config() - >>> expanded = expand_link_contexts(original, config) - >>> len(expanded) - 1 - >>> expanded[0].source_type - 'link_context' - >>> expanded[0].metadata['url'] - 'https://example.com' - """ - - # validate input DataSource type - if data_source.source_type != "original_text": - raise ValueError( - f"expand_link_contexts expects a DataSource of type 'original_text', " - f"but received type '{data_source.source_type}'" - ) - - # extract links from the text - links = extract_links(data_source.original_text) - - # if no links found, return empty list - if not links: - logger.info("no links found in original text") - return [] - - # limit number of links based on config - original_count = len(links) - links = links[:config.max_links_to_expand] - if original_count > len(links): - logger.info( - f"limiting link expansion from {original_count} to {len(links)} " - f"(max_links_to_expand={config.max_links_to_expand})" - ) - - logger.info(f"expanding {len(links)} links in parallel using ThreadPool") - - # get thread pool manager instance - manager = ThreadPoolManager.get_instance() - if not manager._initialized: - manager.initialize() - - # submit all link expansion jobs to thread pool - timeout_per_link = config.timeout_config.link_content_expander_timeout_per_link - - futures = [] - for url in links: - future = manager.submit( - OperationType.LINK_CONTEXT_EXPANDING, - expand_link_context_sync, - url, - timeout_per_link - ) - futures.append(future) - - # wait for ALL results (simple and clean!) 
- try: - web_results = wait_all( - futures, - timeout=config.timeout_config.link_content_expander_timeout_total - ) - except TimeoutError: - logger.warning( - f"total timeout exceeded for link expansion " - f"(limit: {config.timeout_config.link_content_expander_timeout_total}s)" - ) - return [] - - # process all results - expanded_sources: List[DataSource] = [] - successful_count = 0 - failed_count = 0 - - for web_result in web_results: - # if result is None, job failed or timed out - if web_result is None: - failed_count += 1 - continue - - # create metadata dict from web result - metadata = { - "success": web_result.success, - "url": web_result.url, - "content_length": web_result.content_length, - "parent_source_id": data_source.id, - } - - # add social media metadata if available - if web_result.metadata: - metadata["platform"] = web_result.metadata.platform - metadata["author"] = web_result.metadata.author - metadata["timestamp"] = web_result.metadata.timestamp - metadata["likes"] = web_result.metadata.likes - metadata["shares"] = web_result.metadata.shares - metadata["comments"] = web_result.metadata.comments - - # add error to metadata if scraping failed - if web_result.error: - metadata["error"] = web_result.error - - # create a new DataSource for this link - link_source = DataSource( - id=f"link-{uuid.uuid4()}", - source_type="link_context", - original_text=web_result.content if web_result.success else "", - metadata=metadata, - locale=data_source.locale, - timestamp=data_source.timestamp, - ) - - expanded_sources.append(link_source) - successful_count += 1 - - logger.info( - f"link expansion complete: {successful_count} succeeded, {failed_count} failed, " - f"{len(expanded_sources)} total DataSources created" - ) - - return expanded_sources \ No newline at end of file diff --git a/app/ai/pipeline/no_claims_fallback.py b/app/ai/pipeline/no_claims_fallback.py deleted file mode 100644 index d6a41b9..0000000 --- a/app/ai/pipeline/no_claims_fallback.py +++ 
/dev/null @@ -1,222 +0,0 @@ -""" -No Claims Fallback Step for the Fact-Checking Pipeline. - -This module provides a fallback mechanism when the claim extractor cannot find -any verifiable claims in the user's input. It uses an LLM to generate a friendly -explanation for the user about why no claims could be extracted. - -Follows LangChain best practices: -- LCEL composition for declarative chains -- Structured outputs with Pydantic -- Stateless design with explicit state passing -- Type annotations throughout -- Support for both sync and async operations - -Architecture: -- Receives text that had no claims extracted -- Uses LLM to generate user-friendly explanation -- Returns structured output with explanation text -""" - -from typing import Optional -from pydantic import BaseModel, Field -from langchain_core.runnables import Runnable -from langchain_core.output_parsers import StrOutputParser - -from app.models import PipelineConfig -from .prompts import get_no_claims_fallback_prompt - - -# ===== STRUCTURED OUTPUT SCHEMA ===== - -class NoClaimsFallbackOutput(BaseModel): - """ - output schema for no claims fallback. - - contains the explanation text that will be shown to the user. - """ - explanation: str = Field( - ..., - description="Friendly explanation for why no claims were found" - ) - original_text: str = Field( - ..., - description="The original text that had no claims" - ) - - -# ===== CHAIN CONSTRUCTION ===== - -def build_no_claims_fallback_chain( - config: PipelineConfig -) -> Runnable: - """ - builds the LCEL chain for no claims fallback. - - the chain follows this structure: - prompt | model | output_parser -> str - - args: - config: Pipeline configuration with fallback LLM config. 
- - returns: - a Runnable chain that takes dict input and returns string explanation - - best practices applied: - - simple string output for user-facing messages - - moderate temperature for natural, friendly responses - - stateless design - no global state - """ - # get the prompt template - prompt = get_no_claims_fallback_prompt() - - # use the fallback llm from config - model = config.fallback_llm_config.llm - - # use string output parser for simple text response - output_parser = StrOutputParser() - - # compose the chain using LCEL - chain = prompt | model | output_parser - - return chain - - -# ===== MAIN FALLBACK FUNCTIONS ===== - -def generate_no_claims_explanation( - text: str, - config: PipelineConfig -) -> NoClaimsFallbackOutput: - """ - generates explanation for why no claims were found in text. - - this is the main synchronous entry point for no claims fallback. - - args: - text: the original text that had no verifiable claims - config: Pipeline configuration with fallback LLM config. - - returns: - NoClaimsFallbackOutput containing explanation and original text - - example: - >>> from app.config.default import get_default_pipeline_config - >>> config = get_default_pipeline_config() - >>> result = generate_no_claims_explanation("Olá, bom dia!", config) - >>> print(result.explanation) - "Olá! Não identifiquei nenhuma alegação verificável..." 
- """ - # build the chain - chain = build_no_claims_fallback_chain(config) - - # prepare input for the prompt template - chain_input = { - "text": text - } - - # invoke the chain - gets explanation string - try: - explanation: str = chain.invoke(chain_input) - except Exception as e: - # if LLM call fails (API overload, timeout, etc.), use default message - from app.observability.logger import get_logger - logger = get_logger(__name__) - logger.warning(f"no-claims fallback LLM call failed: {type(e).__name__}: {e}") - logger.info("using default no-claims message") - - # return friendly default message - explanation = ( - "Não consegui identificar alegações verificáveis em sua mensagem. " - "Para verificar informações, é útil incluir detalhes concretos como nomes de pessoas, " - "lugares, datas, números ou eventos específicos. " - "Posso ajudar com algo assim?" - ) - - # return structured output - return NoClaimsFallbackOutput( - explanation=explanation, - original_text=text - ) - - -async def generate_no_claims_explanation_async( - text: str, - config: PipelineConfig -) -> NoClaimsFallbackOutput: - """ - async version of generate_no_claims_explanation. - - follows LangChain best practice: provide async methods for IO-bound operations. - - args: - text: the original text that had no verifiable claims - config: Pipeline configuration with fallback LLM config. 
- - returns: - NoClaimsFallbackOutput containing explanation and original text - """ - # build the chain - chain = build_no_claims_fallback_chain(config) - - # prepare input for the prompt template - chain_input = { - "text": text - } - - # invoke the chain asynchronously - gets explanation string - try: - explanation: str = await chain.ainvoke(chain_input) - except Exception as e: - # if LLM call fails (API overload, timeout, etc.), use default message - from app.observability.logger import get_logger - logger = get_logger(__name__) - logger.warning(f"no-claims fallback LLM call failed: {type(e).__name__}: {e}") - logger.info("using default no-claims message") - - # return friendly default message - explanation = ( - "Não consegui identificar alegações verificáveis em sua mensagem. " - "Para verificar informações, é útil incluir detalhes concretos como nomes de pessoas, " - "lugares, datas, números ou eventos específicos. " - "Posso ajudar com algo assim?" - ) - - # return structured output - return NoClaimsFallbackOutput( - explanation=explanation, - original_text=text - ) - - -# ===== HELPER FUNCTIONS ===== - -def should_use_fallback(total_claims_count: int) -> bool: - """ - determines if fallback should be used based on claims count. - - args: - total_claims_count: total number of claims extracted from all sources - - returns: - True if fallback should be used (no claims found), False otherwise - """ - return total_claims_count == 0 - - -def get_combined_text_from_sources(sources: list) -> str: - """ - combines text from multiple data sources for fallback. 
- - args: - sources: list of DataSource objects - - returns: - combined text from all sources, joined with newlines - """ - texts = [] - for source in sources: - if hasattr(source, 'original_text') and source.original_text: - texts.append(source.original_text) - - return "\n\n".join(texts) if texts else "" diff --git a/app/ai/pipeline/prompts.py b/app/ai/pipeline/prompts.py deleted file mode 100644 index 8bd9e4d..0000000 --- a/app/ai/pipeline/prompts.py +++ /dev/null @@ -1,1078 +0,0 @@ -""" -Prompt templates for the fact-checking pipeline steps. -Following LangChain best practices: use ChatPromptTemplate for consistent message handling. -""" - -from langchain_core.prompts import ChatPromptTemplate - -# ===== CLAIM EXTRACTION PROMPTS ===== - -CLAIM_EXTRACTION_SYSTEM_PROMPT = """Você é um especialista em extração de alegações para um sistema de checagem de fatos. - -Sua tarefa é identificar as alegações verificáveis presentes no texto fornecido. Seja conservativo na extração de afirmações e apenas extrair afirmações coerentes e que contenham todo o contexto em si mesmo - -## O que Extrair: - -**Extraia alegações que:** -- Podem ser verificadas como verdadeiras ou falsas com base em evidências. -- Contenham afirmações sobre eventos, acontecimentos ou pessoas de forma a mais direta possível -- Contenham todo o contexto necessário para verificação embutidos -- Fazem afirmações sobre o mundo (eventos passados, presentes ou futuros). -- Contêm entidades nomeadas, eventos ou detalhes específicos. -- São opiniões pessoais que contém alegações ou juízo de valor sobre algum fato do mundo e podem ser verificadas. -- São perguntas que contém alegações ou juízo de valor sobre algum fato do mundo e podem ser verificadas. -- Fazem afirmações sobre ações futuras de grupos, organizações, governos ou pessoas (desde que sejam alegações verificáveis sobre planos, anúncios ou intenções declaradas). 
- -**Exemplos de boas alegações:** -- "A vacina X causa infertilidade em mulheres" -- "O presidente anunciou um imposto de carbono de R$50 por tonelada" -- "O evento de nome X aconteceu na cidade de Sidney" -- "O estudo examinou 50.000 participantes" -- "Não há evidências ligando a vacina X a problemas de fertilidade" -- "Eu acho que vacinas causam autismo" -- "Vacinas causam autismo?" -- "O governo vai aumentar os impostos sobre combustíveis em janeiro" -- "A empresa X vai demitir 5.000 funcionários no próximo trimestre" -- "O partido Y anunciou que vai apresentar um projeto de lei para proibir plásticos descartáveis" -- "O sindicato planeja iniciar uma greve nacional na próxima semana" - -**CRÍTICO - Extraia O QUE está sendo alegado, NÃO COMO está sendo compartilhado:** - -Quando você vê frases como: -- "circula como" -- "é compartilhado como se" -- "apresentado como" -- "divulgado dizendo que" -- "compartilhada como se fosse" - -Você DEVE extrair a ALEGAÇÃO SUBSTANTIVA (o que está sendo afirmado que aconteceu), NÃO o ato de compartilhamento. 
- -**Transformação OBRIGATÓRIA:** -- Se o texto diz: "X é compartilhado como se fosse Y" -- Extraia: "Y" (não "X é compartilhado") - - ERRADO - Meta-alegações sobre compartilhamento: - - "A foto é compartilhada como se mostrasse um acidente" - - "O post circula dizendo que houve um terremoto" - - "O vídeo é apresentado como se fosse de 2024" - - "A paralisação foi compartilhada como se fosse em dezembro" - - CORRETO - Alegações sobre o evento/fato substantivo: - - "Houve um acidente na rodovia X" (de: "Foto é compartilhada como se mostrasse um acidente na rodovia X") - - "Houve um terremoto na cidade Y" (de: "Post circula dizendo que houve um terremoto na cidade Y") - - "O vídeo mostra eventos que ocorreram em 2024" (de: "Vídeo é apresentado como se fosse de 2024") - - "Houve uma paralisação em dezembro" (de: "Paralisação foi compartilhada como se fosse em dezembro") - -**NÃO extraia:** -- Perguntas sem alegações implícitas ("O que você acha?") -- Afirmações cujo contexto esteja faltando ou que mencione entidades externas à afirmação em si (Ex: o evento ocorreu na cidade) -- Cumprimentos ou conversa trivial -- Trechos dos quais não é possível extrair nenhuma afirmação sobre algo, nenhum fato ou nenhum juízo de valor: (Ex: Olá, bom dia) -- Meta-alegações sobre como a informação está sendo compartilhada ou apresentada, ao invés do fato substantivo em si - - -## Diretrizes: - -**PRIORIDADE: Extraia o MENOR número de alegações possível, com o MÁXIMO de contexto em cada uma.** - -Prefira consolidar informações relacionadas em UMA alegação rica, ao invés de múltiplas alegações vagas. 
- -**REGRA DE OURO - NUNCA extraia alegações vagas:** -- Se o texto menciona "o ataque", "o evento", "a vacina", "o acidente" SEM especificar QUAL/ONDE/QUANDO -- Você DEVE procurar essas informações no texto e incluí-las na alegação -- Se NÃO encontrar contexto suficiente no texto, NÃO extraia essa alegação - -ERRADO - Alegações vagas SEM contexto específico: - - "A imagem sugere que o ataque foi encenado" (QUAL ataque? ONDE? QUANDO?) - - "A imagem sugere que o ataque terrorista foi encenado" (QUAL ataque terrorista? ONDE? QUANDO?) - - "O evento aconteceu" (QUAL evento? ONDE? QUANDO?) - - "A vacina causa problemas" (QUAL vacina? QUE problemas?) - - "Houve uma paralisação" (ONDE? QUANDO? DE QUEM?) - -CORRETO - Alegações ricas com contexto completo: - - "A imagem sugere que o ataque terrorista ao shopping Westgate em Nairobi, Quênia, em setembro de 2013 foi encenado" - - "O terremoto de magnitude 7.0 aconteceu na cidade de Marrakech, Marrocos, em março de 2024" - - "Houve uma paralisação de caminhoneiros no Rodoanel de São Paulo em novembro de 2025" - -1. **Normalize e esclareça - MAXIMIZE O CONTEXTO**: Reformule alegações para serem claras, específicas, autocontidas e independentes. Inclua TODOS os detalhes relevantes (quem, o quê, quando, onde) em CADA alegação. - - **OBRIGATÓRIO**: Antes de extrair qualquer alegação, pergunte-se: - - QUEM está envolvido? (pessoas específicas, organizações, grupos) - - O QUÊ aconteceu? (evento específico, não genérico) - - QUANDO aconteceu? (data, mês, ano - se mencionado) - - ONDE aconteceu? (local específico - cidade, país, endereço) - - Se você não consegue responder a maioria dessas perguntas, a alegação está VAGA DEMAIS e não deve ser extraída. - - Exemplos: - - VAGO: "Esse negócio da vacina é uma loucura!" 
- - ESPECÍFICO: "A vacina Pfizer contra COVID-19 tem efeitos colaterais perigosos" - - - VAGO: "O estudo examinou 50.000 participantes" - - ESPECÍFICO: "O estudo de segurança da vacina Pfizer contra COVID-19 publicado em 2021 examinou 50.000 participantes" - - - VAGO: "A imagem sugere que o ataque foi encenado" - - ESPECÍFICO: "A imagem sugere que o ataque terrorista ao shopping Westgate em Nairobi em setembro de 2013 foi encenado" - - **Sempre pergunte**: Esta alegação pode ser compreendida por alguém que NÃO leu o texto original? Se não, adicione mais contexto. - -2. **Extraia o fato substantivo, não o meta-relato**: Quando o texto menciona como algo está sendo compartilhado ("circula como", "é compartilhado como se", "compartilhada como se fosse"), - extraia a alegação sobre o EVENTO ou FATO em si, não sobre o ato de compartilhamento. - - **Regra de transformação**: "X é compartilhado como se fosse Y" → Extraia "Y aconteceu" (não "X é compartilhado") - - Exemplos: - - Original: "Imagem circula mostrando explosão em fábrica" - - ERRADO: "A imagem circula nas redes sociais" - - CORRETO: "Houve uma explosão em uma fábrica de produtos químicos" - - - Original: "Vídeo é compartilhado como se mostrasse paralisação em dezembro" - - ERRADO: "O vídeo é compartilhado como se mostrasse paralisação em dezembro" - - ERRADO: "A paralisação foi compartilhada como se fosse em dezembro" - - CORRETO: "Houve uma paralisação em dezembro" - -3. **APENAS alegações autocontidas**: Extraia alegações que podem ser compreendidas completamente sozinhas e não alegações que mencionam acontecimentos de forma abstrata, sem um nome ou informação específica - -BOM - Autocontidas: - - "Não há evidências ligando a vacina X a problemas de fertilidade em mulheres." - - "O estudo concluiu que a vacina X é segura." - - RUIM - Precisa de contexto: - - "O estudo examinou mais de 50.000 participantes." (Qual estudo?) - - "A pesquisa foi conduzida pelo Ministério da Saúde durante 3 anos." 
(Qual pesquisa?) - - "O evento ocorreu" (Qual evento?) - - **Corrija normalizando:** - - "O estudo de segurança da vacina X examinou mais de 50.000 participantes." - - "O Ministério da Saúde conduziu pesquisa sobre a vacina X durante 3 anos." - - "O evento Rock in Rio ocorreu" - - Se uma alegação usa pronomes (ele, ela, isso, aquilo) ou referências vagas (o estudo, a pesquisa), - normalize substituindo pelo sujeito real. Se você não conseguir identificar o sujeito - a partir do texto, pule essa alegação. - -4. **Consolide quando possível, separe quando necessário**: Prefira CONSOLIDAR informações relacionadas em UMA alegação rica. Apenas separe em múltiplas alegações quando tratarem de eventos/fatos completamente DIFERENTES e não-relacionados. - - RUIM - Fragmentação desnecessária: - - Alegação 1: "Houve um ataque" - - Alegação 2: "O ataque foi em janeiro" - - Alegação 3: "O ataque foi encenado" - - BOM - Consolidação: - - Alegação única: "O ataque terrorista ao shopping X em janeiro de 2024 foi encenado" - -5. **Preserve o idioma**: Mantenha o idioma original do texto. Texto em português → alegações em português. - -6. **Forneça análise**: Para cada alegação, explique brevemente por que ela é verificável e o que a torna passível de checagem. - -7. **Trate perguntas**: Se o texto pergunta "X é verdade?", extraia a alegação X. - - Texto: "É verdade que a vacina X causa infertilidade?" - - Extraia: "A vacina X causa infertilidade" - -## Formato de Saída: - -Você deve retornar um objeto JSON com um array "claims". Cada alegação deve ter: -- text: O texto normalizado e independente da alegação -- entities: Array de entidades principais mencionadas na alegação -- llm_comment: Sua breve análise do por que esta alegação é verificável - -Se nenhuma alegação verificável for encontrada, retorne um array vazio de claims. - -**LEMBRE-SE - Checklist Final:** -1. **Menos é mais**: Prefira POUCAS alegações RICAS a MUITAS alegações VAGAS -2. 
**Contexto completo**: Cada alegação deve incluir QUEM, O QUÊ, QUANDO, ONDE (quando aplicável) -3. **Autocontidas**: Alguém que NÃO leu o texto original deve conseguir entender completamente cada alegação -4. **Consolidação**: Junte informações relacionadas em UMA alegação ao invés de fragmentar -5. **Fatos, não meta-relatos**: Extraia "Y aconteceu", não "X é compartilhado como se fosse Y" - -Sempre pergunte: "Esta alegação pode ser verificada sem mais contexto? Posso consolidá-la com outra alegação relacionada?" - -Nota: NÃO inclua os campos 'id' ou 'source' - eles serão adicionados automaticamente.""" - -CLAIM_EXTRACTION_USER_PROMPT = """Extraia todas as alegações verificáveis do seguinte texto. - -====Texto para Analisar==== -{text} - -Lembre-se: -- Extraia APENAS alegações autocontidas e verificáveis que podem ser compreendidas sozinhas -- Normalize alegações substituindo pronomes e referências vagas por sujeitos específicos -- Se o texto pergunta "X é verdade?", extraia a alegação X -- Identifique entidades em cada alegação -- Forneça breve análise para cada alegação -- Retorne array vazio se nenhuma alegação autocontida for encontrada - -Retorne as alegações como um objeto JSON estruturado.""" - - -def get_claim_extraction_prompt_default() -> ChatPromptTemplate: - """ - Returns the default ChatPromptTemplate for claim extraction. - - Used for source types: original_text, link_context, other - - Expected input variables: - - text: The text content to extract claims from - - Returns: - ChatPromptTemplate configured for general claim extraction - """ - return ChatPromptTemplate.from_messages([ - ("system", CLAIM_EXTRACTION_SYSTEM_PROMPT), - ("user", CLAIM_EXTRACTION_USER_PROMPT) - ]) - - -# ===== IMAGE CLAIM EXTRACTION PROMPTS ===== - -IMAGE_CLAIM_EXTRACTION_SYSTEM_PROMPT = """Você é um especialista em extração de alegações para um sistema de checagem de fatos. - -Sua tarefa é identificar TODAS as alegações verificáveis presentes no texto fornecido. 
- -IMPORTANTE: Considere verificáveis apenas alegações sobre a realidade fora do texto ou da imagem, que possam ser checadas com dados, documentos, notícias, registros oficiais, estudos etc. - -## O que Extrair: - -**Extraia alegações que:** -- Podem ser verificadas como verdadeiras ou falsas com base em evidências. -- Fazem afirmações sobre o mundo. -- Contêm entidades nomeadas, eventos ou detalhes específicos. -- São opiniões pessoais que contém alegações ou juízo de valor sobre algum fato do mundo e que podem ser verificadas. -- São perguntas que contém alegações ou juízo de valor sobre algum fato do mundo e que podem ser verificadas. - -**Exemplos de boas alegações:** -- "A vacina X causa infertilidade em mulheres" -- "O presidente anunciou um imposto de carbono de R$50 por tonelada" -- "O estudo examinou 50.000 participantes" -- "Não há evidências ligando a vacina X a problemas de fertilidade" -- "Eu acho que vacinas causam autismo" -- "Vacinas causam autismo?" - - -## O que NÃO Extrair: - -**NÃO extraia:** -- Perguntas sem alegações implícitas ("O que você acha?") -- Cumprimentos ou conversa trivial -- Trechos dos quais não é possível extrair nenhuma afirmação sobre algo, nenhum fato ou nenhum juízo de valor: (Ex: Olá, bom dia) - -## Casos especiais: descrições de imagens, memes, charges e quadrinhos - -O texto que você receber pode ser uma descrição de uma imagem, ilustração, meme ou charge. - -Nesses casos: - -1. Ignore alegações que falam apenas sobre a composição visual da cena dentro da imagem. - - Exemplo: "A imagem mostra um trabalhador sendo esmagado por uma engrenagem gigante" -> não extrair. - -2. 
Só extraia alegações quando o texto fizer afirmações explícitas sobre a realidade fora da imagem, por exemplo: - - "A charge critica a exploração de trabalhadores por grandes empresas" - - "A ilustração representa como o governo aumenta impostos sobre a classe média" - - "O meme sugere que a mídia mente com frequência sobre economia" - - Nestes casos você pode extrair: - - "Grandes empresas exploram trabalhadores" - - "O governo aumenta impostos sobre a classe média" - - "A mídia mente com frequência sobre economia" - -3. Se o texto for apenas uma descrição visual sem nenhuma afirmação sobre a realidade, retorne um array vazio em "claims". - -## Contexto geral e mensagem da imagem - -Quando o texto for uma descrição de imagem, charge, meme ou ilustração, tente identificar se ele sugere uma mensagem mais ampla sobre o mundo, a sociedade ou algum conceito. - -Siga estes passos: - -1. Identifique o tema ou contexto geral sugerido pelo texto da descrição: - - Pode ser política, celebridades, famosos, sociedade, economia, segurança pública, tecnologia, meio ambiente, relações de trabalho, saúde, educação, direitos humanos, etc. - - Também pode ser sobre um fato específico, um grupo de pessoas, um objeto, uma instituição ou um conceito abstrato. - -2. Procure pistas na própria descrição que indiquem a intenção ou crítica: - - Palavras ou expressões como "critica", "denuncia", "sugere que", "representa", "mostra como", "faz uma metáfora sobre", "mostra a relação entre". - - Referências a grupos sociais, instituições, categorias de pessoas ou situações do mundo real. - - Nomes de pessoas famosas, celebridades, políticos - -3. A partir dessas pistas, formule uma ou mais alegações gerais sobre o mundo, mantendo o texto fiel ao que está sugerido: - - Exemplos: - - Se a descrição diz que a imagem "critica como as empresas tratam os consumidores", você pode extrair: - - "Empresas tratam consumidores de forma desrespeitosa." 
- - Se a descrição diz que a imagem "representa a vigilância constante das pessoas por empresas de tecnologia", você pode extrair: - - "Empresas de tecnologia monitoram constantemente as pessoas." - - Se a descrição diz que a imagem "mostra como grupos vulneráveis sofrem mais com crises econômicas", você pode extrair: - - "Grupos vulneráveis sofrem mais impactos em épocas de crise econômica." - -4. Não invente mensagens que não estejam sugeridas de forma razoável pelo texto: - - Não extrapole além do que o texto permite inferir de forma clara. - - Se a descrição não der nenhuma pista de mensagem social, política, econômica ou conceitual, não crie alegações a partir de suposições. - -5. Sempre que gerar uma alegação a partir da mensagem geral da imagem, escreva a alegação como uma afirmação factual sobre o mundo: - - Ela deve poder ser checada com dados, relatos, estudos, documentos, registros históricos ou outras fontes de evidência. - -## Diretrizes gerais: - -1. Normalize e esclareça: Reformule alegações para serem claras, específicas, autocontidas e independentes. - - Original: "Esse negócio da vacina é uma loucura!" - - Normalizada: "A vacina X tem efeitos colaterais perigosos" - - Original: "O estudo examinou 50.000 participantes" - - Normalizada: "O estudo de segurança da vacina X examinou 50.000 participantes" - -2. Apenas alegações autocontidas: - Extraia alegações que podem ser compreendidas completamente sozinhas. - - Bom - Autocontidas: - - "Não há evidências ligando a vacina X a problemas de fertilidade em mulheres." - - "O estudo concluiu que a vacina X é segura." - - Ruim - Precisa de contexto: - - "O estudo examinou mais de 50.000 participantes." (Qual estudo?) - - "A pesquisa foi conduzida pelo Ministério da Saúde durante 3 anos." (Qual pesquisa?) - - Corrija normalizando: - - "O estudo de segurança da vacina X examinou mais de 50.000 participantes." - - "O Ministério da Saúde conduziu pesquisa sobre a vacina X durante 3 anos." 
- - Se uma alegação usa pronomes (ele, ela, isso, aquilo) ou referências vagas (o estudo, a pesquisa), - normalize substituindo pelo sujeito real. Se você não conseguir identificar o sujeito a partir do texto, pule essa alegação. - -3. Extraia todas as alegações distintas: Um único texto pode conter múltiplas alegações. Extraia cada uma separadamente. - -4. Preserve o idioma: Mantenha o idioma original do texto. Texto em português -> alegações em português. - -5. Extraia entidades: Identifique entidades nomeadas principais (pessoas, lugares, organizações, produtos, datas, números) em cada alegação. - -6. Forneça análise: Para cada alegação, explique brevemente por que ela é verificável e o que a torna passível de checagem. - -7. Trate perguntas: Se o texto pergunta "X é verdade?", extraia a alegação X. - - Texto: "É verdade que a vacina X causa infertilidade?" - - Extraia: "A vacina X causa infertilidade" - -Se o texto mencionar, sugerir ou levantar dúvidas sobre: -- edição digital, -- manipulação, -- montagem, -- adulteração, -- artificialidade, -- incoerências visuais, -- aparência de geração por IA, -- marcas d'água de IA (SORA watermark, Gemini sparkle, Meta Imagine watermark, etc.), -- características típicas de imagens geradas por IA (dedos extras, distorções em mãos, texto ilegível, objetos fundidos), - -então você deve extrair alegações sobre a autenticidade ou origem da imagem, DESDE QUE tais alegações sejam explicitamente mencionadas ou claramente sugeridas pelo texto. - -## Exemplo genérico com descrição de imagem: - -Texto de entrada: -"Descrição da imagem: A figura mostra uma charge. Um grupo de pessoas está embaixo de uma grande bota com a palavra 'IMPOSTOS'. A legenda diz que a charge critica como os impostos pesam sobre a população." - -Saída esperada: -- Extraia apenas: - - "Os impostos pesam sobre a população." 
- -Não extraia: -- "Um grupo de pessoas está embaixo de uma grande bota" -- "Há uma bota com a palavra 'IMPOSTOS'" - -## Formato de Saída: - -Você deve retornar um objeto JSON com um array "claims". Cada alegação deve ter: -- text: O texto normalizado e independente da alegação -- entities: Array de entidades principais mencionadas na alegação -- llm_comment: Sua breve análise do por que esta alegação é verificável - -Se nenhuma alegação verificável for encontrada, retorne um array vazio de claims. - -IMPORTANTE: Extraia apenas alegações autocontidas que podem ser compreendidas sem -ler o texto ao redor. Substitua pronomes e referências vagas por sujeitos específicos. - -Nota: Não inclua os campos "id" ou "source" - eles serão adicionados automaticamente. -""" - -IMAGE_CLAIM_EXTRACTION_USER_PROMPT = """Extraia todas as alegações verificáveis do seguinte texto extraído de uma imagem. - -====Texto Extraído (transcrito) da Imagem ==== -{text} - -Lembre-se: -- Extraia APENAS alegações autocontidas e verificáveis que podem ser compreendidas sozinhas -- A alegação deve ser sobre a realidade fora do texto ou da imagem (mundo real) -- Se for uma descrição de imagem, charge, meme ou ilustração, IGNORE frases que apenas descrevem o que aparece na cena (objetos, posições cotidianos) e extraia somente afirmações sobre mundo, famosos, políticos, sociedade, fatos, grupos, instituições ou conceitos -- Normalize alegações substituindo pronomes e referências vagas por sujeitos específicos -- Se o texto perguntar "X é verdade?", extraia a alegação X -- Identifique entidades em cada alegação -- Forneça breve análise para cada alegação -- Retorne array vazio se nenhuma alegação autocontida for encontrada - -Retorne as alegações como um objeto JSON estruturado.""" - - -def get_image_claim_extraction_prompt() -> ChatPromptTemplate: - """ - Returns the ChatPromptTemplate for claim extraction from images (OCR text).
- - Expected input variables: - - text: The OCR-extracted text from the image - - Returns: - ChatPromptTemplate configured for image claim extraction - """ - return ChatPromptTemplate.from_messages([ - ("system", IMAGE_CLAIM_EXTRACTION_SYSTEM_PROMPT), - ("user", IMAGE_CLAIM_EXTRACTION_USER_PROMPT) - ]) - - -# ===== VIDEO CLAIM EXTRACTION PROMPTS ===== - -VIDEO_CLAIM_EXTRACTION_SYSTEM_PROMPT = """Você é um especialista em extração de alegações para um sistema de checagem de fatos. - -Sua tarefa é identificar TODAS as alegações verificáveis presentes no texto fornecido. - -IMPORTANTE: Considere verificáveis apenas alegações sobre a realidade fora do texto ou da imagem, que possam ser checadas com dados, documentos, notícias, registros oficiais, estudos etc. - -## O que Extrair: - -**Extraia alegações que:** -- Podem ser verificadas como verdadeiras ou falsas com base em evidências. -- Fazem afirmações sobre o mundo. -- Contêm entidades nomeadas, eventos ou detalhes específicos. -- São opiniões pessoais que contém alegações ou juízo de valor sobre algum fato do mundo e que podem ser verificadas. -- São perguntas que contém alegações ou juízo de valor sobre algum fato do mundo e que podem ser verificadas. - -**Exemplos de boas alegações:** -- "A vacina X causa infertilidade em mulheres" -- "O presidente anunciou um imposto de carbono de R$50 por tonelada" -- "O estudo examinou 50.000 participantes" -- "Não há evidências ligando a vacina X a problemas de fertilidade" -- "Eu acho que vacinas causam autismo" -- "Vacinas causam autismo?" - -## O que NÃO Extrair: - -**NÃO extraia:** -- Perguntas sem alegações implícitas ("O que você acha?") -- Cumprimentos ou conversa trivial -- Trechos dos quais não é possível extrair nenhuma afirmação sobre algo, nenhum fato ou nenhum juízo de valor: (Ex: Olá, bom dia) - -## Casos especiais: descrições de imagens, memes, charges e quadrinhos - -O texto que você receber pode ser uma descrição de uma imagem, ilustração, meme ou charge. 
- -Nesses casos: - -1. Ignore alegações que falam apenas sobre a composição visual da cena dentro da imagem. - - Exemplo: "A imagem mostra um trabalhador sendo esmagado por uma engrenagem gigante" -> não extrair. - -2. Só extraia alegações quando o texto fizer afirmações explícitas sobre a realidade fora da imagem, por exemplo: - - "A charge critica a exploração de trabalhadores por grandes empresas" - - "A ilustração representa como o governo aumenta impostos sobre a classe média" - - "O meme sugere que a mídia mente com frequência sobre economia" - - Nestes casos você pode extrair: - - "Grandes empresas exploram trabalhadores" - - "O governo aumenta impostos sobre a classe média" - - "A mídia mente com frequência sobre economia" - -3. Se o texto for apenas uma descrição visual sem nenhuma afirmação sobre a realidade, retorne um array vazio em "claims". - -## Contexto geral e mensagem da imagem - -Quando o texto for uma descrição de imagem, charge, meme ou ilustração, tente identificar se ele sugere uma mensagem mais ampla sobre o mundo, a sociedade ou algum conceito. - -Siga estes passos: - -1. Identifique o tema ou contexto geral sugerido pelo texto da descrição: - - Pode ser política, celebridades, famosos, sociedade, economia, segurança pública, tecnologia, meio ambiente, relações de trabalho, saúde, educação, direitos humanos, etc. - - Também pode ser sobre um fato específico, um grupo de pessoas, um objeto, uma instituição ou um conceito abstrato. - -2. Procure pistas na própria descrição que indiquem a intenção ou crítica: - - Palavras ou expressões como "critica", "denuncia", "sugere que", "representa", "mostra como", "faz uma metáfora sobre", "mostra a relação entre". - - Referências a grupos sociais, instituições, categorias de pessoas ou situações do mundo real. - - Nomes de pessoas famosas, celebridades, políticos - -3. 
A partir dessas pistas, formule uma ou mais alegações gerais sobre o mundo, mantendo o texto fiel ao que está sugerido: - - Exemplos: - - Se a descrição diz que a imagem "critica como as empresas tratam os consumidores", você pode extrair: - - "Empresas tratam consumidores de forma desrespeitosa." - - Se a descrição diz que a imagem "representa a vigilância constante das pessoas por empresas de tecnologia", você pode extrair: - - "Empresas de tecnologia monitoram constantemente as pessoas." - - Se a descrição diz que a imagem "mostra como grupos vulneráveis sofrem mais com crises econômicas", você pode extrair: - - "Grupos vulneráveis sofrem mais impactos em épocas de crise econômica." - -4. Não invente mensagens que não estejam sugeridas de forma razoável pelo texto: - - Não extrapole além do que o texto permite inferir de forma clara. - - Se a descrição não der nenhuma pista de mensagem social, política, econômica ou conceitual, não crie alegações a partir de suposições. - -5. Sempre que gerar uma alegação a partir da mensagem geral da imagem, escreva a alegação como uma afirmação factual sobre o mundo: - - Ela deve poder ser checada com dados, relatos, estudos, documentos, registros históricos ou outras fontes de evidência. - -## Diretrizes gerais: - -1. Normalize e esclareça: Reformule alegações para serem claras, específicas, autocontidas e independentes. - - Original: "Esse negócio da vacina é uma loucura!" - - Normalizada: "A vacina X tem efeitos colaterais perigosos" - - Original: "O estudo examinou 50.000 participantes" - - Normalizada: "O estudo de segurança da vacina X examinou 50.000 participantes" - -2. Apenas alegações autocontidas: - Extraia alegações que podem ser compreendidas completamente sozinhas. - - Bom - Autocontidas: - - "Não há evidências ligando a vacina X a problemas de fertilidade em mulheres." - - "O estudo concluiu que a vacina X é segura." - - Ruim - Precisa de contexto: - - "O estudo examinou mais de 50.000 participantes." 
(Qual estudo?) - - "A pesquisa foi conduzida pelo Ministério da Saúde durante 3 anos." (Qual pesquisa?) - - Corrija normalizando: - - "O estudo de segurança da vacina X examinou mais de 50.000 participantes." - - "O Ministério da Saúde conduziu pesquisa sobre a vacina X durante 3 anos." - - Se uma alegação usa pronomes (ele, ela, isso, aquilo) ou referências vagas (o estudo, a pesquisa), - normalize substituindo pelo sujeito real. Se você não conseguir identificar o sujeito a partir do texto, pule essa alegação. - -3. Extraia todas as alegações distintas: Um único texto pode conter múltiplas alegações. Extraia cada uma separadamente. - -4. Preserve o idioma: Mantenha o idioma original do texto. Texto em português -> alegações em português. - -5. Extraia entidades: Identifique entidades nomeadas principais (pessoas, lugares, organizações, produtos, datas, números) em cada alegação. - -6. Forneça análise: Para cada alegação, explique brevemente por que ela é verificável e o que a torna passível de checagem. - -7. Trate perguntas: Se o texto pergunta "X é verdade?", extraia a alegação X. - - Texto: "É verdade que a vacina X causa infertilidade?" - - Extraia: "A vacina X causa infertilidade" - -Se o texto mencionar, sugerir ou levantar dúvidas sobre: -- edição digital, -- manipulação, -- montagem, -- adulteração, -- artificialidade, -- incoerências visuais, -- aparência de geração por IA, -- marcas d'água de IA (SORA watermark, Gemini sparkle, Meta Imagine watermark, etc.), -- características típicas de imagens geradas por IA (dedos extras, distorções em mãos, texto ilegível, objetos fundidos), - -então você deve extrair alegações sobre a autenticidade ou origem da imagem, DESDE QUE tais alegações sejam explicitamente mencionadas ou claramente sugeridas pelo texto. - -## Exemplo genérico com descrição de imagem: - -Texto de entrada: -"Descrição da imagem: A figura mostra uma charge. Um grupo de pessoas está embaixo de uma grande bota com a palavra 'IMPOSTOS'. 
A legenda diz que a charge critica como os impostos pesam sobre a população." - -Saída esperada: -- Extraia apenas: - - "Os impostos pesam sobre a população." - -Não extraia: -- "Um grupo de pessoas está embaixo de uma grande bota" -- "Há uma bota com a palavra 'IMPOSTOS'" - -## Formato de Saída: - -Você deve retornar um objeto JSON com um array "claims". Cada alegação deve ter: -- text: O texto normalizado e independente da alegação -- entities: Array de entidades principais mencionadas na alegação -- llm_comment: Sua breve análise do por que esta alegação é verificável - -Se nenhuma alegação verificável for encontrada, retorne um array vazio de claims. - -IMPORTANTE: Extraia apenas alegações autocontidas que podem ser compreendidas sem -ler o texto ao redor. Substitua pronomes e referências vagas por sujeitos específicos. - -Nota: Não inclua os campos "id" ou "source" - eles serão adicionados automaticamente. -""" - -VIDEO_CLAIM_EXTRACTION_USER_PROMPT = """Extraia todas as alegações verificáveis do seguinte texto extraído de um vídeo. 
- -====Texto Extraído (transcrito) do Vídeo ==== -{text} - -Lembre-se: -- Extraia APENAS alegações autocontidas e verificáveis que podem ser compreendidas sozinhas -- A alegação deve ser sobre a realidade fora do texto ou da imagem (mundo real) -- Se for uma descrição de vídeo, curta, meme, IGNORE frases que apenas descrevem o que aparece na cena (objetos, posições cotidianos) e extraia somente afirmações sobre mundo, famosos, políticos, sociedade, fatos, grupos, instituições ou conceitos -- Normalize alegações substituindo pronomes e referências vagas por sujeitos específicos -- Se o texto perguntar "X é verdade?", extraia a alegação X -- Identifique entidades em cada alegação -- Forneça breve análise para cada alegação -- Retorne array vazio se nenhuma alegação autocontida for encontrada - -Retorne as alegações como um objeto JSON estruturado.""" - - - -def get_video_claim_extraction_prompt() -> ChatPromptTemplate: - """ - Returns the ChatPromptTemplate for claim extraction from video transcripts. - - Expected input variables: - - text: The transcribed text from the video - - Returns: - ChatPromptTemplate configured for video transcript claim extraction - """ - return ChatPromptTemplate.from_messages([ - ("system", VIDEO_CLAIM_EXTRACTION_SYSTEM_PROMPT), - ("user", VIDEO_CLAIM_EXTRACTION_USER_PROMPT) - ]) - -# ===== PROMPT SELECTOR ===== - -def get_claim_extraction_prompt_for_source_type( - source_type: str -) -> ChatPromptTemplate: - """ - Selects and returns the appropriate claim extraction prompt based on source type. - - This is the main entry point for getting prompts in the claim extraction pipeline. - It routes to specialized prompts for different data modalities. - - Args: - source_type: The type of data source (original_text, image, video_transcript, etc.)
- - Returns: - ChatPromptTemplate configured for the specific source type - - Source type mappings: - - "image" -> image-specific prompt (OCR text handling) - - "video_transcript" -> video-specific prompt (spoken language handling) - - "audio_transcript" -> audio-specific prompt (spoken language handling) - - "original_text" -> default prompt (written text) - - "link_context" -> default prompt (article/web content) - - "other" -> default prompt (fallback) - - Example: - >>> prompt = get_claim_extraction_prompt_for_source_type("image") - >>> # Returns image-specific ChatPromptTemplate - """ - match source_type: - case "image": - print("[IMAGE PROMPT]") - return get_image_claim_extraction_prompt() - case "video_transcript": - return get_video_claim_extraction_prompt() - case "original_text" | "link_context" | "other": - return get_claim_extraction_prompt_default() - case _: - # fallback for any unknown types added in the future - return get_claim_extraction_prompt_default() - - -def get_claim_extraction_prompt() -> ChatPromptTemplate: - """ - Returns the default ChatPromptTemplate for claim extraction. - - DEPRECATED: This function is kept for backward compatibility. - New code should use get_claim_extraction_prompt_for_source_type() instead. - - Expected input variables: - - text: The text content to extract claims from (source-agnostic) - - Returns: - ChatPromptTemplate configured for claim extraction - """ - return get_claim_extraction_prompt_default() - - -# ===== ADJUDICATION PROMPTS ===== - -ADJUDICATION_SYSTEM_PROMPT = """Você é um especialista em verificação de fatos (fact-checking) para um sistema de checagem de notícias e alegações. - -DATA ATUAL: {current_date} - -Esta é a data de hoje. Leve isso em consideração ao fazer a verificação de fatos, especialmente para eventos recentes ou alegações temporais. -No entanto, NÃO descarte fontes com datas anteriores à data atual, pois elas podem conter informações válidas e relevantes para a verificação. 
- -Sua tarefa é analisar alegações extraídas de diferentes fontes de dados e emitir um veredito fundamentado para cada uma, baseando-se estritamente nas evidências e citações fornecidas. - -Após todas as afirmações individuais terem seu veredito, você irá analisar o contexto de todas elas juntas, verificando como cada afirmação interage com a outra -a partir dessa análise geral, você irá emitir um resumo/sumário geral de todas as informações enviadas. Esse sumário irá abordar o contexto geral e irá mencionar se -as afirmações têm uma linha coerente de pensamento, ou se algumas delas estão desconexas. - -Sempre assuma que a afirmação pode estar sendo utilizada para promover a desinformação, seu julgamento deve partir desse contexto e ser rigoroso com quaisquer possíveis erros de interpretação, contexto -que falta dentro da afirmação. Entre assumir que a afirmação tem o intuito de espalhar desinformação ou outra intenção, assuma que a afirmação é desinformação e faça o julgamento a partir disso. - -Ex: Afirmação: "Vídeo é compartilhado sobre o presidente X sendo corrupto". Contexto: "Presidente não é corrupto" -Veredito: Falso -Justificativa: o contexto apresentado não apoia as notícias que se espalham sobre o presidente - -Nesse caso você não deve julgar baseado no fato das notícias se espalharem ou não, e sim no contexto de que a afirmação sobre o presidente pode ser fake ou não - -## Categorias de Veredito: - -Você deve classificar cada alegação em UMA das seguintes categorias: - -1. **Verdadeiro**: A alegação é comprovadamente verdadeira com base nas evidências apresentadas. As fontes são confiáveis e concordam que a alegação é factual. A afirmação não pode estar fora de contexto, interpretada de forma errada e faltando informações cruciais - -2. **Falso**: A alegação é comprovadamente falsa com base nas evidências apresentadas. As fontes confiáveis contradizem diretamente a alegação. - -3.
**Fora de Contexto**: A alegação contém elementos verdadeiros, mas foi apresentada de forma enganosa, omitindo contexto importante, ou misturando fatos verdadeiros com interpretações falsas. - - **IMPORTANTE - Descontextualização Temporal/Espacial**: Se uma alegação é tecnicamente verdadeira MAS está sendo apresentada em um contexto temporal ou espacial DIFERENTE do original, classifique como "Fora de Contexto". - - Exemplos comuns: - - Vídeo/foto de evento em novembro sendo compartilhado como se fosse de dezembro - - Evento da cidade A sendo apresentado como se fosse da cidade B - - Declaração de 2020 sendo compartilhada como se fosse recente - - Evento que aconteceu num contexto X apresentado como parte do contexto Y - - **Como identificar**: Se o resumo geral (overall_summary) identifica que há uma desconexão temporal/espacial entre os fatos verdadeiros e como estão sendo apresentados, classifique as alegações envolvidas como "Fora de Contexto", MESMO que os fatos individuais sejam verdadeiros. - -4. **Fontes insuficientes para verificar**: Não há evidências suficientes nas fontes fornecidas para confirmar ou refutar a alegação. As fontes são insuficientes, contraditórias demais, ou a alegação requer informação que não está disponível. - -## Diretrizes para Julgamento: - -1. **Baseie-se PRINCIPALMENTE e FORTEMENTE nas evidências fornecidas**: Use exclusivamente as citações, fontes e contexto apresentados. Não use conhecimento externo. - - **EXCEÇÃO - Alegações Atemporais**: Para alegações que não requerem verificação externa de fatos por serem "atemporais" e NÃO relacionadas a notícias, eventos, pessoas ou sociedade - como: - - Operações matemáticas (exemplo: "2+2=4", "a raiz quadrada de 16 é 4") - - Definições estabelecidas (exemplo: "um triângulo tem três lados") - - Você PODE usar seu conhecimento interno para verificar a veracidade dessas alegações, MESMO que as fontes sejam insuficientes ou inexistentes.
Nesses casos, classifique como "Verdadeiro" ou "Falso" baseando-se no seu conhecimento, e explique na justificativa que se trata de um fato atemporal verificável. - Porém atenção, uma fonte que você considere como confiável e relevante ainda deve ser a fonte da verdade principal. - -2. **Avalie a qualidade das fontes**: Considere a confiabilidade do publicador (órgãos governamentais, instituições científicas, veículos de imprensa estabelecidos vs. sites desconhecidos). - -3. **Priorize fontes especializadas em fact-checking**: Quando disponíveis, dê preferência a fontes de organizações especializadas em verificação de fatos, como agências de fact-checking (Agência Lupa, Aos Fatos, Comprova, E-farsas, Boatos.org, Fato ou Fake), APIs de verificação de fatos (Google Fact Check Tool), e organizações internacionais de fact-checking. Essas fontes fornecem verificação especializada de alegações e devem ter peso maior na sua análise. - -4. **Como referenciar fontes**: - - No resumo geral e nas justificativas de cada alegação, use APENAS números entre colchetes para referenciar fontes (exemplo: [1], [2], [3]) - - NÃO inclua URLs diretamente no texto do resumo geral ou das justificativas - - NÃO escreva links como "https://..." ou "www..." no resumo ou justificativas - - As fontes serão listadas separadamente ao final, então basta numerá-las - -5. **Seja claro e objetivo**: Explique seu raciocínio de forma concisa mas completa. O usuário precisa entender POR QUE você chegou àquela conclusão. - -6. **Identifique contexto faltante**: Se uma alegação é tecnicamente verdadeira mas apresentada de forma enganosa, classifique como "Fora de Contexto" e explique o que está faltando. - -7. **Verifique descontextualização temporal/espacial**: Quando múltiplas alegações forem verdadeiras individualmente, mas o conjunto revelar que um evento está sendo associado ao momento/local errado, classifique como "Fora de Contexto". 
Por exemplo: - - Se alegação A diz "houve caminhões parados em novembro" (verdadeiro) - - E alegação B diz "paralisação anunciada para dezembro" (verdadeiro) - - Mas o contexto geral indica que o vídeo de novembro está sendo compartilhado COMO SE fosse de dezembro - - Classifique AMBAS as alegações como "Fora de Contexto", pois estão sendo usadas para criar uma narrativa enganosa - -8. **Reconheça limitações**: Se as evidências são insuficientes ou contraditórias demais, seja honesto e classifique como "Fontes insuficientes para verificar". - -9. **Favorece Dados mais recente**: Se tivermos 2 evidências contraditórias sobre a mesma afirmação, favoreça a mais recente - -10. **Busque diversidade de fontes**: Caso tenhamos diversas fontes confiáveis, de diversos domínios, autores e orgãos. Busque citar uma gama diversa de domínios e autores na sua resposta, -também utilize essa diversidade de fontes conviáveis na sua resposta de fact-checking. - -## Formato de Resposta: - -Para cada fonte de dados (data source), você receberá: -- As informações da fonte (tipo, id, texto original, metadados) -- Uma ou mais alegações extraídas dessa fonte -- Para cada alegação, as citações e evidências coletadas (URLs, títulos, trechos, avaliações prévias) - -Você deve retornar um objeto JSON estruturado contendo: -- Para cada fonte de dados, um objeto com: - - data_source_id: o ID da fonte de dados (você verá no cabeçalho "Source: ... 
(ID: xxx)") - - claim_verdicts: lista de vereditos para alegações desta fonte -- Cada veredito contém: - - claim_id: o ID da alegação (você verá em "Afirmação ID: xxx") - - claim_text: o texto da alegação (exatamente como foi apresentado) - - verdict: uma das quatro categorias ("Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar") - - justification: sua explicação detalhada, citando as fontes -- Um sumário geral sobre o output: - - O sumário deve ser conciso, cerca de 3-4 linhas - - Não formate o sumário com caracteres * - -IMPORTANTE: -- Inclua o data_source_id e claim_id quando possível para identificar cada grupo de vereditos, mas não mencione essa fonte de dados no resumo final/justificativa -- Mantenha os resultados NA MESMA ORDEM das fontes apresentadas -- Mencione na justificativa se todas as afirmações contêm uma mesma narrativa/contexto ou se existe alguma afirmação que é um outlier. Não mencione IDs nessa parte -- Use APENAS números entre colchetes [1], [2], [3] para referenciar fontes no texto -- Casos você tenha uma gama de fontes confiáveis, busque referenciar fontes de diferentes domínios e autores. -- NÃO inclua URLs (https://...) diretamente no resumo geral ou nas justificativas -- No sumário geral seja conciso, escreva cerca de 3-4 linhas nele. Não formate o sumário com caracteres * - -## Exemplos de Justificação: - -BOM: -"Segundo o Ministério da Saúde [1], um estudo com 50.000 participantes não encontrou evidências ligando a vacina X a problemas de fertilidade. A alegação é contradita por múltiplas fontes científicas confiáveis [2][3]." - -RUIM: -"Segundo o Ministério da Saúde (https://saude.gov.br/estudo-vacinas), um estudo com 50.000 participantes..." (NÃO inclua URLs no texto) - -BOM: -"Segundo o jornal Globo [1], tal afirmação é verdadeira e foi confirmada por dados oficiais [2]." - -RUIM: -"Esta alegação é falsa." 
(Falta fundamentação e citação de fontes) - -RUIM: -"Segundo https://globo.com, a informação é verdadeira" (NÃO use URLs diretamente, use números) - -## Importante: - -- Seja rigoroso mas justo -- Assuma que a afirmação possa ser desinformaçào até que uma fonte confiável prove que o conceito principal abordado não é -- Prefira "Fontes insuficientes para verificar" a fazer suposições -- Contexto importa: "Fora de Contexto" é tão importante quanto "Falso" -- Use SEMPRE números entre colchetes [1], [2], [3] para referenciar fontes, NUNCA URLs diretamente -- Mantenha um tom profissional e imparcial -- Seja conciso no sumário, escreva cerca de 3-4 linhas de texto e não utiliza caracteres * -""" - -ADJUDICATION_USER_PROMPT = """Analise as alegações abaixo e forneça um veredito fundamentado para cada uma. - -{formatted_sources_and_claims} - -{additional_context} - -Para cada alegação, forneça: -1. O veredito (Verdadeiro, Falso, Fora de Contexto, ou Fontes insuficientes para verificar) -2. Uma justificativa detalhada citando as fontes fornecidas com números referentes à fonte. Ex: [1] -3. Caso existam diversas fontes confiáveis de domínios e orgãos diferentes, busque citar fontes diversas (no quesito domínio e/ou autor) na justificativa da sua resposta. - -Também forneça um sumário da mensagem, seja conciso e escreva cerca de 3-4 linhas de texto - -Retorne sua análise como um objeto JSON estruturado conforme especificado.""" - - -def get_adjudication_prompt() -> ChatPromptTemplate: - """ - Returns the ChatPromptTemplate for claim adjudication. 
- - Expected input variables: - - current_date: The current date in DD-MM-YYYY format (e.g., "08-12-2024") - - formatted_sources_and_claims: The formatted string with all data sources and their enriched claims - - additional_context: Optional additional context for the adjudication - - Returns: - ChatPromptTemplate configured for adjudication - """ - return ChatPromptTemplate.from_messages([ - ("system", ADJUDICATION_SYSTEM_PROMPT), - ("user", ADJUDICATION_USER_PROMPT) - ]) - - -# ===== ADJUDICATION WITH GOOGLE SEARCH PROMPTS ===== - -ADJUDICATION_WITH_SEARCH_SYSTEM_PROMPT = """Você é um especialista em verificação de fatos (fact-checking) para um sistema de checagem de notícias e alegações. - -DATA ATUAL: {current_date} - -Esta é a data de hoje. Leve isso em consideração ao fazer a verificação de fatos, especialmente para eventos recentes ou alegações temporais. - -CRÍTICO - UNICODE ENCODING: Sua resposta DEVE usar encoding Unicode válido (UTF-8) com TODOS os caracteres não-ASCII preservados corretamente. - -NUNCA use: -- Bytes nulos (\x00, \u0000) -- Sequências de escape inválidas -- Caracteres de controle inválidos (exceto \n, \r, \t) -- Substituições ASCII para caracteres acentuados - -SEMPRE preserve TODOS os caracteres especiais do português: -- Acentos agudos: á, é, í, ó, ú, Á, É, Í, Ó, Ú -- Acentos circunflexos: â, ê, ô, Â, Ê, Ô -- Til: ã, õ, Ã, Õ -- Cedilha: ç, Ç -- Crases: à, À - -Exemplos de texto CORRETO (Unicode válido): -- "eleições", "não", "após", "prisão", "São Paulo", "manifestações", "informação" -- "decisão", "reação", "população", "situação", "política", "econômica" - -A resposta JSON DEVE conter apenas caracteres Unicode válidos. Teste cada string antes de retornar para garantir que não há bytes nulos ou sequências inválidas. - -Sua tarefa é analisar alegações e verificá-las usando a **busca do Google** para encontrar evidências em tempo real. 
-Após todas as afirmações individuais terem seu veredito, você irá analizar o contexto de todas elas juntas, verificando como cada afirmação interaje com a outra -a partir dessa análise geral, você irá emitir uma resumo/sumário geral de todos as informações enviadas. Esse sumário irá abordar o contexto geral e irá mencionar se -as afirmações tem uma linha coerente de pensamento, ou se algumas delas estão desconexas. - -## Categorias de Veredito: - -Você deve classificar cada alegação em UMA das seguintes categorias: - -1. **Verdadeiro**: A alegação é comprovadamente verdadeira com base nas evidências encontradas na busca. As fontes são confiáveis e concordam que a alegação é factual. - -2. **Falso**: A alegação é comprovadamente falsa com base nas evidências encontradas na busca. As fontes confiáveis contradizem diretamente a alegação. - -3. **Fora de Contexto**: A alegação contém elementos verdadeiros, mas foi apresentada de forma enganosa, omitindo contexto importante, ou misturando fatos verdadeiros com interpretações falsas. - - **IMPORTANTE - Descontextualização Temporal/Espacial**: Se uma alegação é tecnicamente verdadeira MAS está sendo apresentada em um contexto temporal ou espacial DIFERENTE do original, classifique como "Fora de Contexto". - - Exemplos comuns: - - Vídeo/foto de evento em novembro sendo compartilhado como se fosse de dezembro - - Evento da cidade A sendo apresentado como se fosse da cidade B - - Declaração de 2020 sendo compartilhada como se fosse recente - - **Como identificar**: Se o resumo geral (overall_summary) identifica que há uma desconexão temporal/espacial entre os fatos verdadeiros e como estão sendo apresentados, classifique as alegações envolvidas como "Fora de Contexto", MESMO que os fatos individuais sejam verdadeiros. - -4. **Fontes insuficientes para verificar**: Não há evidências suficientes encontradas na busca para confirmar ou refutar a alegação. 
As fontes são insuficientes, contraditórias demais, ou a alegação requer informação que não está disponível. - -## Diretrizes para Julgamento: - -1. **Use a busca do Google para encontrar evidências**: Para cada alegação, execute buscas para encontrar fontes confiáveis que confirmem ou refutem a alegação. - -2. **Avalie a qualidade das fontes**: Considere a confiabilidade do publicador (órgãos governamentais, instituições científicas, veículos de imprensa estabelecidos vs. sites desconhecidos). - -3. **Priorize fontes especializadas em fact-checking**: Ao buscar evidências, dê preferência a fontes de organizações especializadas em verificação de fatos, como agências de fact-checking (Agência Lupa, Aos Fatos, Comprova, E-farsas, Boatos.org, Fato ou Fake), e organizações internacionais de fact-checking. Essas fontes fornecem verificação especializada de alegações e devem ter peso maior na sua análise quando disponíveis nos resultados da busca. - -4. **Seja claro e objetivo**: Explique seu raciocínio de forma concisa mas completa. O usuário precisa entender POR QUE você chegou àquela conclusão, citando as fontes encontradas. - -5. **Identifique contexto faltante**: Se uma alegação é tecnicamente verdadeira mas apresentada de forma enganosa, classifique como "Fora de Contexto" e explique o que está faltando. - -6. **Verifique descontextualização temporal/espacial**: Quando múltiplas alegações forem verdadeiras individualmente, mas o conjunto revelar que um evento está sendo associado ao momento/local errado, classifique como "Fora de Contexto". Por exemplo: - - Se alegação A diz "houve caminhões parados em novembro" (verdadeiro) - - E alegação B diz "paralisação anunciada para dezembro" (verdadeiro) - - Mas o contexto geral indica que o vídeo de novembro está sendo compartilhado COMO SE fosse de dezembro - - Classifique AMBAS as alegações como "Fora de Contexto", pois estão sendo usadas para criar uma narrativa enganosa - -7. 
**Reconheça limitações**: Se as evidências são insuficientes ou contraditórias demais, seja honesto e classifique como "Fontes insuficientes para verificar". - -8. **Favorece dados mais recentes**: Se tivermos 2 evidências contraditórias sobre a mesma afirmação, favoreça a mais recente. - -## Formato de Resposta: - -Você receberá alegações agrupadas por fonte de dados. Cada fonte tem um ID (por exemplo, "msg-mixed") e uma lista de alegações. - -Você DEVE retornar um objeto JSON com: -- Um array "results" onde cada elemento representa UMA fonte de dados -- Cada resultado deve ter: - - data_source_id: o ID da fonte fornecido no prompt (por exemplo, "msg-mixed", "msg-001", etc.) - - claim_verdicts: array com TODOS os vereditos das alegações daquela fonte -- Cada veredito deve ter: - - claim_id: o ID da alegação fornecido - - claim_text: o texto da alegação - - verdict: "Verdadeiro", "Falso", "Fora de Contexto", ou "Fontes insuficientes para verificar" - - justification: sua explicação com citações das fontes encontradas -- Todos os links/URL devem ser strings com "". sem markdown e sem caracteres especiais no meio. - Exemplo válido: "https://meusite.com.br" - Exemplo inválido: https://meusite.com.br (sem "" para a string do URL) - Exemplo inválido: "https://meusi\\nte.com.br" (caracter especial \\n no meio do link) - Exemplo inválido: "[Site](https://example.com)" (markdown no lugar de uma string de URL) -- Um campo overall_summary com um sumário geral sobre a checagem e como as afirmações se relacionam, não coloque links nesse sumário. 
- -IMPORTANTE: -- O campo "verdict" DEVE ser exatamente um destes valores: "Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar" -- Inclua todos os claim_ids e claim_texts fornecidos -- AGRUPE os vereditos por data_source_id - se 3 alegações vêm da mesma fonte, retorne 1 resultado com 3 vereditos -- Use o data_source_id exato fornecido no prompt para cada fonte -- Use APENAS números entre colchetes [1], [2], [3] para referenciar fontes no texto -- Justificativas devem citar as fontes encontradas na busca do Google de forma clara -- Não coloque links no sumário geral (overall_summary) - -## REGRAS CRÍTICAS DE FORMATAÇÃO JSON: - -**VOCÊ DEVE RETORNAR JSON VÁLIDO E BEM FORMATADO:** - -1. **Escape de caracteres especiais**: SEMPRE escape aspas duplas, barras invertidas e caracteres de controle em strings: - - Aspas duplas dentro de strings: use \\" - - Barras invertidas: use \\\\ - - Nova linha: use \\n - - Tab: use \\t - -2. **URLs em strings**: URLs devem estar entre aspas e caracteres especiais devem ser escapados: - - Correto: "url": "https://example.com/article?id=123&source=google" - - NUNCA deixe aspas sem fechar ou parênteses sem escape - -3. **Strings longas**: Mantenha strings longas em uma única linha, usando \\n para quebras de linha reais - -4. **Números em citações**: Use colchetes simples como [1], [2], [3] - NUNCA use (1) ou outros formatos - -5. **Validação**: Seu JSON DEVE: - - Ter todas as aspas fechadas corretamente - - Ter todos os colchetes e chaves balanceados - - Não ter vírgulas extras no final de arrays ou objetos - - Ser parseable por qualquer parser JSON padrão - -6. **NUNCA inclua**: - - URLs nuas fora de strings JSON - - Parênteses desbalanceados em strings - - Aspas não escapadas dentro de strings - - Texto explicativo fora do JSON - -SE VOCÊ RETORNAR JSON INVÁLIDO, O SISTEMA FALHARÁ. VALIDE SEU JSON ANTES DE RETORNAR. 
- -## Exemplos de Justificação: - -BOM: -"Segundo o Ministério da Saúde [1], um estudo com 50.000 participantes não encontrou evidências ligando a vacina X a problemas de fertilidade. A alegação é contradita por múltiplas fontes científicas confiáveis [2][3]." - -RUIM: -"Segundo o Ministério da Saúde (https://saude.gov.br/estudo-vacinas), um estudo com 50.000 participantes..." (NÃO inclua URLs no texto) -""" - - -NO_CLAIMS_FALLBACK_SYSTEM_PROMPT = """Você é um assistente especializado em fact-checking integrado a uma pipeline de verificação de fatos. - -Sua tarefa é explicar para o usuário, de forma educada e clara, por que não foi possível extrair alegações verificáveis do texto fornecido. - -## Contexto: -O texto do usuário passou por um sistema de extração de alegações, mas nenhuma alegação verificável foi encontrada. Agora você precisa explicar o motivo de forma amigável e construtiva. - -## Possíveis Razões: - -1. **Opinião Pessoal Não Verificável** - - Opiniões puramente subjetivas sem conexão com fatos do mundo - - Exemplo: "Eu gosto de azul", "Prefiro café ao chá" - -2. **Cumprimentos ou Conversa Casual** - - Saudações, agradecimentos, despedidas - - Exemplo: "Olá, bom dia!", "Obrigado pela ajuda" - -3. **Perguntas Sem Alegações Implícitas** - - Perguntas que não contêm afirmações sobre fatos - - Exemplo: "Como você está?", "O que você acha?" - -4. **Instruções ou Comandos** - - Pedidos de ação sem afirmações verificáveis - - Exemplo: "Me ajude com isso", "Explique sobre X" - -5. **Texto Muito Vago ou Ambíguo** - - Afirmações muito genéricas sem detalhes específicos - - Falta de entidades ou fatos concretos para verificar - -## Diretrizes para sua Resposta: - -1. **Seja Caloroso e Acolhedor**: Se o usuário cumprimentou, retribua a saudação com entusiasmo! -2. **Seja Educado e Empático**: Explique de forma construtiva, nunca crítica -3. **Seja Específico**: Identifique a razão pela qual não há alegações verificáveis -4. 
**Seja Útil**: Quando apropriado, dê exemplos do que você pode verificar -5. **Seja Conciso**: 3-4 frases são suficientes - -## Exemplos de Boas Respostas: - -Para "Olá, bom dia": -"Olá! Bom dia! 😊 Não identifiquei nenhuma alegação verificável em sua mensagem. Posso ajudar a verificar afirmações sobre eventos, pessoas, fatos, estatísticas ou notícias. Se tiver algo específico que gostaria de verificar, compartilhe comigo!" - -Para "Oi, tudo bem?": -"Oi! Tudo ótimo, obrigado! 😊 Vejo que você não enviou nenhuma alegação para verificar. Posso checar afirmações sobre fatos, eventos, dados ou notícias. O que você gostaria de verificar?" - -Para "Eu gosto de pizza": -"Sua mensagem expressa uma preferência pessoal, que não pode ser verificada como verdadeira ou falsa. Posso verificar alegações sobre fatos objetivos do mundo, como eventos, estatísticas, declarações de pessoas públicas ou notícias. Tem algo assim que gostaria de checar?" - -Para texto vago: -"Não consegui identificar alegações específicas e verificáveis em seu texto. Para verificar algo, é útil incluir detalhes concretos como nomes de pessoas, lugares, datas, números ou eventos específicos. Por exemplo: 'O presidente X anunciou Y', ou 'Estudos mostram que Z'. Posso ajudar com algo assim?" - -## Formato de Saída: - -Retorne apenas o texto da explicação para o usuário, de forma amigável e acolhedora. Use emojis quando apropriado para tornar a resposta mais calorosa.""" - -NO_CLAIMS_FALLBACK_USER_PROMPT = """O texto a seguir foi analisado mas não teve nenhuma alegação verificável extraída: - -====Texto do Usuário==== -{text} - -Por favor, explique ao usuário de forma educada e construtiva por que não foi possível extrair alegações verificáveis deste texto. Use 2-3 frases no máximo.""" - - -def get_no_claims_fallback_prompt() -> ChatPromptTemplate: - """ - get the ChatPromptTemplate for no claims fallback. - - this prompt template explains to users why no verifiable claims were found. 
- - returns: - ChatPromptTemplate configured for no claims fallback - """ - return ChatPromptTemplate.from_messages([ - ("system", NO_CLAIMS_FALLBACK_SYSTEM_PROMPT), - ("user", NO_CLAIMS_FALLBACK_USER_PROMPT) - ]) \ No newline at end of file diff --git a/app/ai/pipeline/steps.py b/app/ai/pipeline/steps.py deleted file mode 100644 index 9abbe18..0000000 --- a/app/ai/pipeline/steps.py +++ /dev/null @@ -1,451 +0,0 @@ -""" -Pipeline Steps Interface and Default Implementation. - -This module defines the interface for all pipeline steps and provides a default -implementation. This allows for easy testing, mocking, and customization of -individual pipeline steps without modifying the main pipeline orchestration. - -Architecture: -- PipelineSteps: Protocol defining the interface for all steps -- DefaultPipelineSteps: Default implementation using the standard step functions -- Dependency injection pattern: main_pipeline receives a PipelineSteps instance -""" - -from typing import Protocol, List -from app.models import ( - DataSource, - PipelineConfig, - ClaimExtractionInput, - ClaimExtractionOutput, - EvidenceRetrievalInput, - EvidenceRetrievalResult, - LLMConfig, - DataSourceWithExtractedClaims, - FactCheckResult, - AdjudicationInput, -) -from app.ai.context import EvidenceGatherer -from app.ai.pipeline.no_claims_fallback import NoClaimsFallbackOutput - -from app.ai.context.factcheckapi import ( - GoogleFactCheckGatherer -) - -from app.config import get_trusted_domains - - -class PipelineSteps(Protocol): - """ - Protocol defining the interface for all fact-checking pipeline steps. - - Each method corresponds to one step in the pipeline: - 1. Link expansion - expand URLs from original text - 2. Claim extraction - extract fact-checkable claims from text - 3. 
Evidence retrieval - gather supporting/refuting evidence for claims - - This protocol enables: - - Easy testing with mock implementations - - Custom implementations for specific use cases - - Clear separation of concerns - - Type-safe dependency injection - """ - - def expand_links_from_sources( - self, - sources: List[DataSource], - config: PipelineConfig - ) -> List[DataSource]: - """ - Expand links from data sources with enhanced logging. - - Used as a callback function in the fire-and-forget pipeline. - Processes data sources, identifies those with type 'original_text', - extracts URLs from them, and creates new 'link_context' data sources for each URL. - - Args: - sources: List of data sources to expand links from - config: Pipeline configuration with timeout settings - - Returns: - List of new 'link_context' data sources created from expanding links - """ - ... - - async def extract_claims_from_all_sources( - self, - data_sources: List[DataSource], - llm_config: LLMConfig - ) -> List[ClaimExtractionOutput]: - """ - Extract claims from all data sources. - - Processes each data source, extracts fact-checkable claims using an LLM, - and returns all extraction results. - - Args: - data_sources: List of data sources to extract claims from - llm_config: LLM configuration (model, temperature, timeout) - - Returns: - List of ClaimExtractionOutput, one per data source - """ - ... - - def get_evidence_gatherers(self) -> List[EvidenceGatherer]: - """ - Get the list of evidence gatherers to use for evidence retrieval. - - Returns: - List of EvidenceGatherer instances configured for this pipeline - """ - ... - - async def gather_evidence( - self, - retrieval_input: EvidenceRetrievalInput, - gatherers: List[EvidenceGatherer] | None = None, - timeout: float = 45.0 - ) -> EvidenceRetrievalResult: - """ - Gather evidence for claims from multiple sources. - - Runs each claim through evidence gatherers (web search, fact-check APIs, etc.) - and accumulates citations. 
- - Args: - retrieval_input: Input containing claims to gather evidence for - gatherers: List of evidence gatherers. If None, uses defaults. - timeout: Timeout in seconds for evidence gathering operations (default: 45.0) - - Returns: - EvidenceRetrievalResult mapping claim IDs to enriched claims with citations - """ - ... - - async def handle_no_claims_fallback( - self, - data_sources: List[DataSource], - config: PipelineConfig - ) -> NoClaimsFallbackOutput: - """ - Generate friendly explanation when no claims are found. - - Uses an LLM to explain to the user why no verifiable claims could be - extracted from their input. - - Args: - data_sources: List of data sources that had no claims extracted - config: Pipeline configuration with fallback LLM config - - Returns: - NoClaimsFallbackOutput with explanation and original text - """ - ... - - def adjudicate_claims( - self, - adjudication_input: AdjudicationInput, - llm_config: LLMConfig - ) -> FactCheckResult: - """ - Adjudicate claims using traditional evidence-based adjudication. - - Uses pre-gathered evidence (citations) to make verdicts on claims. - This is the standard adjudication method that analyzes enriched claims - with their citations. - - Args: - adjudication_input: Input with data sources and enriched claims - llm_config: LLM configuration (model, temperature, timeout) - - Returns: - FactCheckResult with verdicts for all claims - """ - ... - - def adjudicate_claims_with_search( - self, - sources_with_claims: List[DataSourceWithExtractedClaims], - model: str = "gpt-4o-mini" - ) -> FactCheckResult: - """ - Adjudicate claims using web search in a single API call. - - This is an alternative to the traditional evidence gathering + adjudication flow. - Instead of pre-gathering evidence, this uses OpenAI's web search tool - to find evidence and generate verdicts in one LLM call. 
- - Args: - sources_with_claims: List of data sources with their extracted claims - model: OpenAI model to use (default: gpt-4o-mini) - - Returns: - FactCheckResult with verdicts for all claims - """ - ... - - -class DefaultPipelineSteps: - """ - Default implementation of PipelineSteps using the standard step functions. - - This implementation delegates to the actual step functions in their respective - modules, providing a convenient way to inject the standard pipeline behavior. - - Example: - >>> from app.ai.pipeline.steps import DefaultPipelineSteps - >>> from app.ai.main_pipeline import run_fact_check_pipeline - >>> steps = DefaultPipelineSteps() - >>> result = await run_fact_check_pipeline( - ... data_sources=[...], - ... config=config, - ... steps=steps - ... ) - """ - - def get_evidence_gatherers(self) -> List[EvidenceGatherer]: - """ - Get the default list of evidence gatherers. - - Returns a standard set of evidence gatherers with reasonable timeout values. - - Returns: - List of EvidenceGatherer instances (WebSearchGatherer, GoogleFactCheckGatherer) - """ - from app.ai.context.web import WebSearchGatherer - allowed_domains = get_trusted_domains() - return [ - GoogleFactCheckGatherer(timeout=15.0), - WebSearchGatherer(max_results=5, timeout=15.0,allowed_domains=allowed_domains) - ] - - def expand_links_from_sources( - self, - sources: List[DataSource], - config: PipelineConfig - ) -> List[DataSource]: - """ - wrapper for _expand_data_sources_with_links with enhanced logging. - - expands links from sources and returns new DataSource objects with detailed logging. - used as callback in fire-and-forget pipeline. 
- """ - from app.observability.logger import get_logger, PipelineStep - - link_logger = get_logger(__name__, PipelineStep.LINK_EXPANSION) - - link_logger.info(f"expand_links_from_sources called with {len(sources)} sources") - link_logger.debug(f"source types: {[s.source_type for s in sources]}") - - # run link expansion (synchronous function using ThreadPoolManager internally) - expanded_sources = self._expand_data_sources_with_links(sources, config) - - # ensure we always return a list - if expanded_sources is None: - link_logger.warning("link expansion returned None") - return [] - - link_logger.debug(f"expanded {len(expanded_sources)} link sources") - - for i, source in enumerate(expanded_sources, 1): - url = source.metadata.get("url", "unknown") if source.metadata else "unknown" - success = source.metadata.get("success", False) if source.metadata else False - status = "✓" if success else "✗" - content_preview = ( - source.original_text[:1000] if source.original_text else "(no content)" - ) - - link_logger.debug(f"{i}. {status} {source.source_type} (id: {source.id})") - link_logger.debug(f" URL: {url}") - link_logger.debug(f" content preview: {content_preview}...") - - return expanded_sources - - def _expand_data_sources_with_links( - self, - data_sources: List[DataSource], - config: PipelineConfig - ) -> List[DataSource]: - """ - Private method: processes all data sources and expands links. - - Iterates through data sources, identifies 'original_text' types, - and expands their links to create new 'link_context' data sources. - Returns only the new link_context sources, not the original sources. 
- """ - from app.ai.pipeline.link_context_expander import expand_link_contexts - from app.observability.logger import get_logger, PipelineStep - - link_logger = get_logger(__name__, PipelineStep.LINK_EXPANSION) - - link_logger.info(f"_expand_data_sources_with_links called with {len(data_sources)} sources") - link_logger.debug( - f"source types: {[s.source_type for s in data_sources]}" - ) - - expanded_link_sources: List[DataSource] = [] - - for source in data_sources: - if source.source_type == "original_text": - text_preview = source.original_text[:100] if source.original_text else "" - link_logger.info(f"processing original_text source: {source.id}") - link_logger.debug(f"text preview: {text_preview}...") - - try: - # expand link contexts for this source - expanded_sources = expand_link_contexts(source, config) - - # handle None return - if expanded_sources is None: - link_logger.warning("link expansion returned None") - continue - - if expanded_sources: - link_logger.info( - f"created {len(expanded_sources)} new link_context data source(s)" - ) - for expanded in expanded_sources: - url = expanded.metadata.get("url", "unknown") - success = expanded.metadata.get("success", False) - status = "✓" if success else "✗" - link_logger.debug(f"{status} {url}") - - expanded_link_sources.extend(expanded_sources) - else: - link_logger.debug("no links found or expanded") - - except Exception as e: - link_logger.error( - f"link expansion failed for source {source.id}: {e}", - exc_info=True - ) - - return expanded_link_sources - - - async def extract_claims_from_all_sources( - self, - data_sources: List[DataSource], - llm_config: LLMConfig - ) -> List[ClaimExtractionOutput]: - """ - Default implementation: processes each data source and extracts claims. - - Iterates through all data sources, creates ClaimExtractionInput for each, - calls extract_claims, and returns all results. 
- """ - claim_outputs: List[ClaimExtractionOutput] = [] - - for source in data_sources: - # create input for claim extractor - extraction_input = ClaimExtractionInput(data_source=source) - - # extract claims using the single-source method - result = await self.__extract_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - claim_outputs.append(result) - - return claim_outputs - - async def __extract_claims( - self, - extraction_input: ClaimExtractionInput, - llm_config: LLMConfig - ) -> ClaimExtractionOutput: - """ - Default implementation: calls extract_claims_async from claim_extractor. - - See claim_extractor.extract_claims_async for detailed documentation. - """ - from app.ai.pipeline.claim_extractor import extract_claims_async - return await extract_claims_async(extraction_input, llm_config) - - async def gather_evidence( - self, - retrieval_input: EvidenceRetrievalInput, - gatherers: List[EvidenceGatherer] | None = None, - timeout: float = 45.0 - ) -> EvidenceRetrievalResult: - """ - Default implementation: calls gather_evidence_async from evidence_retrieval. - - Uses timeout from configuration to initialize gatherers with proper timeout values. - - See evidence_retrieval.gather_evidence_async for detailed documentation. - """ - from app.ai.pipeline.evidence_retrieval import gather_evidence_async - from app.ai.context.web import WebSearchGatherer - - # if no gatherers provided, create default gatherers with configured timeout - if gatherers is None: - gatherers = [ - GoogleFactCheckGatherer(timeout=timeout), - WebSearchGatherer(max_results=5, timeout=timeout) - ] - - return await gather_evidence_async(retrieval_input, gatherers) - - async def handle_no_claims_fallback( - self, - data_sources: List[DataSource], - config: PipelineConfig - ) -> NoClaimsFallbackOutput: - """ - Default implementation: generates explanation when no claims are found. 
- - Combines text from all data sources and uses LLM to generate a friendly - explanation for why no verifiable claims could be extracted. - - See no_claims_fallback.generate_no_claims_explanation_async for details. - """ - from app.ai.pipeline.no_claims_fallback import ( - generate_no_claims_explanation_async, - get_combined_text_from_sources - ) - - # combine text from all data sources - combined_text = get_combined_text_from_sources(data_sources) - - # generate explanation using fallback from config - return await generate_no_claims_explanation_async(combined_text, config) - - def adjudicate_claims( - self, - adjudication_input: AdjudicationInput, - llm_config: LLMConfig - ) -> FactCheckResult: - """ - Default implementation: calls adjudicate_claims from judgement.py. - - Uses the standard evidence-based adjudication with pre-gathered citations. - - See judgement.adjudicate_claims for detailed documentation. - """ - from app.ai.pipeline.judgement import adjudicate_claims - - return adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - def adjudicate_claims_with_search( - self, - sources_with_claims: List[DataSourceWithExtractedClaims], - model: str = "gpt-4o-mini" - ) -> FactCheckResult: - """ - Default implementation: calls adjudicate_claims_with_search from adjudication_with_search. - - Uses OpenAI web search to find evidence and generate verdicts in a single LLM call. - - See adjudication_with_search.adjudicate_claims_with_search for detailed documentation. 
- """ - from app.ai.pipeline.adjudication_with_search import adjudicate_claims_with_search - - return adjudicate_claims_with_search( - sources_with_claims=sources_with_claims, - model=model - ) diff --git a/app/ai/pipeline/tests/README.md b/app/ai/pipeline/tests/README.md deleted file mode 100644 index 8f17ec3..0000000 --- a/app/ai/pipeline/tests/README.md +++ /dev/null @@ -1,98 +0,0 @@ -# Claim Extractor Tests - -Integration tests for the claim extraction pipeline step that make **real calls to the LLM**. - -## Prerequisites - -1. **Set OpenAI API Key**: - ```bash - export OPENAI_API_KEY="sk-your-key-here" - ``` - -2. **Install dependencies**: - ```bash - pip install pytest - # Also ensure langchain and other dependencies are installed - ``` - -## Running Tests - -### Run all tests with output: -```bash -pytest app/ai/pipeline/tests/claim_extractor_test.py -v -s -``` - -### Run a specific test: -```bash -pytest app/ai/pipeline/tests/claim_extractor_test.py::test_basic_claim_extraction -v -s -``` - -### Run without LLM output (quieter): -```bash -pytest app/ai/pipeline/tests/claim_extractor_test.py -v -``` - -## Flags Explained - -- `-v` : Verbose output (shows test names) -- `-s` : Show stdout (you'll see the LLM responses for debugging) -- `-k ` : Run tests matching pattern (e.g., `-k portuguese`) - -## What These Tests Do - -✅ **Make real LLM calls** - Not mocked, actual OpenAI API requests -✅ **Validate structure** - Check that outputs have correct types and fields -✅ **Print results** - Show LLM responses in stdout for debugging -✅ **Don't validate content** - We don't check if LLM answers are "correct" (that's subjective) - -## Test Coverage - -1. **test_basic_claim_extraction** - Single claim with context -2. **test_multiple_claims_extraction** - Multiple claims in one message -3. **test_portuguese_message_extraction** - Language preservation -4. **test_no_context_extraction** - Claim extraction without expanded context -5. 
**test_empty_message** - Edge case: empty input -6. **test_opinion_vs_claim** - LLM should distinguish opinions from facts -7. **test_validate_claims_function** - Test the validation helper -8. **test_chain_building** - Ensure chain builds without errors -9. **test_return_type_is_list** - Verify we return List, not wrapper - -## Expected Behavior - -Each test will: -1. Create test data (user message + expanded context) -2. Call the claim extractor -3. Print the LLM's response to stdout -4. Validate the structure (types, required fields) -5. Assert structural requirements (not content accuracy) - -## Cost Warning ⚠️ - -These tests make **real API calls** to OpenAI, which costs money. Running all tests might make ~10 API calls using the `gpt-4o-mini` model (cheaper option). - -Estimated cost per full test run: **< $0.01 USD** - -## Debugging Failed Tests - -If a test fails: -1. Check the stdout output to see what the LLM returned -2. Verify `OPENAI_API_KEY` is set correctly -3. Check your OpenAI API quota/billing -4. Look at the assertion error to see which validation failed - -## Example Output - -``` -TEST: Basic Claim Extraction -================================================================================ - -✓ Extracted 1 claim(s): - - Claim 1: - ID: test-msg-001-claim-uuid-abc123 - Text: Vaccine X causes infertility in women - Entities: ['Vaccine X', 'infertility', 'women'] - Links: ['https://example.com/vaccine-article'] - LLM Comment: This is a medical claim that can be verified... - -``` diff --git a/app/ai/pipeline/tests/__init__.py b/app/ai/pipeline/tests/__init__.py deleted file mode 100644 index a01f9dc..0000000 --- a/app/ai/pipeline/tests/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -""" -Tests for the AI pipeline components. 
- -Run tests with: - pytest app/ai/pipeline/tests/ -v -s -""" diff --git a/app/ai/pipeline/tests/adjudication_with_search_test.py b/app/ai/pipeline/tests/adjudication_with_search_test.py deleted file mode 100644 index d7fe7ff..0000000 --- a/app/ai/pipeline/tests/adjudication_with_search_test.py +++ /dev/null @@ -1,322 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Tests for the adjudication with web search pipeline step. - -These tests make REAL calls to the OpenAI API with web search tool. - -IMPORTANT: Set OPENAI_API_KEY in your environment before running. - -Run with: - pytest app/ai/pipeline/tests/adjudication_with_search_test.py -v -s - -The -s flag shows stdout so you can see the LLM responses for debugging. -""" - -import pytest -from app.models import ( - ExtractedClaim, - ClaimSource, - FactCheckResult, - DataSource, - DataSourceWithExtractedClaims, -) -from app.ai.pipeline.adjudication_with_search import ( - adjudicate_claims_with_search, - adjudicate_claims_with_search_async, -) - - -# ===== HELPER FUNCTIONS ===== - -def create_source_with_claims(claims: list[ExtractedClaim], source_id: str = "test-source") -> DataSourceWithExtractedClaims: - """Helper to create DataSourceWithExtractedClaims for testing.""" - data_source = DataSource( - id=source_id, - source_type="original_text", - original_text="Test message for fact-checking", - metadata={}, - ) - - return DataSourceWithExtractedClaims( - data_source=data_source, - extracted_claims=claims - ) - - -def print_result(result: FactCheckResult, test_name: str): - """Print fact-check result for debugging.""" - print("\n" + "=" * 80) - print(f"TEST: {test_name}") - print("=" * 80) - - for ds_result in result.results: - print(f"\nData Source: {ds_result.data_source_id}") - for verdict in ds_result.claim_verdicts: - print(f"\n Claim: {verdict.claim_text}") - print(f" Verdict: {verdict.verdict}") - print(f" Justification: {verdict.justification[:200]}...") - - if result.overall_summary: - print(f"\nOverall Summary: 
{result.overall_summary}") - - print("\n" + "=" * 80) - - -def validate_result(result: FactCheckResult): - """Validate that result has correct structure.""" - print("\n[TEST DEBUG] Validating result structure...") - - assert isinstance(result, FactCheckResult), f"Expected FactCheckResult, got {type(result)}" - assert isinstance(result.results, list), f"Expected list for results, got {type(result.results)}" - assert len(result.results) > 0, f"Expected at least one result, got {len(result.results)}" - - print(f"[TEST DEBUG] Result has {len(result.results)} data source result(s)") - - for idx, ds_result in enumerate(result.results, 1): - print(f"[TEST DEBUG] Validating data source result {idx}...") - print(f" - data_source_id: {ds_result.data_source_id}") - print(f" - source_type: {ds_result.source_type}") - print(f" - Number of verdicts: {len(ds_result.claim_verdicts)}") - - assert ds_result.data_source_id, f"Result {idx} missing data_source_id" - assert ds_result.source_type, f"Result {idx} missing source_type" - assert isinstance(ds_result.claim_verdicts, list), f"Result {idx} verdicts should be list, got {type(ds_result.claim_verdicts)}" - - for v_idx, verdict in enumerate(ds_result.claim_verdicts, 1): - print(f" - Verdict {v_idx}:") - print(f" claim_id: {verdict.claim_id}") - print(f" verdict: {verdict.verdict}") - print(f" claim_text: {verdict.claim_text[:60]}...") - - assert verdict.claim_id, f"Result {idx}, Verdict {v_idx} missing claim_id" - assert verdict.claim_text, f"Result {idx}, Verdict {v_idx} missing claim_text" - - valid_verdicts = [ - "Verdadeiro", - "Falso", - "Fora de Contexto", - "Fontes insuficientes para verificar" - ] - assert verdict.verdict in valid_verdicts, \ - f"Result {idx}, Verdict {v_idx}: Invalid verdict '{verdict.verdict}'. 
Must be one of: {valid_verdicts}" - - assert verdict.justification, f"Result {idx}, Verdict {v_idx} missing justification" - - print("[TEST DEBUG] Validation passed!") - - -# ===== BASIC TESTS ===== - -def test_single_claim_true(): - """Test adjudication with search for a single true claim.""" - claim = ExtractedClaim( - id="claim-test-1", - text="A Terra orbita ao redor do Sol", - source=ClaimSource(source_type="original_text", source_id="msg-test-1"), - entities=["Terra", "Sol"] - ) - - source_with_claims = create_source_with_claims([claim], source_id="msg-test-1") - result = adjudicate_claims_with_search([source_with_claims]) - - print_result(result, "Single Claim - True") - validate_result(result) - - # Should have one verdict - print(f"\n[TEST DEBUG] Checking verdict count...") - print(f" Expected: 1 verdict") - print(f" Got: {len(result.results[0].claim_verdicts)} verdicts") - assert len(result.results[0].claim_verdicts) == 1, \ - f"Expected 1 verdict, got {len(result.results[0].claim_verdicts)}" - - verdict = result.results[0].claim_verdicts[0] - - # Should be classified as Verdadeiro (this is a well-known scientific fact) - print(f"\n[TEST DEBUG] Checking verdict value...") - print(f" Claim: {verdict.claim_text}") - print(f" Expected: Verdadeiro") - print(f" Got: {verdict.verdict}") - print(f" Justification: {verdict.justification[:150]}...") - - assert verdict.verdict == "Verdadeiro", \ - f"Expected verdict 'Verdadeiro' for well-known fact, but got '{verdict.verdict}'. 
" \ - f"Justification: {verdict.justification[:200]}" - - assert len(verdict.justification) > 50, \ - f"Justification should be detailed (>50 chars), got {len(verdict.justification)} chars" - - -def test_single_claim_false(): - """Test adjudication with search for a single false claim.""" - claim = ExtractedClaim( - id="claim-test-2", - text="A Terra é plana", - source=ClaimSource(source_type="original_text", source_id="msg-test-2"), - entities=["Terra"] - ) - - source_with_claims = create_source_with_claims([claim], source_id="msg-test-2") - result = adjudicate_claims_with_search([source_with_claims]) - - print_result(result, "Single Claim - False") - validate_result(result) - - # Should have one verdict - assert len(result.results[0].claim_verdicts) == 1 - verdict = result.results[0].claim_verdicts[0] - - # Should be classified as Falso - assert verdict.verdict == "Falso", f"Expected Falso, got {verdict.verdict}" - assert len(verdict.justification) > 50, "Justification should be detailed" - - -def test_multiple_claims(): - """Test adjudication with search for multiple claims.""" - claims = [ - ExtractedClaim( - id="claim-test-3a", - text="A água ferve a 100°C ao nível do mar", - source=ClaimSource(source_type="original_text", source_id="msg-test-3"), - entities=["água", "100°C"] - ), - ExtractedClaim( - id="claim-test-3b", - text="A velocidade da luz é aproximadamente 300.000 km/s", - source=ClaimSource(source_type="original_text", source_id="msg-test-3"), - entities=["luz", "300.000 km/s"] - ) - ] - - source_with_claims = create_source_with_claims(claims, source_id="msg-test-3") - result = adjudicate_claims_with_search([source_with_claims]) - - print_result(result, "Multiple Claims - Scientific Facts") - validate_result(result) - - # Should have verdicts for both claims - assert len(result.results[0].claim_verdicts) == 2 - - # Both should be Verdadeiro - for verdict in result.results[0].claim_verdicts: - assert verdict.verdict == "Verdadeiro", f"Scientific facts 
should be Verdadeiro, got {verdict.verdict}" - - -def test_recent_event(): - """Test adjudication with search for a recent event (uses web search).""" - claim = ExtractedClaim( - id="claim-test-4", - text="Portugal venceu a Eurocopa de 2024", - source=ClaimSource(source_type="original_text", source_id="msg-test-4"), - entities=["Portugal", "Eurocopa", "2024"] - ) - - source_with_claims = create_source_with_claims([claim], source_id="msg-test-4") - result = adjudicate_claims_with_search([source_with_claims]) - - print_result(result, "Recent Event - Euro 2024") - validate_result(result) - - # Should have one verdict - verdict = result.results[0].claim_verdicts[0] - - # This should be Falso (Spain won Euro 2024) - # But we accept any verdict as long as it's justified - assert verdict.verdict in ["Verdadeiro", "Falso", "Fora de Contexto"], "Should have a clear verdict" - assert len(verdict.justification) > 50, "Should have detailed justification with search results" - - -def test_unverifiable_claim(): - """Test adjudication with search for an unverifiable claim.""" - claim = ExtractedClaim( - id="claim-test-5", - text="Existe um mineral secreto chamado Vibranium na Antártida", - source=ClaimSource(source_type="original_text", source_id="msg-test-5"), - entities=["Vibranium", "Antártida"] - ) - - source_with_claims = create_source_with_claims([claim], source_id="msg-test-5") - result = adjudicate_claims_with_search([source_with_claims]) - - print_result(result, "Unverifiable Claim - Fictional Element") - validate_result(result) - - verdict = result.results[0].claim_verdicts[0] - - # Should likely be Falso or Fontes insuficientes - assert verdict.verdict in ["Falso", "Fontes insuficientes para verificar"], \ - f"Fictional claim should be Falso or unverifiable, got {verdict.verdict}" - - -# ===== ASYNC TESTS ===== - -@pytest.mark.asyncio -async def test_async_adjudication(): - """Test async version of adjudication with search.""" - claim = ExtractedClaim( - 
id="claim-test-6", - text="O Brasil ganhou 5 Copas do Mundo de futebol", - source=ClaimSource(source_type="original_text", source_id="msg-test-6"), - entities=["Brasil", "Copa do Mundo", "5"] - ) - - source_with_claims = create_source_with_claims([claim], source_id="msg-test-6") - result = await adjudicate_claims_with_search_async([source_with_claims]) - - print_result(result, "Async - Brazil World Cups") - validate_result(result) - - verdict = result.results[0].claim_verdicts[0] - - # Should be Verdadeiro (Brazil won in 1958, 1962, 1970, 1994, 2002) - assert verdict.verdict == "Verdadeiro", f"Expected Verdadeiro, got {verdict.verdict}" - - -# ===== INTEGRATION TESTS ===== - -def test_full_pipeline_mixed_verdicts(): - """Test full pipeline with claims that should get different verdicts.""" - claims = [ - ExtractedClaim( - id="claim-mixed-1", - text="A Lua é feita de queijo", - source=ClaimSource(source_type="original_text", source_id="msg-mixed"), - entities=["Lua", "queijo"] - ), - ExtractedClaim( - id="claim-mixed-2", - text="A capital do Brasil é Brasília", - source=ClaimSource(source_type="original_text", source_id="msg-mixed"), - entities=["Brasil", "Brasília"] - ), - ExtractedClaim( - id="claim-mixed-3", - text="Vacinas contêm microchips de rastreamento", - source=ClaimSource(source_type="original_text", source_id="msg-mixed"), - entities=["vacinas", "microchips"] - ) - ] - - source_with_claims = create_source_with_claims(claims, source_id="msg-mixed") - result = adjudicate_claims_with_search([source_with_claims]) - - print_result(result, "Mixed Verdicts - True, False, Conspiracy") - validate_result(result) - - verdicts = result.results[0].claim_verdicts - assert len(verdicts) == 3, "Should have verdicts for all 3 claims" - - # Check that we have at least some variety in verdicts - verdict_types = [v.verdict for v in verdicts] - assert "Falso" in verdict_types, "Should have at least one Falso verdict" - assert "Verdadeiro" in verdict_types, "Should have at 
least one Verdadeiro verdict" - - # Overall summary should be present - assert result.overall_summary, "Should have overall summary" - assert len(result.overall_summary) > 20, "Summary should be substantive" - - -# ===== PYTEST CONFIGURATION ===== - -if __name__ == "__main__": - """Run tests manually with: python -m app.ai.pipeline.tests.adjudication_with_search_test""" - pytest.main([__file__, "-v", "-s"]) diff --git a/app/ai/pipeline/tests/claim_extractor_test.py b/app/ai/pipeline/tests/claim_extractor_test.py deleted file mode 100644 index b8ceb6a..0000000 --- a/app/ai/pipeline/tests/claim_extractor_test.py +++ /dev/null @@ -1,522 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Tests for the claim extraction pipeline step. - -These tests make REAL calls to the LLM (OpenAI API) to validate: -- The structure of outputs -- The LangChain chain works correctly -- The prompt produces valid results -- Source tracking works properly - -IMPORTANT: Set OPENAI_API_KEY in your environment before running. - -Run with: - pytest app/ai/pipeline/tests/claim_extractor_test.py -v -s - -The -s flag shows stdout so you can see the LLM responses for debugging. 
-""" - -import pytest -from typing import List -from langchain_openai import ChatOpenAI - -from app.models import ClaimExtractionInput, ExtractedClaim, ClaimExtractionOutput, LLMConfig, DataSource -from app.ai.pipeline import ( - extract_claims, - extract_and_validate_claims, - validate_claims, -) - - -# ===== HELPER FUNCTIONS ===== - -def print_claim_results( - claims: List[ExtractedClaim], - test_name: str, - input_text: str | None = None -): - """Print claim extraction results for debugging, including input for verification.""" - print("\n" + "=" * 80) - print(f"TEST: {test_name}") - print("=" * 80) - - # Print input for verification - if input_text: - print(f"\n📥 INPUT TEXT:") - print(f" {input_text}") - - # Print output - print(f"\n📤 OUTPUT:") - print(f" Extracted {len(claims)} claim(s):\n") - - for i, claim in enumerate(claims, 1): - print(f" Claim {i}:") - print(f" ID: {claim.id}") - print(f" Text: {claim.text}") - print(f" Entities: {claim.entities}") - print(f" Source Type: {claim.source.source_type}") - print(f" Source ID: {claim.source.source_id}") - print(f" LLM Comment: {claim.llm_comment}") - print() - - -def validate_claim_structure(claim: ExtractedClaim): - """Validate that a claim has the correct structure.""" - # Required fields - assert claim.id is not None and claim.id != "", "Claim ID should not be empty" - assert claim.text is not None and claim.text != "", "Claim text should not be empty" - assert claim.source is not None, "Claim should have a source" - - # Type checks - assert isinstance(claim.id, str), "Claim ID should be a string" - assert isinstance(claim.text, str), "Claim text should be a string" - assert isinstance(claim.entities, list), "Entities should be a list" - - # Source validation - assert isinstance(claim.source.source_type, str), "Source type should be a string" - assert isinstance(claim.source.source_id, str), "Source ID should be a string" - - # Optional field type check - if claim.llm_comment is not None: - assert 
isinstance(claim.llm_comment, str), "LLM comment should be a string" - - # List element type checks - for entity in claim.entities: - assert isinstance(entity, str), "Each entity should be a string" - - -def validate_claims_list(claims: List[ExtractedClaim]): - """Validate that a list of claims has the correct structure.""" - assert isinstance(claims, list), "Result should be a list" - - for claim in claims: - validate_claim_structure(claim) - - -# ===== TESTS ===== - -def test_basic_claim_extraction_from_user_message(): - """Test basic claim extraction from a user message.""" - # Setup - text = "Ouvi dizer que a vacina X causa infertilidade em mulheres, isso é verdade?" - - data_source = DataSource( - id="msg-001", - source_type="original_text", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig( - llm=ChatOpenAI( - model="gpt-4o-mini", - temperature=0.0, - timeout=30.0 - ) - ) - - # Execute - result = extract_and_validate_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate wrapper type - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput" - - # Print for debugging - print_claim_results( - result.claims, - "Basic Claim Extraction from User Message", - input_text=text - ) - - # Validate structure - validate_claims_list(result.claims) - assert len(result.claims) > 0, "Should extract at least one claim" - - # Check source tracking - for claim in result.claims: - assert claim.source.source_type == "original_text" - assert claim.source.source_id == "msg-001" - - -def test_claim_extraction_from_link_context(): - """Test claim extraction from link/article content.""" - # Setup - simulate content extracted from a link - text = """=== Artigo: Novo Estudo sobre Segurança de Vacinas === - -Um estudo abrangente publicado hoje não encontrou evidências ligando -a Vacina X a problemas de fertilidade em mulheres. 
O estudo examinou mais de -50.000 participantes e concluiu que a vacina é segura. - -A pesquisa foi conduzida pelo Ministério da Saúde ao longo de 3 anos.""" - - data_source = DataSource( - id="link-456", - source_type="link_context", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig( - llm=ChatOpenAI( - model="gpt-4o-mini", - temperature=0.0, - timeout=30.0 - ) - ) - - # Execute - result = extract_and_validate_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate wrapper type - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput" - - # Print for debugging - print_claim_results( - result.claims, - "Claim Extraction from Link Context", - input_text=text - ) - - # Validate structure - validate_claims_list(result.claims) - assert len(result.claims) > 0, "Should extract claims from article" - - # Check source tracking - for claim in result.claims: - assert claim.source.source_type == "link_context" - assert claim.source.source_id == "link-456" - - -def test_multiple_claims_extraction(): - """Test extraction of multiple claims from one text.""" - # Setup - text = """O presidente anunciou um novo imposto sobre carbono de R$250 por tonelada. -Além disso, o governo vai investir R$500 bilhões em energia renovável na próxima década. 
-Isso torna o maior investimento climático da história.""" - - data_source = DataSource( - id="msg-002", - source_type="original_text", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0.0, timeout=30.0)) - - # Execute - result = extract_and_validate_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate wrapper type - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput" - - # Print for debugging - print_claim_results( - result.claims, - "Multiple Claims Extraction", - input_text=text - ) - - # Validate structure - validate_claims_list(result.claims) - # We expect multiple claims but don't assert specific number - # as LLM behavior may vary - - -def test_portuguese_message_extraction(): - """Test claim extraction with Portuguese text.""" - # Setup - text = "Dizem que a vacina da COVID causa problemas no coração. Isso é verdade mesmo?" 
- - data_source = DataSource( - id="msg-003", - source_type="original_text", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0.0, timeout=30.0)) - - # Execute - result = extract_and_validate_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate wrapper type - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput" - - # Print for debugging - print_claim_results( - result.claims, - "Portuguese Message Extraction", - input_text=text - ) - - # Validate structure - validate_claims_list(result.claims) - assert len(result.claims) > 0, "Should extract at least one claim from Portuguese text" - - -def test_image_ocr_extraction(): - """Test claim extraction from simulated image OCR text.""" - # Setup - simulate OCR output from an image - text = "URGENTE: Vacina X causa infertilidade. Compartilhe antes que apaguem isso!" 
- - data_source = DataSource( - id="img-789", - source_type="image", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0.0, timeout=30.0)) - - # Execute - result = extract_and_validate_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate wrapper type - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput" - - # Print for debugging - print_claim_results( - result.claims, - "Image OCR Extraction", - input_text=text - ) - - # Validate structure - validate_claims_list(result.claims) - assert len(result.claims) > 0, "Should extract claim from OCR text" - - # Check source tracking - for claim in result.claims: - assert claim.source.source_type == "image" - assert claim.source.source_id == "img-789" - - -def test_empty_text(): - """Test behavior with empty text.""" - # Setup - text = "" - - data_source = DataSource( - id="msg-004", - source_type="original_text", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0.0, timeout=30.0)) - - # Execute - result = extract_and_validate_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate wrapper type - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput" - - # Print for debugging - print_claim_results( - result.claims, - "Empty Text", - input_text=text - ) - - # Validate structure - validate_claims_list(result.claims) - # With empty text, should return empty list or handle gracefully - assert len(result.claims) == 0, "Empty text should result in no claims" - - -def test_opinion_vs_claim(): - """Test that LLM can distinguish opinions from fact-checkable claims.""" - # Setup - text = "Acho que vacinas são assustadoras e não gosto delas. O que você acha?" 
- - data_source = DataSource( - id="msg-005", - source_type="original_text", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0.0, timeout=30.0)) - - # Execute - result = extract_and_validate_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate wrapper type - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput" - - # Print for debugging - print_claim_results( - result.claims, - "Opinion vs Claim", - input_text=text - ) - - # Validate structure - validate_claims_list(result.claims) - # This is pure opinion, not a fact-checkable claim - # LLM should ideally return empty or very few claims - - -def test_validate_claims_function(): - """Test the validate_claims helper function.""" - # Setup: Create mock claims with some that should be filtered - from app.models import ExtractedClaim, ClaimSource - - claims = [ - ExtractedClaim( - id="claim-1", - text="Afirmação válida sobre vacinas", - source=ClaimSource( - source_type="original_text", - source_id="msg-1" - ), - llm_comment="Esta é válida", - entities=["vacinas"] - ), - ExtractedClaim( - id="claim-2", - text="", # Empty text - should be filtered - source=ClaimSource( - source_type="original_text", - source_id="msg-1" - ), - llm_comment="Vazia", - entities=[] - ), - ExtractedClaim( - id="claim-3", - text="Afirmação válida sobre vacinas", # Duplicate - should be filtered - source=ClaimSource( - source_type="original_text", - source_id="msg-1" - ), - llm_comment="Esta é duplicada", - entities=["vacinas"] - ), - ExtractedClaim( - id="claim-4", - text="Outra afirmação válida", - source=ClaimSource( - source_type="original_text", - source_id="msg-1" - ), - llm_comment="Também válida", - entities=["afirmação"] - ), - ] - - # Execute - validated = validate_claims(claims) - - # Print for debugging - print("\n" + "=" * 80) - print("TEST: 
Validate Claims Function") - print("=" * 80) - print(f"\nInput: {len(claims)} claims") - print(f"Output: {len(validated)} claims") - print("\nFiltered out:") - print(" - 1 empty claim") - print(" - 1 duplicate claim") - print() - - # Validate - assert len(validated) == 2, "Should filter out empty and duplicate claims" - assert validated[0].text == "Afirmação válida sobre vacinas" - assert validated[1].text == "Outra afirmação válida" - - -def test_chain_building(): - """Test that the chain can be built without errors.""" - from app.ai.pipeline import build_claim_extraction_chain - - llm_config = LLMConfig( - llm=ChatOpenAI( - model="gpt-4o-mini", - temperature=0.0, - timeout=30.0 - ) - ) - - # Build chain with default source type - chain = build_claim_extraction_chain( - llm_config=llm_config, - source_type="original_text" - ) - - # Validate - assert chain is not None, "Chain should be built successfully" - print("\n" + "=" * 80) - print("TEST: Chain Building") - print("=" * 80) - print(f"\n✓ Chain built successfully: {type(chain).__name__}") - print() - - -def test_return_type_is_wrapper(): - """Test that extract_claims returns ClaimExtractionOutput wrapper for type safety.""" - # Setup - text = "Mensagem de teste para verificação de tipo" - - data_source = DataSource( - id="msg-006", - source_type="original_text", - original_text=text - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - llm_config = LLMConfig(llm=ChatOpenAI(model="gpt-4o-mini", temperature=0.0, timeout=30.0)) - - # Execute - result = extract_claims( - extraction_input=extraction_input, - llm_config=llm_config - ) - - # Validate type - should be wrapper, not raw list - assert isinstance(result, ClaimExtractionOutput), "Result should be ClaimExtractionOutput wrapper" - assert hasattr(result, 'claims'), "Wrapper should have 'claims' attribute" - assert isinstance(result.claims, list), "The 'claims' attribute should be a list" - - print("\n" + "=" * 80) - print("TEST: Return 
Type Check") - print("=" * 80) - print(f"\n✓ Correct return type: {type(result).__name__}") - print(f"✓ Returns ClaimExtractionOutput wrapper for type safety") - print(f"✓ Wrapper contains {len(result.claims)} claim(s)") - print() - - -# ===== PYTEST CONFIGURATION ===== - -if __name__ == "__main__": - """Run tests manually with: python -m app.ai.pipeline.tests.claim_extractor_test""" - pytest.main([__file__, "-v", "-s"]) diff --git a/app/ai/pipeline/tests/evidence_retrieval_test.py b/app/ai/pipeline/tests/evidence_retrieval_test.py deleted file mode 100644 index 1de058b..0000000 --- a/app/ai/pipeline/tests/evidence_retrieval_test.py +++ /dev/null @@ -1,550 +0,0 @@ -import pytest -import os - -# TESTING ONLY: Disable SSL verification for local testing -# WARNING: Remove this before committing to production! -os.environ["DISABLE_SSL_VERIFY"] = "1" - -# configure pytest to automatically handle async tests -pytest_plugins = ('pytest_asyncio',) - -from app.ai.pipeline.evidence_retrieval import ( - WebSearchGatherer, - gather_evidence_async, - gather_and_filter_evidence, - deduplicate_citations, - filter_low_quality_citations, -) -from app.models import ( - EvidenceRetrievalInput, - ExtractedClaim, - ClaimSource, - Citation, -) - - -# ===== UNIT TESTS FOR HELPER FUNCTIONS ===== - -def test_deduplicate_citations_removes_duplicates(): - """should remove citations with duplicate URLs""" - citations = [ - Citation( - url="https://example.com/article1", - title="Article 1", - publisher="Example", - citation_text="Some text", - source="apify_web_search" - ), - Citation( - url="https://example.com/article2", - title="Article 2", - publisher="Example", - citation_text="Other text", - source="apify_web_search" - ), - Citation( - url="https://example.com/article1", # duplicate - title="Article 1 Again", - publisher="Example", - citation_text="Different text", - source="apify_web_search" - ), - ] - - result = deduplicate_citations(citations) - - assert len(result) == 2 - assert 
result[0].url == "https://example.com/article1" - assert result[1].url == "https://example.com/article2" - - -def test_deduplicate_citations_case_insensitive(): - """should treat URLs as case-insensitive when deduplicating""" - citations = [ - Citation( - url="https://Example.com/Article", - title="Article 1", - publisher="Example", - citation_text="Text", - source="apify_web_search" - ), - Citation( - url="https://example.com/article", # same URL, different case - title="Article 2", - publisher="Example", - citation_text="Text", - source="apify_web_search" - ), - ] - - result = deduplicate_citations(citations) - - assert len(result) == 1 - - -def test_deduplicate_citations_empty_list(): - """should handle empty list""" - result = deduplicate_citations([]) - assert result == [] - - -def test_filter_low_quality_citations_removes_short_text(): - """should remove citations with very short citation text""" - citations = [ - Citation( - url="https://example.com/1", - title="Good Article", - publisher="Example", - citation_text="This is a good citation with enough text content", - source="apify_web_search" - ), - Citation( - url="https://example.com/2", - title="Bad Article", - publisher="Example", - citation_text="short", # too short - source="apify_web_search" - ), - ] - - result = filter_low_quality_citations(citations, min_text_length=10) - - assert len(result) == 1 - assert result[0].url == "https://example.com/1" - - -def test_filter_low_quality_citations_removes_missing_fields(): - """should remove citations with missing critical fields""" - citations = [ - Citation( - url="https://example.com/1", - title="Good Article", - publisher="Example", - citation_text="Good content here", - source="apify_web_search" - ), - Citation( - url="", # missing URL - title="Bad Article", - publisher="Example", - citation_text="Some content", - source="apify_web_search" - ), - Citation( - url="https://example.com/3", - title="", # missing title - publisher="Example", - 
citation_text="Some content", - source="apify_web_search" - ), - ] - - result = filter_low_quality_citations(citations) - - assert len(result) == 1 - assert result[0].url == "https://example.com/1" - - -def test_filter_low_quality_citations_empty_list(): - """should handle empty list""" - result = filter_low_quality_citations([]) - assert result == [] - - -# ===== INTEGRATION TESTS FOR WEB SEARCH GATHERER ===== -# these tests make REAL network calls to the Google Custom Search API - -@pytest.mark.asyncio -@pytest.mark.timeout(45) # 45 second timeout -async def test_web_search_gatherer_real_claim(): - """should search the web for a real claim and return citations""" - import os - - # check if google search credentials are set - api_key = os.getenv("GOOGLE_SEARCH_API_KEY") - cse_cx = os.getenv("GOOGLE_CSE_CX") - print(f"\n[DEBUG] GOOGLE_SEARCH_API_KEY present: {api_key is not None}") - print(f"\n[DEBUG] GOOGLE_CSE_CX present: {cse_cx is not None}") - if not api_key or not cse_cx: - print("[DEBUG] Google Search credentials are NOT set") - - gatherer = WebSearchGatherer(max_results=3) - - claim = ExtractedClaim( - id="claim-test-001", - text="A vacina contra COVID-19 é segura para mulheres grávidas", - source=ClaimSource( - source_type="original_text", - source_id="msg-001" - ), - entities=["COVID-19", "vacina", "mulheres grávidas"] - ) - - # call gather and check the result - citations = await gatherer.gather(claim) - - # debug: check if gather returned empty and why - if len(citations) == 0: - print("[DEBUG] No citations returned - checking search result...") - # call the underlying search function directly to see the error - from app.ai.context.web import searchGoogleClaim - search_result = await searchGoogleClaim(claim.text, maxResults=3) - print(f"[DEBUG] Search result: {search_result}") - - print(f"\n{'=' * 80}") - print(f"TEST: Web Search for COVID-19 Vaccine Safety Claim") - print(f"{'=' * 80}") - print(f"Claim: {claim.text}") - print(f"Citations found: 
{len(citations)}") - - # validate structure - assert isinstance(citations, list), "Should return a list" - assert len(citations) > 0, "Should find at least one citation" - assert len(citations) <= 3, "Should respect max_results limit" - - # validate each citation - for i, citation in enumerate(citations, 1): - print(f"\n--- Citation {i} ---") - print(f"Title: {citation.title}") - print(f"URL: {citation.url}") - print(f"Publisher: {citation.publisher}") - print(f"Source: {citation.source}") - print(f"Text preview: {citation.citation_text[:150]}...") - - assert citation.url != "", "URL should not be empty" - assert citation.title != "", "Title should not be empty" - assert citation.source == "google_web_search", "Source should be google_web_search" - assert citation.rating is None, "Web search shouldn't provide ratings" - - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -@pytest.mark.timeout(45) # 45 second timeout -async def test_web_search_gatherer_english_claim(): - """should handle English language claims""" - gatherer = WebSearchGatherer(max_results=3) - - claim = ExtractedClaim( - id="claim-test-002", - text="Climate change is caused by human activities", - source=ClaimSource( - source_type="original_text", - source_id="msg-002" - ), - entities=["climate change", "human activities"] - ) - - citations = await gatherer.gather(claim) - - print(f"\n{'=' * 80}") - print(f"TEST: Web Search for Climate Change Claim (English)") - print(f"{'=' * 80}") - print(f"Claim: {claim.text}") - print(f"Citations found: {len(citations)}") - - assert len(citations) > 0, "Should find citations for English claims" - - for i, citation in enumerate(citations, 1): - print(f"\n--- Citation {i} ---") - print(f"Title: {citation.title}") - print(f"URL: {citation.url}") - print(f"Publisher: {citation.publisher}") - - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -async def test_web_search_gatherer_source_name(): - """should return correct source name""" - gatherer = 
WebSearchGatherer(max_results=5) - assert gatherer.source_name == "google_web_search" - - -# ===== INTEGRATION TESTS FOR MAIN EVIDENCE RETRIEVAL ===== - -@pytest.mark.asyncio -@pytest.mark.timeout(45) # 45 second timeout -async def test_gather_evidence_async_single_claim(): - """should gather evidence for a single claim""" - claim = ExtractedClaim( - id="claim-single-001", - text="A vitamina D ajuda a prevenir gripes e resfriados", - source=ClaimSource( - source_type="original_text", - source_id="msg-single-001" - ), - entities=["vitamina D", "gripe", "resfriado"] - ) - - retrieval_input = EvidenceRetrievalInput(claims=[claim]) - - result = await gather_evidence_async(retrieval_input) - - print(f"\n{'=' * 80}") - print(f"TEST: Evidence Gathering for Single Claim") - print(f"{'=' * 80}") - print(f"Claim: {claim.text}") - - # validate result structure - assert claim.id in result.claim_evidence_map - enriched_claim = result.claim_evidence_map[claim.id] - - print(f"Citations gathered: {len(enriched_claim.citations)}") - - # enriched claim should preserve original fields - assert enriched_claim.id == claim.id - assert enriched_claim.text == claim.text - assert enriched_claim.source == claim.source - assert enriched_claim.entities == claim.entities - - # should have citations - assert len(enriched_claim.citations) > 0 - - for i, citation in enumerate(enriched_claim.citations[:3], 1): - print(f"\n--- Citation {i} ---") - print(f"Title: {citation.title}") - print(f"URL: {citation.url}") - - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -@pytest.mark.timeout(60) # 60 second timeout for multiple claims -async def test_gather_evidence_async_multiple_claims(): - """should gather evidence for multiple claims""" - claims = [ - ExtractedClaim( - id="claim-multi-001", - text="Beber água com limão em jejum emagrece", - source=ClaimSource( - source_type="original_text", - source_id="msg-multi-001" - ), - entities=["água com limão", "jejum", "emagrecer"] - ), - ExtractedClaim( - 
id="claim-multi-002", - text="O 5G causa câncer", - source=ClaimSource( - source_type="original_text", - source_id="msg-multi-001" - ), - entities=["5G", "câncer"] - ), - ] - - retrieval_input = EvidenceRetrievalInput(claims=claims) - - result = await gather_evidence_async(retrieval_input) - - print(f"\n{'=' * 80}") - print(f"TEST: Evidence Gathering for Multiple Claims") - print(f"{'=' * 80}") - - # should have evidence for both claims - assert len(result.claim_evidence_map) == 2 - assert "claim-multi-001" in result.claim_evidence_map - assert "claim-multi-002" in result.claim_evidence_map - - for claim in claims: - enriched = result.claim_evidence_map[claim.id] - print(f"\nClaim: {enriched.text}") - print(f"Citations: {len(enriched.citations)}") - - # both claims should have citations - assert len(enriched.citations) > 0 - - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -async def test_gather_evidence_async_empty_claims(): - """should handle empty claims list""" - retrieval_input = EvidenceRetrievalInput(claims=[]) - - result = await gather_evidence_async(retrieval_input) - - assert result.claim_evidence_map == {} - - -# ===== INTEGRATION TESTS FOR CONVENIENCE FUNCTION ===== - -@pytest.mark.asyncio -@pytest.mark.timeout(45) # 45 second timeout -async def test_gather_and_filter_evidence_deduplicates(): - """should deduplicate and filter citations""" - claim = ExtractedClaim( - id="claim-filter-001", - text="Tomar café diariamente faz bem para a saúde", - source=ClaimSource( - source_type="original_text", - source_id="msg-filter-001" - ), - entities=["café", "saúde"] - ) - - retrieval_input = EvidenceRetrievalInput(claims=[claim]) - - # gather with filtering enabled - result = await gather_and_filter_evidence( - retrieval_input, - deduplicate=True, - filter_quality=True - ) - - print(f"\n{'=' * 80}") - print(f"TEST: Gather and Filter Evidence") - print(f"{'=' * 80}") - print(f"Claim: {claim.text}") - - enriched = result.claim_evidence_map[claim.id] - 
print(f"Filtered citations: {len(enriched.citations)}") - - # all citations should be high quality (no empty URLs or titles) - for citation in enriched.citations: - assert citation.url != "" - assert citation.title != "" - assert len(citation.citation_text) >= 10 - - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -@pytest.mark.timeout(45) # 45 second timeout -async def test_gather_and_filter_evidence_no_filters(): - """should work without filters""" - claim = ExtractedClaim( - id="claim-no-filter-001", - text="Exercícios físicos melhoram a saúde mental", - source=ClaimSource( - source_type="original_text", - source_id="msg-no-filter-001" - ), - entities=["exercícios físicos", "saúde mental"] - ) - - retrieval_input = EvidenceRetrievalInput(claims=[claim]) - - # gather without filters - result = await gather_and_filter_evidence( - retrieval_input, - deduplicate=False, - filter_quality=False - ) - - enriched = result.claim_evidence_map[claim.id] - - # should still have citations - assert len(enriched.citations) > 0 - - -# ===== TESTS FOR CUSTOM GATHERERS ===== - -@pytest.mark.asyncio -async def test_custom_gatherer_composition(): - """should support custom evidence gatherers""" - - # create a mock gatherer for testing - class MockGatherer: - @property - def source_name(self) -> str: - return "apify_web_search" - - async def gather(self, claim: ExtractedClaim): - # return a fixed citation for testing - return [ - Citation( - url="https://mock.com/article", - title="Mock Article", - publisher="Mock Publisher", - citation_text="This is a mock citation for testing purposes", - source="apify_web_search" - ) - ] - - claim = ExtractedClaim( - id="claim-custom-001", - text="Test claim for custom gatherer", - source=ClaimSource( - source_type="original_text", - source_id="msg-custom-001" - ) - ) - - retrieval_input = EvidenceRetrievalInput(claims=[claim]) - - # use custom gatherer - result = await gather_evidence_async( - retrieval_input, - gatherers=[MockGatherer()] - ) - 
- enriched = result.claim_evidence_map[claim.id] - - # should have exactly one citation from mock gatherer - assert len(enriched.citations) == 1 - assert enriched.citations[0].source == "apify_web_search" - assert enriched.citations[0].url == "https://mock.com/article" - - -@pytest.mark.asyncio -async def test_multiple_gatherers_composition(): - """should combine citations from multiple gatherers""" - - class MockGatherer1: - @property - def source_name(self) -> str: - return "apify_web_search" - - async def gather(self, claim: ExtractedClaim): - return [ - Citation( - url="https://mock1.com/article", - title="Mock Article 1", - publisher="Mock Publisher 1", - citation_text="Citation from source 1", - source="apify_web_search" - ) - ] - - class MockGatherer2: - @property - def source_name(self) -> str: - return "google_fact_checking_api" - - async def gather(self, claim: ExtractedClaim): - return [ - Citation( - url="https://mock2.com/article", - title="Mock Article 2", - publisher="Mock Publisher 2", - citation_text="Citation from source 2", - source="google_fact_checking_api" - ) - ] - - claim = ExtractedClaim( - id="claim-multi-gatherer-001", - text="Test claim for multiple gatherers", - source=ClaimSource( - source_type="original_text", - source_id="msg-multi-gatherer-001" - ) - ) - - retrieval_input = EvidenceRetrievalInput(claims=[claim]) - - # use both gatherers - result = await gather_evidence_async( - retrieval_input, - gatherers=[MockGatherer1(), MockGatherer2()] - ) - - enriched = result.claim_evidence_map[claim.id] - - # should have citations from both gatherers - assert len(enriched.citations) == 2 - sources = {cit.source for cit in enriched.citations} - assert "apify_web_search" in sources - assert "google_fact_checking_api" in sources diff --git a/app/ai/pipeline/tests/fixtures/mock_linkexpander.py b/app/ai/pipeline/tests/fixtures/mock_linkexpander.py deleted file mode 100644 index f0efc13..0000000 --- 
a/app/ai/pipeline/tests/fixtures/mock_linkexpander.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -Hybrid link expander for testing without Apify API calls. - -uses regex to extract URLs, mocks social media URLs (that would use Apify), -and allows real simple scraping for generic URLs. -""" - -import re -from typing import List -from uuid import uuid4 - -from app.models import DataSource, PipelineConfig -from app.ai.context.web.apify_utils import detectPlatform, PlatformType - - -# mock dictionary mapping URLs to their content -# this simulates what would be fetched from the web -MOCK_LINK_CONTENT = { - # social media URLs (would use Apify - always mocked) - "https://www.facebook.com/post/12345": { - "title": "Facebook Post About Climate", - "content": "Breaking: New climate study shows alarming trends. " - "Scientists worldwide are calling for immediate action.", - "success": True - }, - "https://www.instagram.com/p/abc123": { - "title": "Instagram Post - Vaccine Information", - "content": "Educational post about vaccine safety and efficacy. " - "Multiple peer-reviewed studies confirm safety profile.", - "success": True - }, - "https://twitter.com/user/status/987654": { - "title": "Tweet About Renewable Energy", - "content": "Solar energy costs have dropped dramatically. " - "Now cheaper than fossil fuels in most markets.", - "success": True - }, - "https://x.com/scientist/status/111222": { - "title": "X Post - Scientific Study", - "content": "New research published in Nature. " - "Groundbreaking findings on climate adaptation.", - "success": True - }, - "https://www.tiktok.com/@user/video/555666": { - "title": "TikTok Video - Fact Check", - "content": "Educational content debunking common misinformation. 
" - "Sources cited in video description.", - "success": True - }, - # generic URLs (would use simple scraping - included for backward compatibility) - "https://example.com": { - "title": "Example Domain", - "content": "This domain is for use in illustrative examples in documents.", - "success": True - }, - "https://invalid-url.fake": { - "title": None, - "content": None, - "success": False - } -} - -# default content for URLs not in the mock dictionary -DEFAULT_MOCK_CONTENT = { - "title": "Mock Page Title", - "content": "This is mock content for a URL not in the test dictionary. " - "In a real scenario, this would be fetched from the web.", - "success": True -} - - -def extract_urls_from_text(text: str) -> List[str]: - """ - extract URLs from text using regex. - - args: - text: text to extract URLs from - - returns: - list of URLs found in the text - """ - # regex pattern for URLs (http/https) - url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+' - - urls = re.findall(url_pattern, text) - return urls - - -def hybrid_expand_link_contexts( - data_source: DataSource, - _config: PipelineConfig -) -> List[DataSource]: - """ - hybrid implementation: mocks social media URLs, uses real scraping for generic URLs. - - social media URLs (Facebook, Instagram, Twitter, TikTok) would use Apify API, - so they are mocked using a dictionary. generic URLs use real simple HTTP scraping - (no Apify fallback) using scrapeGenericSimple directly. 
- - args: - data_source: data source to extract links from - _config: pipeline configuration (unused, kept for compatibility) - - returns: - list of new 'link_context' data sources (mocked + real) - """ - if not data_source.original_text: - return [] - - # extract URLs from text - urls = extract_urls_from_text(data_source.original_text) - - if not urls: - return [] - - expanded_sources: List[DataSource] = [] - - # separate URLs into social media (mock) and generic (real scraping) - social_media_urls = [] - generic_urls = [] - - for url in urls: - platform = detectPlatform(url) - if platform in [PlatformType.FACEBOOK, PlatformType.INSTAGRAM, - PlatformType.TWITTER, PlatformType.TIKTOK]: - social_media_urls.append(url) - else: - generic_urls.append(url) - - # process social media URLs with mocks - for url in social_media_urls: - # get mock content for this URL (or use default) - mock_data = MOCK_LINK_CONTENT.get(url, DEFAULT_MOCK_CONTENT) - - # create metadata - metadata = { - "url": url, - "success": mock_data["success"], - "title": mock_data["title"], - "mock": True # flag to indicate this is mock data - } - - # create link_context data source - link_source = DataSource( - id=f"link-{uuid4().hex[:8]}", - source_type="link_context", - original_text=mock_data["content"] if mock_data["success"] else None, - metadata=metadata, - parent_source_id=data_source.id - ) - - expanded_sources.append(link_source) - - # process generic URLs with simple HTTP scraping (no Apify fallback) - if generic_urls: - import asyncio - from app.ai.context.web.apify_utils import scrapeGenericSimple - - # scrape each generic URL with simple HTTP only - for url in generic_urls: - try: - # use asyncio to run the async scraping function - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - result = loop.run_until_complete(scrapeGenericSimple(url, maxChars=None)) - finally: - loop.close() - - # create DataSource from result - metadata = { - "url": url, - "title": 
result.get("title", ""), - "success": result.get("success", False), - "error": result.get("error"), - "mock": False - } - - link_source = DataSource( - id=f"link-{uuid4().hex[:8]}", - source_type="link_context", - original_text=result.get("content", ""), - metadata=metadata, - locale=data_source.locale, - timestamp=data_source.timestamp - ) - - expanded_sources.append(link_source) - - except Exception as e: - # on error, create failed DataSource - metadata = { - "url": url, - "success": False, - "error": str(e), - "mock": False - } - - link_source = DataSource( - id=f"link-{uuid4().hex[:8]}", - source_type="link_context", - original_text="", - metadata=metadata, - locale=data_source.locale, - timestamp=data_source.timestamp - ) - - expanded_sources.append(link_source) - - return expanded_sources diff --git a/app/ai/pipeline/tests/judgment_test.py b/app/ai/pipeline/tests/judgment_test.py deleted file mode 100644 index ebac0be..0000000 --- a/app/ai/pipeline/tests/judgment_test.py +++ /dev/null @@ -1,818 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Tests for the adjudication/judgment pipeline step. - -These tests make REAL calls to the LLM (Google Gemini API) to validate: -- The structure of outputs -- The LangChain chain works correctly -- The prompt produces valid results -- Verdict generation works properly - -IMPORTANT: Set GOOGLE_API_KEY in your environment before running. - -Run with: - pytest app/ai/pipeline/tests/judgment_test.py -v -s - -The -s flag shows stdout so you can see the LLM responses for debugging. 
-""" - -import pytest - -from app.models import ( - AdjudicationInput, - FactCheckResult, - DataSourceResult, - ClaimVerdict, - DataSourceWithClaims, - DataSource, - EnrichedClaim, - Citation, - ClaimSource, -) -from app.ai.pipeline import ( - adjudicate_claims, - build_adjudication_chain, -) -from app.config.gemini_models import get_gemini_default_pipeline_config - - -# ===== HELPER FUNCTIONS ===== - -def print_adjudication_input(adjudication_input: AdjudicationInput, test_name: str): - """Print the EXACT formatted input that the LLM sees.""" - from app.ai.pipeline.judgement import format_adjudication_input - - print("\n" + "=" * 80) - print(f"TEST: {test_name}") - print("=" * 80) - print("\n📥 EXACT LLM INPUT (what the model sees):\n") - - # Format exactly as the LLM will see it - formatted_sources = format_adjudication_input(adjudication_input) - print(formatted_sources) - - if adjudication_input.additional_context: - print(f"\n**Contexto Adicional**: {adjudication_input.additional_context}\n") - - print("=" * 80) - - -def print_fact_check_result(result: FactCheckResult, test_name: str): - """Print the complete fact-check result for debugging.""" - print("\n" + "=" * 80) - print(f"📤 FACT-CHECK RESULT FOR: {test_name}") - print("=" * 80 + "\n") - - if result.overall_summary: - print(f"OVERALL SUMMARY:\n{result.overall_summary}\n") - print("=" * 80 + "\n") - - for i, data_source_result in enumerate(result.results, 1): - print(f"DATA SOURCE {i}:") - print(f" ID: {data_source_result.data_source_id}") - print(f" Type: {data_source_result.source_type}") - print(f" Number of verdicts: {len(data_source_result.claim_verdicts)}\n") - - for j, verdict in enumerate(data_source_result.claim_verdicts, 1): - print(f" VERDICT {j}:") - print(f" Claim ID: {verdict.claim_id}") - print(f" Claim Text: {verdict.claim_text}") - print(f" Verdict: {verdict.verdict}") - print(f" Justification: {verdict.justification}") - print(f" Citations Used: {len(verdict.citations_used)} citation(s)") 
- for k, citation in enumerate(verdict.citations_used, 1): - print(f" [{k}] {citation.title} ({citation.url})") - print() - - print("-" * 80 + "\n") - - -def validate_claim_verdict(verdict: ClaimVerdict): - """Validate that a ClaimVerdict has the correct structure.""" - # Required fields - assert verdict.claim_id is not None and verdict.claim_id != "", "Claim ID should not be empty" - assert verdict.claim_text is not None and verdict.claim_text != "", "Claim text should not be empty" - assert verdict.verdict is not None, "Verdict should not be None" - assert verdict.justification is not None and verdict.justification != "", "Justification should not be empty" - - # Type checks - assert isinstance(verdict.claim_id, str), "Claim ID should be a string" - assert isinstance(verdict.claim_text, str), "Claim text should be a string" - assert isinstance(verdict.verdict, str), "Verdict should be a string" - assert isinstance(verdict.justification, str), "Justification should be a string" - - # Verdict should be one of the valid options - valid_verdicts = ["Verdadeiro", "Falso", "Fora de Contexto", "Fontes insuficientes para verificar"] - assert verdict.verdict in valid_verdicts, f"Verdict must be one of {valid_verdicts}, got: {verdict.verdict}" - - # Validate citations_used field - assert hasattr(verdict, 'citations_used'), "ClaimVerdict should have citations_used field" - assert isinstance(verdict.citations_used, list), "citations_used should be a list" - for citation in verdict.citations_used: - assert isinstance(citation, Citation), f"Each citation should be a Citation object, got {type(citation)}" - - -def validate_data_source_result(data_source_result: DataSourceResult): - """Validate that a DataSourceResult has the correct structure.""" - # Required fields - assert data_source_result.data_source_id is not None, "Data source ID should not be None" - assert data_source_result.source_type is not None, "Source type should not be None" - assert 
data_source_result.claim_verdicts is not None, "Claim verdicts should not be None" - - # Type checks - assert isinstance(data_source_result.data_source_id, str), "Data source ID should be a string" - assert isinstance(data_source_result.source_type, str), "Source type should be a string" - assert isinstance(data_source_result.claim_verdicts, list), "Claim verdicts should be a list" - - # Validate each verdict - for verdict in data_source_result.claim_verdicts: - validate_claim_verdict(verdict) - - -def validate_fact_check_result(result: FactCheckResult): - """Validate that a FactCheckResult has the correct structure.""" - # Type check - assert isinstance(result, FactCheckResult), "Result should be a FactCheckResult" - assert isinstance(result.results, list), "Results should be a list" - - # Validate each data source result - for data_source_result in result.results: - validate_data_source_result(data_source_result) - - # Overall summary is optional but should be string if present - if result.overall_summary is not None: - assert isinstance(result.overall_summary, str), "Overall summary should be a string" - - -# ===== TESTS ===== - -def test_basic_adjudication_single_claim(): - """Test basic adjudication with a single claim and evidence.""" - # Setup - data_source = DataSource( - id="msg-001", - source_type="original_text", - original_text="Ouvi dizer que a vacina X causa infertilidade em mulheres, isso é verdade?", - metadata={}, - locale="pt-BR" - ) - - # Create enriched claim with evidence - enriched_claim = EnrichedClaim( - id="claim-uuid-1", - text="A vacina X causa infertilidade em mulheres", - source=ClaimSource(source_type="original_text", source_id="msg-001"), - citations=[ - Citation( - url="https://saude.gov.br/estudo-vacinas", - title="Estudo de Segurança de Vacinas", - publisher="Ministério da Saúde", - citation_text="Um estudo com 50.000 participantes não encontrou evidências ligando a vacina X a problemas de fertilidade.", - rating="Falso", - 
date="2024-11-05" - ) - ] - ) - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=[enriched_claim] - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims], - additional_context="Usuário demonstra preocupação com segurança de vacinas" - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "Basic Adjudication Single Claim") - - # Execute - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "Basic Adjudication Single Claim") - - # Validate structure - validate_fact_check_result(result) - assert len(result.results) == 1, "Should have results for 1 data source" - assert len(result.results[0].claim_verdicts) == 1, "Should have 1 verdict" - - # Check that verdict was generated for the correct claim - verdict = result.results[0].claim_verdicts[0] - assert verdict.claim_id == "claim-uuid-1", "Verdict should be for the correct claim" - - -def test_adjudication_multiple_claims_same_source(): - """Test adjudication with multiple claims from the same data source.""" - # Setup - data_source = DataSource( - id="msg-002", - source_type="original_text", - original_text="O presidente anunciou um novo imposto sobre carbono de R$250 por tonelada. 
Além disso, o governo vai investir R$500 bilhões em energia renovável.", - metadata={}, - locale="pt-BR" - ) - - enriched_claims = [ - EnrichedClaim( - id="claim-uuid-2a", - text="O presidente anunciou um imposto sobre carbono de R$250 por tonelada", - source=ClaimSource(source_type="original_text", source_id="msg-002"), - citations=[ - Citation( - url="https://g1.globo.com/politica", - title="Presidente anuncia imposto sobre carbono", - publisher="G1", - citation_text="O presidente confirmou o novo imposto sobre carbono no valor de R$250 por tonelada.", - rating="Verdadeiro", - date="2024-11-10" - ) - ] - ), - EnrichedClaim( - id="claim-uuid-2b", - text="O governo vai investir R$500 bilhões em energia renovável", - source=ClaimSource(source_type="original_text", source_id="msg-002"), - citations=[] - ) - ] - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=enriched_claims - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims] - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "Multiple Claims Same Source") - - # Execute - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "Multiple Claims Same Source") - - # Validate structure - validate_fact_check_result(result) - assert len(result.results) == 1, "Should have results for 1 data source" - assert len(result.results[0].claim_verdicts) == 2, "Should have 2 verdicts" - - # Check that verdicts were generated for both claims - verdict_ids = {v.claim_id for v in result.results[0].claim_verdicts} - assert "claim-uuid-2a" in verdict_ids, "Should have verdict for first claim" - assert "claim-uuid-2b" in verdict_ids, "Should have verdict for second claim" - - -def 
test_adjudication_multiple_data_sources(): - """Test adjudication with claims from multiple data sources.""" - # Setup - First data source (original message) - data_source_1 = DataSource( - id="msg-003", - source_type="original_text", - original_text="Dizem que a vacina da COVID causa problemas no coração.", - metadata={}, - locale="pt-BR" - ) - - enriched_claim_1 = EnrichedClaim( - id="claim-uuid-3a", - text="A vacina da COVID causa problemas no coração", - source=ClaimSource(source_type="original_text", source_id="msg-003"), - citations=[ - Citation( - url="https://www.fiocruz.br/covid-vacinas", - title="Segurança das Vacinas COVID-19", - publisher="Fiocruz", - citation_text="Estudos mostram que casos de miocardite são raros e geralmente leves, com benefícios da vacinação superando riscos.", - rating="Fora de Contexto", - date="2024-10-20" - ) - ] - ) - - # Setup - Second data source (link context) - data_source_2 = DataSource( - id="link-004", - source_type="link_context", - original_text="Novo estudo revela que uso de máscaras reduziu transmissão de COVID em 70%.", - metadata={ - "title": "Eficácia de Máscaras", - "url": "https://example.com/mascaras" - }, - locale="pt-BR" - ) - - enriched_claim_2 = EnrichedClaim( - id="claim-uuid-3b", - text="O uso de máscaras reduziu a transmissão de COVID em 70%", - source=ClaimSource(source_type="link_context", source_id="link-004"), - citations=[] - ) - - sources_with_claims = [ - DataSourceWithClaims( - data_source=data_source_1, - enriched_claims=[enriched_claim_1] - ), - DataSourceWithClaims( - data_source=data_source_2, - enriched_claims=[enriched_claim_2] - ) - ] - - adjudication_input = AdjudicationInput( - sources_with_claims=sources_with_claims - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "Multiple Data Sources") - - # Execute - result = 
adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "Multiple Data Sources") - - # Validate structure - validate_fact_check_result(result) - assert len(result.results) == 2, "Should have results for 2 data sources" - assert len(result.results[0].claim_verdicts) == 1, "First source should have 1 verdict" - assert len(result.results[1].claim_verdicts) == 1, "Second source should have 1 verdict" - - # Check data source IDs match - source_ids = {r.data_source_id for r in result.results} - assert "msg-003" in source_ids, "Should have result for first data source" - assert "link-004" in source_ids, "Should have result for second data source" - - -def test_adjudication_no_evidence(): - """Test adjudication when no evidence is available.""" - # Setup - data_source = DataSource( - id="msg-005", - source_type="original_text", - original_text="Li que existe uma nova tecnologia que permite carros voarem a 500 km/h.", - metadata={}, - locale="pt-BR" - ) - - enriched_claim = EnrichedClaim( - id="claim-uuid-4", - text="Existe uma nova tecnologia que permite carros voarem a 500 km/h", - source=ClaimSource(source_type="original_text", source_id="msg-005"), - citations=[] - ) - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=[enriched_claim] - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims] - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "No Evidence Available") - - # Execute - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "No Evidence Available") - - # Validate structure - validate_fact_check_result(result) - assert 
len(result.results) == 1, "Should have results for 1 data source" - assert len(result.results[0].claim_verdicts) == 1, "Should have 1 verdict" - - # With no evidence, verdict should be "Fontes insuficientes para verificar" - # Note: We don't assert this because LLM behavior may vary, but it's expected - - -def test_adjudication_with_contradictory_sources(): - """Test adjudication when evidence sources contradict each other.""" - # Setup - data_source = DataSource( - id="msg-006", - source_type="original_text", - original_text="O café aumenta o risco de doenças cardíacas.", - metadata={}, - locale="pt-BR" - ) - - enriched_claim = EnrichedClaim( - id="claim-uuid-5", - text="O café aumenta o risco de doenças cardíacas", - source=ClaimSource(source_type="original_text", source_id="msg-006"), - citations=[ - Citation( - url="https://example.com/estudo1", - title="Café e Saúde Cardíaca - Estudo A", - publisher="Instituto de Pesquisa A", - citation_text="Consumo moderado de café não está associado a aumento de risco cardíaco.", - rating=None, - date="2024-09-15" - ), - Citation( - url="https://example.com/estudo2", - title="Riscos do Café - Estudo B", - publisher="Instituto de Pesquisa B", - citation_text="Consumo excessivo de café pode aumentar pressão arterial temporariamente.", - rating=None, - date="2024-10-01" - ) - ] - ) - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=[enriched_claim] - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims] - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "Contradictory Sources") - - # Execute - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "Contradictory Sources") - - # 
Validate structure - validate_fact_check_result(result) - assert len(result.results) == 1, "Should have results for 1 data source" - assert len(result.results[0].claim_verdicts) == 1, "Should have 1 verdict" - - -def test_chain_building(): - """Test that the adjudication chain can be built without errors.""" - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Build chain - chain = build_adjudication_chain(llm_config=llm_config) - - # Validate - assert chain is not None, "Chain should be built successfully" - print("\n" + "=" * 80) - print("TEST: Chain Building") - print("=" * 80) - print(f"\n✓ Adjudication chain built successfully: {type(chain).__name__}") - print() - - -def test_return_type_is_fact_check_result(): - """Test that adjudicate_claims returns FactCheckResult wrapper for type safety.""" - # Setup - data_source = DataSource( - id="msg-007", - source_type="original_text", - original_text="Mensagem de teste para verificação de tipo", - metadata={}, - locale="pt-BR" - ) - - enriched_claim = EnrichedClaim( - id="claim-uuid-6", - text="Esta é uma afirmação de teste", - source=ClaimSource(source_type="original_text", source_id="msg-007"), - citations=[] - ) - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=[enriched_claim] - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims] - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Execute - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Validate type - should be FactCheckResult wrapper - assert isinstance(result, FactCheckResult), "Result should be FactCheckResult wrapper" - assert hasattr(result, 'results'), "Wrapper should have 'results' attribute" - assert isinstance(result.results, list), "The 
'results' attribute should be a list" - - print("\n" + "=" * 80) - print("TEST: Return Type Check") - print("=" * 80) - print(f"\n✓ Correct return type: {type(result).__name__}") - print(f"✓ Returns FactCheckResult wrapper for type safety") - print(f"✓ Wrapper contains {len(result.results)} data source result(s)") - print() - - -def test_citations_used_field_in_verdict(): - """ - Test that the LLM returns the citations_used field in verdicts. - - This test verifies that: - 1. The _LLMClaimVerdict model includes a citations_used field - 2. The LLM actually populates this field with the citations it used - 3. The citations are properly formatted Citation objects - """ - # Setup - data_source = DataSource( - id="msg-008", - source_type="original_text", - original_text="A Terra é plana e não gira ao redor do Sol.", - metadata={}, - locale="pt-BR" - ) - - enriched_claim = EnrichedClaim( - id="claim-uuid-7", - text="A Terra é plana", - source={ - "source_type": "original_text", - "source_id": "msg-008" - }, - citations=[ - Citation( - url="https://www.nasa.gov/earth-round", - title="A Terra é Redonda - NASA", - publisher="NASA", - citation_text="Evidências científicas e fotografias do espaço confirmam que a Terra é redonda.", - rating="Falso", - date="2024-01-15" - ), - Citation( - url="https://www.iag.usp.br/astronomia/terra-formato", - title="O Formato da Terra", - publisher="IAG-USP", - citation_text="Observações astronômicas e medições geodésicas demonstram que a Terra é um esferoide.", - date="2024-03-20" - ) - ], - entities=["Terra"] - ) - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=[enriched_claim] - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims] - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "Citations Used 
Field Test") - - # Execute - this will invoke the LLM with the new schema - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "Citations Used Field Test") - - # Validate basic structure - validate_fact_check_result(result) - assert len(result.results) == 1, "Should have results for 1 data source" - assert len(result.results[0].claim_verdicts) == 1, "Should have 1 verdict" - - verdict = result.results[0].claim_verdicts[0] - - # Assert that citations_used field is present and properly populated - assert hasattr(verdict, 'citations_used'), "Verdict should have citations_used field" - assert isinstance(verdict.citations_used, list), "citations_used should be a list" - - # The LLM should return at least some citations (though it may choose to use all or subset) - # We provided 2 citations, so we expect the LLM to use at least one - print(f"\n📊 LLM used {len(verdict.citations_used)} out of 2 available citations") - - # Validate each citation in citations_used - for i, citation in enumerate(verdict.citations_used, 1): - assert isinstance(citation, Citation), f"Citation {i} should be a Citation object" - assert citation.url, f"Citation {i} should have a URL" - assert citation.title, f"Citation {i} should have a title" - assert citation.citation_text, f"Citation {i} should have citation_text" - print(f" ✓ Citation {i}: {citation.title}") - - print("\n" + "=" * 80) - print("TEST: Citations Used Field") - print("=" * 80) - print(f"\n✓ Verdict generated successfully with citations_used field") - print(f" Verdict: {verdict.verdict}") - print(f" Justification length: {len(verdict.justification)} chars") - print(f" Citations used by LLM: {len(verdict.citations_used)}") - print("\n✅ SUCCESS: citations_used field is properly propagated from LLM output to ClaimVerdict!") - print() - - -def test_insufficient_sources_no_citations(): - """ - Test that verdict is 'Fontes 
insuficientes para verificar' when no citations exist. - - This tests the case where evidence gathering found absolutely nothing - - no fact-check results, no web search results, nothing. - """ - # Setup - data_source = DataSource( - id="msg-insufficient-1", - source_type="original_text", - original_text="Dizem que existe um novo mineral chamado Unobtanium que pode curar todas as doenças.", - metadata={}, - locale="pt-BR" - ) - - # Claim with absolutely no citations - enriched_claim = EnrichedClaim( - id="claim-insufficient-1", - text="Existe um novo mineral chamado Unobtanium que pode curar todas as doenças", - source=ClaimSource(source_type="original_text", source_id="msg-insufficient-1"), - citations=[] # No evidence at all - ) - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=[enriched_claim] - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims] - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "No Citations Available") - - # Execute - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "No Citations Available") - - # Validate structure - validate_fact_check_result(result) - assert len(result.results) == 1, "Should have results for 1 data source" - assert len(result.results[0].claim_verdicts) == 1, "Should have 1 verdict" - - # Assert the verdict is "Fontes insuficientes para verificar" - verdict = result.results[0].claim_verdicts[0] - assert verdict.verdict == "Fontes insuficientes para verificar", ( - f"Expected verdict 'Fontes insuficientes para verificar' when no citations exist, " - f"got '{verdict.verdict}'" - ) - - print("\n" + "=" * 80) - print("TEST: Insufficient Sources - No Citations") - print("=" 
* 80) - print(f"✓ Verdict correctly set to: {verdict.verdict}") - print(f"✓ Justification: {verdict.justification[:100]}...") - print() - - -def test_insufficient_sources_unverifiable_claim(): - """ - Test that verdict is 'Fontes insuficientes para verificar' for highly specific/unverifiable claims. - - This tests claims that are too specific, obscure, or recent to have reliable evidence, - even if some weak sources exist. - """ - # Setup - data_source = DataSource( - id="msg-insufficient-2", - source_type="original_text", - original_text="Um estudo secreto realizado em laboratório privado provou que comer 47 gramas de chocolate por dia aumenta QI em 15 pontos.", - metadata={}, - locale="pt-BR" - ) - - # Claim with weak/unreliable citations - enriched_claim = EnrichedClaim( - id="claim-insufficient-2", - text="Um estudo secreto provou que comer 47 gramas de chocolate por dia aumenta QI em 15 pontos", - source=ClaimSource(source_type="original_text", source_id="msg-insufficient-2"), - citations=[ - # Only vague or unreliable sources - Citation( - url="https://example.com/blog-post", - title="10 Fatos Surpreendentes Sobre Chocolate", - publisher="Blog Pessoal", - citation_text="Alguns especialistas acreditam que chocolate pode ter benefícios cognitivos.", - source=None, - rating=None, - date=None - ) - ] - ) - - source_with_claims = DataSourceWithClaims( - data_source=data_source, - enriched_claims=[enriched_claim] - ) - - adjudication_input = AdjudicationInput( - sources_with_claims=[source_with_claims] - ) - - # Get Gemini config - pipeline_config = get_gemini_default_pipeline_config() - llm_config = pipeline_config.adjudication_llm_config - - # Print input for debugging - print_adjudication_input(adjudication_input, "Weak/Unreliable Citations") - - # Execute - result = adjudicate_claims( - adjudication_input=adjudication_input, - llm_config=llm_config - ) - - # Print output for debugging - print_fact_check_result(result, "Weak/Unreliable Citations") - - # Validate 
structure - validate_fact_check_result(result) - assert len(result.results) == 1, "Should have results for 1 data source" - assert len(result.results[0].claim_verdicts) == 1, "Should have 1 verdict" - - # Assert the verdict is "Fontes insuficientes para verificar" - verdict = result.results[0].claim_verdicts[0] - assert verdict.verdict == "Fontes insuficientes para verificar", ( - f"Expected verdict 'Fontes insuficientes para verificar' for unverifiable claim, " - f"got '{verdict.verdict}'" - ) - - print("\n" + "=" * 80) - print("TEST: Insufficient Sources - Unverifiable Claim") - print("=" * 80) - print(f"✓ Verdict correctly set to: {verdict.verdict}") - print(f"✓ Justification: {verdict.justification[:100]}...") - print() - - -# ===== PYTEST CONFIGURATION ===== - -if __name__ == "__main__": - """Run tests manually with: python -m app.ai.pipeline.tests.judgment_test""" - pytest.main([__file__, "-v", "-s"]) - diff --git a/app/ai/pipeline/tests/link_context_expander_test.py b/app/ai/pipeline/tests/link_context_expander_test.py deleted file mode 100644 index 30f37da..0000000 --- a/app/ai/pipeline/tests/link_context_expander_test.py +++ /dev/null @@ -1,468 +0,0 @@ -import pytest - -# Configure pytest to automatically handle async tests -pytest_plugins = ('pytest_asyncio',) - -from app.ai.pipeline.link_context_expander import ( - extract_links, - expand_link_contexts, -) -from app.models import DataSource -from app.config import get_default_pipeline_config -from app.ai.threads.thread_utils import ThreadPoolManager - - -_cfg = get_default_pipeline_config() - - -@pytest.fixture(scope="session", autouse=True) -def initialize_thread_pool(): - """initialize ThreadPoolManager once for all tests""" - manager = ThreadPoolManager.get_instance() - manager.initialize() - yield - # cleanup after all tests - manager.shutdown() - -# ===== UNIT TESTS FOR extract_links ===== - -def test_extract_single_https_url(): - """should extract a single https URL from text""" - text = "Check out 
this article at https://example.com for more info." - result = extract_links(text) - assert result == ["https://example.com"] - - -def test_extract_single_http_url(): - """should extract a single http URL from text""" - text = "Visit http://test.org to learn more." - result = extract_links(text) - assert result == ["http://test.org"] - - -def test_extract_multiple_urls(): - """should extract multiple URLs from text""" - text = "Check https://example.com and http://test.org for details." - result = extract_links(text) - assert result == ["https://example.com", "http://test.org"] - - -def test_extract_urls_with_paths(): - """should extract URLs with paths and query parameters""" - text = "See https://example.com/article/123?ref=social and http://test.org/page" - result = extract_links(text) - assert result == ["https://example.com/article/123?ref=social", "http://test.org/page"] - - -def test_remove_duplicate_urls(): - """should remove duplicate URLs while preserving order""" - text = "Visit https://example.com and also https://example.com again." - result = extract_links(text) - assert result == ["https://example.com"] - assert len(result) == 1 - - -def test_empty_text(): - """should return empty list for empty text""" - result = extract_links("") - assert result == [] - - -def test_text_without_urls(): - """should return empty list when no URLs are present""" - text = "This is just plain text with no links at all." 
- result = extract_links(text) - assert result == [] - - -def test_url_with_special_characters(): - """should handle URLs with hyphens, underscores and other valid chars""" - text = "Check https://my-site.example.com/path_to/resource-123" - result = extract_links(text) - assert result == ["https://my-site.example.com/path_to/resource-123"] - - -def test_multiple_urls_preserves_order(): - """should preserve the order of URLs as they appear in text""" - text = "First https://first.com then https://second.com and https://third.com" - result = extract_links(text) - assert result == ["https://first.com", "https://second.com", "https://third.com"] - - -def test_url_at_end_of_sentence(): - """should extract URL that ends with punctuation""" - text = "Visit our website at https://example.com." - result = extract_links(text) - # the dot should not be part of the URL - assert result == ["https://example.com"] - - -def test_url_in_parentheses(): - """should extract URL surrounded by parentheses""" - text = "See the docs (https://docs.example.com) for details." - result = extract_links(text) - assert result == ["https://docs.example.com"] - - -def test_multiple_protocols_mixed(): - """should handle mix of http and https URLs""" - text = "http://old.example.com and https://secure.example.com" - result = extract_links(text) - assert result == ["http://old.example.com", "https://secure.example.com"] - - -def test_url_with_port(): - """should extract URLs with port numbers""" - text = "Connect to https://localhost:8080/api for testing." - result = extract_links(text) - assert result == ["https://localhost:8080/api"] - - -def test_url_with_fragment(): - """should extract URLs with fragments/anchors""" - text = "Jump to https://example.com/page#section-2 directly." 
- result = extract_links(text) - assert result == ["https://example.com/page#section-2"] - - -def test_multiline_text_with_urls(): - """should extract URLs from multiline text""" - text = """First line with https://example.com - Second line with http://test.org - Third line with https://another.com""" - result = extract_links(text) - assert result == ["https://example.com", "http://test.org", "https://another.com"] - - -def test_urls_without_protocol_not_extracted(): - """should not extract URLs without http/https protocol""" - text = "Visit www.example.com or example.com for info." - result = extract_links(text) - assert result == [] - - -def test_real_world_whatsapp_message(): - """should handle typical WhatsApp message with URLs""" - text = "Olha essa notícia importante: https://g1.globo.com/economia/noticia.html compartilha aí!" - result = extract_links(text) - assert result == ["https://g1.globo.com/economia/noticia.html"] - - -# ===== INTEGRATION TESTS FOR WEB SCRAPING ===== -# expand_link_context tests moved to app/agentic_ai/tests/nodes/test_link_expander.py - -@pytest.mark.asyncio -async def test_expand_link_contexts_with_multiple_real_urls(): - """should extract and expand multiple real URLs from DataSource""" - # create a DataSource with text containing multiple URLs - text = """ - Veja essas notícias importantes: - - 1. Arsenal encontrado em SP: https://g1.globo.com/sp/sao-paulo/noticia/2025/11/16/policia-encontra-arsenal-de-guerra-na-zona-sul-de-sp.ghtml - - 2. Cúpula dos Povos em Belém: https://www.cnnbrasil.com.br/nacional/em-belem-cupula-dos-povos-cobra-participacao-popular-nas-acoes-climaticas/ - - 3. 
Arte moderna na BBC: https://www.bbc.com/culture/article/20251112-why-this-1768-painting-could-be-the-real-birth-of-modern-art - """ - - data_source = DataSource( - id="msg-test-001", - source_type="original_text", - original_text=text, - locale="pt-BR" - ) - - # expand all links - expanded_sources = await expand_link_contexts(data_source,_cfg) - - # validate results - assert len(expanded_sources) == 3, f"Expected 3 expanded sources, got {len(expanded_sources)}" - - # validate each expanded source - for i, source in enumerate(expanded_sources, 1): - print(f"\n{'=' * 80}") - print(f"EXPANDED SOURCE {i}") - print(f"{'=' * 80}") - - assert source.source_type == "link_context" - assert source.metadata["parent_source_id"] == "msg-test-001" - assert "url" in source.metadata - assert source.metadata["success"] is True, f"Source {i} scraping failed" - assert source.original_text != "", f"Source {i} content is empty" - assert source.metadata["content_length"] > 0 - - print(f"ID: {source.id}") - print(f"URL: {source.metadata['url']}") - print(f"Success: {source.metadata['success']}") - print(f"Content length: {source.metadata['content_length']} chars") - print(f"Content preview (first 150 chars):\n{source.original_text[:150]}...") - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -async def test_expand_link_contexts_no_links(): - """should return empty list when DataSource has no links""" - data_source = DataSource( - id="msg-no-links", - source_type="original_text", - original_text="This is just plain text with no URLs at all." - ) - - expanded_sources = await expand_link_contexts(data_source,_cfg) - - assert expanded_sources == [] - assert len(expanded_sources) == 0 - - -@pytest.mark.asyncio -async def test_expand_link_contexts_validates_source_type(): - """should raise ValueError if DataSource is not original_text type""" - data_source = DataSource( - id="link-001", - source_type="link_context", # wrong type! 
- original_text="Some text with https://example.com" - ) - - with pytest.raises(ValueError) as exc_info: - await expand_link_contexts(data_source,_cfg) - - assert "original_text" in str(exc_info.value) - assert "link_context" in str(exc_info.value) - - -@pytest.mark.asyncio -async def test_expand_link_context_preserves_locale_and_timestamp(): - """should preserve locale and timestamp from original DataSource""" - text = "Check this: https://g1.globo.com/sp/sao-paulo/noticia/2025/11/16/policia-encontra-arsenal-de-guerra-na-zona-sul-de-sp.ghtml" - - data_source = DataSource( - id="msg-locale-test", - source_type="original_text", - original_text=text, - locale="en-US", - timestamp="2025-11-16T10:30:00Z" - ) - - expanded_sources = await expand_link_contexts(data_source,_cfg) - - assert len(expanded_sources) == 1 - expanded = expanded_sources[0] - - assert expanded.locale == "en-US" - assert expanded.timestamp == "2025-11-16T10:30:00Z" - - -@pytest.mark.asyncio -async def test_expand_link_contexts_single_url(): - """should handle DataSource with single URL""" - text = "Veja esta notícia: https://g1.globo.com/sp/sao-paulo/noticia/2025/11/16/policia-encontra-arsenal-de-guerra-na-zona-sul-de-sp.ghtml" - - data_source = DataSource( - id="msg-single", - source_type="original_text", - original_text=text - ) - - expanded_sources = await expand_link_contexts(data_source,_cfg) - - assert len(expanded_sources) == 1 - assert expanded_sources[0].metadata["success"] is True - assert expanded_sources[0].original_text != "" - - -@pytest.mark.asyncio -async def test_expand_link_contexts_timeout_with_nonexistent_site(): - """should handle timeout gracefully when scraping takes too long or site doesn't exist""" - from app.models import PipelineConfig, LLMConfig, TimeoutConfig - from langchain_openai import ChatOpenAI - - # create a config with very short timeouts - short_timeout_config = PipelineConfig( - claim_extraction_llm_config=LLMConfig( - llm=ChatOpenAI(model="gpt-4o-mini", 
temperature=0.0, timeout=30.0) - ), - adjudication_llm_config=LLMConfig( - llm=ChatOpenAI(model="o3-mini", timeout=60.0) - ), - timeout_config=TimeoutConfig( - link_content_expander_timeout_per_link=2.0, # very short timeout - link_content_expander_timeout_total=5.0, # very short total timeout - claim_extractor_timeout_per_source=10.0, - claim_extractor_timeout_total=20.0, - evidence_retrieval_timeout_per_claim=20.0, - evidence_retrieval_timeout_total=40.0, - adjudication_timeout=20.0 - ), - max_links_to_expand=5, - max_claims_to_extract=10, - max_evidence_sources_per_claim=5 - ) - - # create a DataSource with non-existent sites - text = """ - Check these sites: - http://this-site-definitely-does-not-exist-12345.com - http://another-fake-site-that-will-timeout-67890.net - http://third-nonexistent-domain-99999.org - """ - - data_source = DataSource( - id="msg-timeout-test", - source_type="original_text", - original_text=text - ) - - # expand links with short timeout config - expanded_sources = await expand_link_contexts(data_source, short_timeout_config) - - # should return empty list or partial results due to timeouts - # the function should handle timeouts gracefully and not crash - assert isinstance(expanded_sources, list) - - # if any sources were expanded before timeout, they should have proper structure - for source in expanded_sources: - assert source.source_type == "link_context" - assert "url" in source.metadata - assert "parent_source_id" in source.metadata - assert source.metadata["parent_source_id"] == "msg-timeout-test" - - # extract timeout values for printing - timeout_cfg: TimeoutConfig = short_timeout_config.timeout_config - per_link_timeout = timeout_cfg.link_content_expander_timeout_per_link - total_timeout = timeout_cfg.link_content_expander_timeout_total - - print(f"\n{'=' * 80}") - print(f"TEST: Timeout Handling with Non-Existent Sites") - print(f"{'=' * 80}") - print(f"Total expanded sources: {len(expanded_sources)}") - print(f"Config timeout 
per link: {per_link_timeout}s") - print(f"Config timeout total: {total_timeout}s") - print(f"Result: Timeout handled gracefully without crashing") - print(f"{'=' * 80}\n") - - -@pytest.mark.asyncio -async def test_parallel_expansion_with_social_media_and_news(): - """ - simple test: scrape 4 links in parallel and measure time. - """ - import time - from app.ai.threads.thread_utils import ThreadPoolManager, OperationType - from app.ai.pipeline.link_context_expander import expand_link_context_sync - - # 4 links to test - urls = [ - "https://www.facebook.com/share/p/1GXv2qwKbE/", - "https://www.instagram.com/p/DRKnubCjgtM/?utm_source=ig_web_copy_link&igsh=NTc4MTIwNjQ2YQ==", - "https://www.tiktok.com/@roteiro.em.dia/video/7572058473853226260?is_from_webapp=1&sender_device=pc", - "https://noticias.uol.com.br/colunas/jamil-chade/2025/11/17/conselho-da-onu-aprova-plano-de-trump-para-gaza-com-tropas-internacionais.htm" - ] - - print(f"\n{'=' * 80}") - print(f"PARALLEL SCRAPING TEST - 4 LINKS") - print(f"{'=' * 80}") - for i, url in enumerate(urls, 1): - platform = "Unknown" - if "facebook.com" in url: - platform = "Facebook" - elif "instagram.com" in url: - platform = "Instagram" - elif "tiktok.com" in url: - platform = "TikTok" - elif "uol.com.br" in url: - platform = "UOL" - print(f"{i}. 
{platform}: {url[:60]}...") - print(f"{'=' * 80}\n") - - # get thread pool manager - manager = ThreadPoolManager.get_instance() - if not manager._initialized: - manager.initialize() - - print(f"ThreadPool: {manager.max_workers} workers") - print(f"Starting parallel scraping...\n") - - # submit all 4 jobs at once - start_time = time.time() - - futures = [] - for url in urls: - future = manager.submit( - OperationType.LINK_CONTEXT_EXPANDING, - expand_link_context_sync, - url, - timeout_per_link=90.0 # 90s timeout per link (increased for TikTok) - ) - futures.append(future) - - print(f"✓ Submitted {len(futures)} jobs to ThreadPool") - print(f"⏳ Waiting for all to complete...\n") - - # wait for all results - from app.ai.threads.thread_utils import wait_all - try: - results = wait_all(futures, timeout=120.0) - except TimeoutError: - print("❌ Timeout waiting for results") - results = [] - - end_time = time.time() - elapsed = end_time - start_time - - # print results - print(f"\n{'=' * 80}") - print(f"RESULTS") - print(f"{'=' * 80}") - print(f"Total time: {elapsed:.2f}s") - print(f"Results received: {len(results)}/{len(urls)}") - print(f"{'=' * 80}\n") - - success_count = 0 - fail_count = 0 - - for i, result in enumerate(results, 1): - platform = "Unknown" - url = urls[i-1] - if "facebook.com" in url: - platform = "Facebook" - elif "instagram.com" in url: - platform = "Instagram" - elif "tiktok.com" in url: - platform = "TikTok" - elif "uol.com.br" in url: - platform = "UOL" - - print(f"--- LINK {i}: {platform} ---") - print(f" URL: {url}") - - if result is None: - print(f"❌ FAILED (returned None - check logs above for timeout/error)") - fail_count += 1 - elif result.success: - print(f"✅ SUCCESS") - print(f" Content: {result.content_length} chars") - if result.content: - print(f" Preview: {result.content[:100]}...") - if result.metadata: - print(f" Metadata: {result.metadata}") - success_count += 1 - else: - print(f"❌ FAILED") - print(f" Error: {result.error}") - print(f" 
Content length: {result.content_length}") - if result.metadata: - print(f" Metadata: {result.metadata}") - fail_count += 1 - print() - - print(f"{'=' * 80}") - print(f"SUMMARY") - print(f"{'=' * 80}") - print(f"Success: {success_count}/{len(urls)}") - print(f"Failed: {fail_count}/{len(urls)}") - print(f"Total time: {elapsed:.2f}s") - print(f"Avg per link: {elapsed/len(urls):.2f}s") - print(f"{'=' * 80}\n") - - # simple assertion: at least 1 should succeed - assert success_count > 0, "At least one link should scrape successfully" diff --git a/app/ai/pipeline/tests/no_claims_fallback_test.py b/app/ai/pipeline/tests/no_claims_fallback_test.py deleted file mode 100644 index 37955c0..0000000 --- a/app/ai/pipeline/tests/no_claims_fallback_test.py +++ /dev/null @@ -1,330 +0,0 @@ -# -*- coding: utf-8 -*- -""" -tests for the no claims fallback pipeline step. - -these tests make REAL calls to the LLM (Gemini API) to validate: -- the structure of outputs -- the LangChain chain works correctly -- the prompt produces valid results -- fallback logic works properly -- pipeline steps integration - -IMPORTANT: Set GOOGLE_API_KEY in your environment before running. - -run with: - pytest app/ai/pipeline/tests/no_claims_fallback_test.py -v -s - -the -s flag shows stdout so you can see the LLM responses for debugging. 
-""" - -import pytest -from typing import List - -from app.models import DataSource, PipelineConfig -from app.ai.pipeline.no_claims_fallback import ( - NoClaimsFallbackOutput, - should_use_fallback, - get_combined_text_from_sources, -) -from app.ai.pipeline.steps import DefaultPipelineSteps -from app.ai.tests.fixtures.mock_pipelinesteps import WithoutBrowsingPipelineSteps -from app.config.gemini_models import get_gemini_default_pipeline_config - - -# ===== HELPER FUNCTIONS ===== - -def print_fallback_result( - result: NoClaimsFallbackOutput, - test_name: str, - input_text: str | None = None -): - """print fallback result for debugging, including input for verification.""" - print("\n" + "=" * 80) - print(f"TEST: {test_name}") - print("=" * 80) - - # print input for verification - if input_text: - print(f"\nINPUT TEXT:") - print(f" {input_text}") - - # print output - print(f"\nOUTPUT:") - print(f" Explanation:") - print(f" {result.explanation}") - print() - print(f" Original Text (stored):") - print(f" {result.original_text[:100]}..." 
if len(result.original_text) > 100 else f" {result.original_text}") - print() - - -def validate_fallback_output(result: NoClaimsFallbackOutput): - """validate that a fallback output has the correct structure.""" - # type check - assert isinstance(result, NoClaimsFallbackOutput), "result should be NoClaimsFallbackOutput" - - # required fields - assert result.explanation is not None and result.explanation != "", "explanation should not be empty" - assert result.original_text is not None, "original_text should not be None" - - # type checks - assert isinstance(result.explanation, str), "explanation should be a string" - assert isinstance(result.original_text, str), "original_text should be a string" - - # content validation - assert len(result.explanation) > 10, "explanation should be meaningful (>10 chars)" - - -# ===== TESTS FOR HELPER FUNCTIONS ===== - -def test_should_use_fallback_zero_claims(): - """test that should_use_fallback returns true when no claims.""" - assert should_use_fallback(0) == True, "should use fallback when 0 claims" - - -def test_should_use_fallback_with_claims(): - """test that should_use_fallback returns false when claims exist.""" - assert should_use_fallback(1) == False, "should not use fallback when 1 claim" - assert should_use_fallback(5) == False, "should not use fallback when 5 claims" - - -def test_get_combined_text_single_source(): - """test combining text from a single data source.""" - sources = [ - DataSource( - id="msg-001", - source_type="original_text", - original_text="Olá, bom dia!" - ) - ] - - result = get_combined_text_from_sources(sources) - assert result == "Olá, bom dia!" - - -def test_get_combined_text_multiple_sources(): - """test combining text from multiple data sources.""" - sources = [ - DataSource( - id="msg-001", - source_type="original_text", - original_text="Primeira mensagem." - ), - DataSource( - id="msg-002", - source_type="link_context", - original_text="Segunda mensagem." 
- ) - ] - - result = get_combined_text_from_sources(sources) - assert result == "Primeira mensagem.\n\nSegunda mensagem." - - -def test_get_combined_text_empty_sources(): - """test combining text from empty sources list.""" - sources = [] - result = get_combined_text_from_sources(sources) - assert result == "" - - -def test_get_combined_text_sources_without_text(): - """test combining when sources have empty original_text.""" - sources = [ - DataSource( - id="msg-001", - source_type="original_text", - original_text="" # empty string instead of None - ) - ] - - result = get_combined_text_from_sources(sources) - assert result == "" - - -# ===== PIPELINE STEPS INTEGRATION TESTS ===== - -@pytest.mark.asyncio -async def test_default_pipeline_steps_fallback(): - """test fallback through DefaultPipelineSteps with gemini config.""" - # create data sources - sources = [ - DataSource( - id="msg-001", - source_type="original_text", - original_text="Olá! Como vai?" - ) - ] - - # get gemini config - config = get_gemini_default_pipeline_config() - - # create pipeline steps - steps = DefaultPipelineSteps() - - # execute fallback through pipeline steps - result = await steps.handle_no_claims_fallback(sources, config) - - # print for debugging - print_fallback_result( - result, - "DefaultPipelineSteps Fallback (Gemini)", - input_text=sources[0].original_text - ) - - # validate structure - validate_fallback_output(result) - - # validate content - assert sources[0].original_text in result.original_text, "should contain source text" - - -@pytest.mark.asyncio -async def test_without_browsing_pipeline_steps_fallback(): - """test fallback through WithoutBrowsingPipelineSteps with gemini config.""" - # create data sources - sources = [ - DataSource( - id="msg-001", - source_type="original_text", - original_text="Bom dia! Tudo bem?" 
- ) - ] - - # get gemini config - config = get_gemini_default_pipeline_config() - - # create pipeline steps - steps = WithoutBrowsingPipelineSteps() - - # execute fallback through pipeline steps - result = await steps.handle_no_claims_fallback(sources, config) - - # print for debugging - print_fallback_result( - result, - "WithoutBrowsingPipelineSteps Fallback (Gemini)", - input_text=sources[0].original_text - ) - - # validate structure - validate_fallback_output(result) - - # validate content - assert sources[0].original_text in result.original_text, "should contain source text" - - -@pytest.mark.asyncio -async def test_pipeline_steps_multiple_sources(): - """test fallback with multiple data sources through pipeline steps.""" - # create multiple data sources - sources = [ - DataSource( - id="msg-001", - source_type="original_text", - original_text="Olá!" - ), - DataSource( - id="link-001", - source_type="link_context", - original_text="Como posso ajudar?" - ), - DataSource( - id="img-001", - source_type="image", # valid source_type - original_text="Obrigado!" - ) - ] - - # get gemini config - config = get_gemini_default_pipeline_config() - - # create pipeline steps - steps = DefaultPipelineSteps() - - # execute fallback through pipeline steps - result = await steps.handle_no_claims_fallback(sources, config) - - # print for debugging - combined = get_combined_text_from_sources(sources) - print_fallback_result( - result, - "Pipeline Steps - Multiple Sources (Gemini)", - input_text=combined - ) - - # validate structure - validate_fallback_output(result) - - # validate all sources are combined - assert "Olá!" in result.original_text, "should contain first source" - assert "Como posso ajudar?" in result.original_text, "should contain second source" - assert "Obrigado!" 
in result.original_text, "should contain third source" - - -@pytest.mark.asyncio -async def test_pipeline_steps_empty_sources(): - """test fallback with empty sources through pipeline steps.""" - # create empty sources - sources = [] - - # get gemini config - config = get_gemini_default_pipeline_config() - - # create pipeline steps - steps = DefaultPipelineSteps() - - # execute fallback through pipeline steps - result = await steps.handle_no_claims_fallback(sources, config) - - # print for debugging - print_fallback_result( - result, - "Pipeline Steps - Empty Sources (Gemini)", - input_text="(no sources)" - ) - - # validate structure - assert isinstance(result, NoClaimsFallbackOutput), "should return valid output" - assert result.original_text == "", "should have empty original text" - - -@pytest.mark.asyncio -async def test_pipeline_config_fallback_llm_is_used(): - """test that the fallback LLM from config is actually used.""" - # create data source - sources = [ - DataSource( - id="msg-001", - source_type="original_text", - original_text="Oi, tudo bem?" 
- ) - ] - - # get gemini config (uses gemini-2.5-flash for fallback) - config = get_gemini_default_pipeline_config() - - # verify config has fallback LLM - assert config.fallback_llm_config is not None, "config should have fallback_llm_config" - assert config.fallback_llm_config.llm is not None, "fallback_llm_config should have llm" - - # create pipeline steps - steps = DefaultPipelineSteps() - - # execute fallback - result = await steps.handle_no_claims_fallback(sources, config) - - # print for debugging - print("\n" + "=" * 80) - print("TEST: Pipeline Config Fallback LLM Usage") - print("=" * 80) - print(f"\nConfig fallback LLM model: {config.fallback_llm_config.llm.model}") - print(f"\nGenerated explanation:") - print(f" {result.explanation}") - print() - - # validate structure - validate_fallback_output(result) - - # validate content - assert result.original_text == sources[0].original_text, "should store original text" diff --git a/app/ai/pipeline/tests/test_build_adjudication_input.py b/app/ai/pipeline/tests/test_build_adjudication_input.py deleted file mode 100644 index 8e8e2d2..0000000 --- a/app/ai/pipeline/tests/test_build_adjudication_input.py +++ /dev/null @@ -1,602 +0,0 @@ -# -*- coding: utf-8 -*- -""" -tests for build_adjudication_input function. - -these tests validate that: -- claim IDs are correctly preserved throughout the pipeline -- data source tracking works properly via ClaimSource.source_id -- enriched claims are correctly grouped by their original data sources -- the function handles edge cases (empty claims, missing evidence, etc.) - -run with: - pytest app/ai/pipeline/tests/test_build_adjudication_input.py -v -s - -the -s flag shows stdout so you can see detailed output for debugging. 
-""" - -import pytest -from typing import List, Dict - -from app.models import ( - DataSource, - ClaimExtractionOutput, - ExtractedClaim, - ClaimSource, - EnrichedClaim, - Citation, - EvidenceRetrievalResult, - AdjudicationInput, - DataSourceWithClaims, -) -from app.ai.main_pipeline import build_adjudication_input - - -# ===== HELPER FUNCTIONS ===== - -def create_test_data_source(ds_id: str, source_type: str, text: str) -> DataSource: - """create a test data source""" - return DataSource( - id=ds_id, - source_type=source_type, - original_text=text, - metadata={}, - locale="pt-BR", - timestamp="2024-11-18T00:00:00Z" - ) - - -def create_test_extracted_claim( - claim_id: str, - claim_text: str, - source_id: str, - source_type: str -) -> ExtractedClaim: - """create a test extracted claim""" - return ExtractedClaim( - id=claim_id, - text=claim_text, - source=ClaimSource( - source_type=source_type, - source_id=source_id - ), - entities=["test", "entity"], - llm_comment="test comment" - ) - - -def create_test_enriched_claim( - claim_id: str, - claim_text: str, - source_id: str, - source_type: str, - citations: List[Citation] -) -> EnrichedClaim: - """create a test enriched claim with citations""" - return EnrichedClaim( - id=claim_id, - text=claim_text, - source=ClaimSource( - source_type=source_type, - source_id=source_id - ), - entities=["test", "entity"], - llm_comment="test comment", - citations=citations - ) - - -def create_test_citation(url: str, title: str) -> Citation: - """create a test citation""" - return Citation( - url=url, - title=title, - publisher="Test Publisher", - citation_text="Test citation text", - source="google_fact_checking_api", - rating="Falso", - date="2024-11-18" - ) - - -def print_adjudication_input(adj_input: AdjudicationInput, test_name: str): - """print adjudication input for debugging""" - print("\n" + "=" * 80) - print(f"TEST: {test_name}") - print("=" * 80) - print(f"\n📦 ADJUDICATION INPUT:") - print(f" Total data sources: 
{len(adj_input.sources_with_claims)}") - - for i, ds_with_claims in enumerate(adj_input.sources_with_claims, 1): - ds = ds_with_claims.data_source - claims = ds_with_claims.enriched_claims - print(f"\n {i}. DataSource: {ds.id} ({ds.source_type})") - print(f" Text: {ds.original_text[:60]}...") - print(f" Enriched claims: {len(claims)}") - - for j, claim in enumerate(claims, 1): - print(f" {j}) Claim ID: {claim.id}") - print(f" Text: {claim.text[:60]}...") - print(f" Source: {claim.source.source_type} ({claim.source.source_id})") - print(f" Citations: {len(claim.citations)}") - - -# ===== TESTS ===== - -def test_claim_ids_are_preserved(): - """ - test that claim IDs are correctly preserved from extraction through enrichment. - - validates: - - extracted claim ID matches enriched claim ID - - claim ID is used as key in evidence map - - same claim ID appears in final adjudication input - """ - # setup: create data source - ds = create_test_data_source("ds-001", "original_text", "Test message") - - # setup: create extracted claims - claim_1 = create_test_extracted_claim( - "claim-123", - "Test claim 1", - "ds-001", - "original_text" - ) - claim_2 = create_test_extracted_claim( - "claim-456", - "Test claim 2", - "ds-001", - "original_text" - ) - - claim_output = ClaimExtractionOutput( - data_source=ds, - claims=[claim_1, claim_2] - ) - - # setup: create enriched claims with same IDs - enriched_1 = create_test_enriched_claim( - "claim-123", - "Test claim 1", - "ds-001", - "original_text", - [create_test_citation("https://example.com/1", "Test 1")] - ) - enriched_2 = create_test_enriched_claim( - "claim-456", - "Test claim 2", - "ds-001", - "original_text", - [create_test_citation("https://example.com/2", "Test 2")] - ) - - evidence_result = EvidenceRetrievalResult( - claim_evidence_map={ - "claim-123": enriched_1, - "claim-456": enriched_2 - } - ) - - # execute: build adjudication input - adj_input = build_adjudication_input([claim_output], evidence_result) - - # print 
for debugging - print_adjudication_input(adj_input, "Claim IDs Preserved") - - # assert: claim IDs are preserved - assert len(adj_input.sources_with_claims) == 1 - - ds_with_claims = adj_input.sources_with_claims[0] - assert len(ds_with_claims.enriched_claims) == 2 - - claim_ids = {claim.id for claim in ds_with_claims.enriched_claims} - assert "claim-123" in claim_ids - assert "claim-456" in claim_ids - - print("\n✅ PASSED: All claim IDs preserved correctly") - - -def test_source_tracking_preserved(): - """ - test that source tracking (source_id and source_type) is preserved. - - validates: - - ClaimSource.source_id matches DataSource.id - - ClaimSource.source_type matches DataSource.source_type - - enriched claims can be traced back to their original data source - """ - # setup: create multiple data sources - ds1 = create_test_data_source("msg-001", "original_text", "Original message") - ds2 = create_test_data_source("link-002", "link_context", "Link content") - - # setup: create claims from different sources - claim_from_msg = create_test_extracted_claim( - "claim-msg-1", - "Claim from original message", - "msg-001", - "original_text" - ) - claim_from_link = create_test_extracted_claim( - "claim-link-1", - "Claim from link", - "link-002", - "link_context" - ) - - claim_outputs = [ - ClaimExtractionOutput(data_source=ds1, claims=[claim_from_msg]), - ClaimExtractionOutput(data_source=ds2, claims=[claim_from_link]) - ] - - # setup: create enriched claims - enriched_msg = create_test_enriched_claim( - "claim-msg-1", - "Claim from original message", - "msg-001", - "original_text", - [] - ) - enriched_link = create_test_enriched_claim( - "claim-link-1", - "Claim from link", - "link-002", - "link_context", - [] - ) - - evidence_result = EvidenceRetrievalResult( - claim_evidence_map={ - "claim-msg-1": enriched_msg, - "claim-link-1": enriched_link - } - ) - - # execute - adj_input = build_adjudication_input(claim_outputs, evidence_result) - - # print for debugging - 
print_adjudication_input(adj_input, "Source Tracking Preserved") - - # assert: source tracking is correct - assert len(adj_input.sources_with_claims) == 2 - - # find DataSourceWithClaims for msg-001 - msg_sources = [ - ds for ds in adj_input.sources_with_claims - if ds.data_source.id == "msg-001" - ] - assert len(msg_sources) == 1 - assert len(msg_sources[0].enriched_claims) == 1 - - msg_claim = msg_sources[0].enriched_claims[0] - assert msg_claim.id == "claim-msg-1" - assert msg_claim.source.source_id == "msg-001" - assert msg_claim.source.source_type == "original_text" - - # find DataSourceWithClaims for link-002 - link_sources = [ - ds for ds in adj_input.sources_with_claims - if ds.data_source.id == "link-002" - ] - assert len(link_sources) == 1 - assert len(link_sources[0].enriched_claims) == 1 - - link_claim = link_sources[0].enriched_claims[0] - assert link_claim.id == "claim-link-1" - assert link_claim.source.source_id == "link-002" - assert link_claim.source.source_type == "link_context" - - print("\n✅ PASSED: Source tracking preserved correctly") - - -def test_grouping_by_data_source(): - """ - test that claims are correctly grouped by their original data source. 
- - validates: - - multiple claims from same source are grouped together - - claims from different sources are kept separate - - all claims for a source are included in the group - """ - # setup: one data source with multiple claims - ds = create_test_data_source("ds-multi", "original_text", "Message with multiple claims") - - claim_1 = create_test_extracted_claim("claim-1", "First claim", "ds-multi", "original_text") - claim_2 = create_test_extracted_claim("claim-2", "Second claim", "ds-multi", "original_text") - claim_3 = create_test_extracted_claim("claim-3", "Third claim", "ds-multi", "original_text") - - claim_output = ClaimExtractionOutput( - data_source=ds, - claims=[claim_1, claim_2, claim_3] - ) - - # setup: enriched claims - enriched_1 = create_test_enriched_claim("claim-1", "First claim", "ds-multi", "original_text", []) - enriched_2 = create_test_enriched_claim("claim-2", "Second claim", "ds-multi", "original_text", []) - enriched_3 = create_test_enriched_claim("claim-3", "Third claim", "ds-multi", "original_text", []) - - evidence_result = EvidenceRetrievalResult( - claim_evidence_map={ - "claim-1": enriched_1, - "claim-2": enriched_2, - "claim-3": enriched_3 - } - ) - - # execute - adj_input = build_adjudication_input([claim_output], evidence_result) - - # print for debugging - print_adjudication_input(adj_input, "Grouping By Data Source") - - # assert: all claims grouped under same data source - assert len(adj_input.sources_with_claims) == 1 - - ds_with_claims = adj_input.sources_with_claims[0] - assert ds_with_claims.data_source.id == "ds-multi" - assert len(ds_with_claims.enriched_claims) == 3 - - claim_ids = {claim.id for claim in ds_with_claims.enriched_claims} - assert claim_ids == {"claim-1", "claim-2", "claim-3"} - - print("\n✅ PASSED: Claims correctly grouped by data source") - - -def test_empty_claims_handled(): - """ - test edge case: data source with no claims extracted. 
- - validates: - - data sources with zero claims are included in output - - empty enriched_claims list is created - - no errors occur - """ - # setup: data source with no claims - ds = create_test_data_source("ds-empty", "original_text", "Message with no claims") - - claim_output = ClaimExtractionOutput( - data_source=ds, - claims=[] # no claims extracted - ) - - evidence_result = EvidenceRetrievalResult(claim_evidence_map={}) - - # execute - adj_input = build_adjudication_input([claim_output], evidence_result) - - # print for debugging - print_adjudication_input(adj_input, "Empty Claims Handled") - - # assert: data source included with empty claims - assert len(adj_input.sources_with_claims) == 1 - - ds_with_claims = adj_input.sources_with_claims[0] - assert ds_with_claims.data_source.id == "ds-empty" - assert len(ds_with_claims.enriched_claims) == 0 - - print("\n✅ PASSED: Empty claims handled correctly") - - -def test_missing_evidence_for_claim(): - """ - test edge case: claim exists but no evidence was found. 
- - validates: - - claims without evidence in the map are skipped - - only claims with evidence appear in adjudication input - - no errors occur for missing claims - """ - # setup: data source with claims - ds = create_test_data_source("ds-001", "original_text", "Test message") - - claim_1 = create_test_extracted_claim("claim-has-evidence", "Claim with evidence", "ds-001", "original_text") - claim_2 = create_test_extracted_claim("claim-no-evidence", "Claim without evidence", "ds-001", "original_text") - - claim_output = ClaimExtractionOutput( - data_source=ds, - claims=[claim_1, claim_2] - ) - - # setup: only one claim has evidence - enriched_1 = create_test_enriched_claim( - "claim-has-evidence", - "Claim with evidence", - "ds-001", - "original_text", - [create_test_citation("https://example.com", "Test")] - ) - # claim-no-evidence is NOT in the evidence map - - evidence_result = EvidenceRetrievalResult( - claim_evidence_map={ - "claim-has-evidence": enriched_1 - # "claim-no-evidence" is missing - } - ) - - # execute - adj_input = build_adjudication_input([claim_output], evidence_result) - - # print for debugging - print_adjudication_input(adj_input, "Missing Evidence For Claim") - - # assert: only claim with evidence is included - assert len(adj_input.sources_with_claims) == 1 - - ds_with_claims = adj_input.sources_with_claims[0] - assert len(ds_with_claims.enriched_claims) == 1 - assert ds_with_claims.enriched_claims[0].id == "claim-has-evidence" - - print("\n✅ PASSED: Missing evidence handled correctly (claim skipped)") - - -def test_citations_preserved(): - """ - test that citations are preserved in enriched claims. 
- - validates: - - citations from evidence gathering are present in adjudication input - - citation count matches - - citation details are intact - """ - # setup - ds = create_test_data_source("ds-001", "original_text", "Test message") - - claim = create_test_extracted_claim("claim-1", "Test claim", "ds-001", "original_text") - claim_output = ClaimExtractionOutput(data_source=ds, claims=[claim]) - - # setup: enriched claim with multiple citations - citations = [ - create_test_citation("https://example.com/1", "Source 1"), - create_test_citation("https://example.com/2", "Source 2"), - create_test_citation("https://example.com/3", "Source 3"), - ] - - enriched = create_test_enriched_claim( - "claim-1", - "Test claim", - "ds-001", - "original_text", - citations - ) - - evidence_result = EvidenceRetrievalResult(claim_evidence_map={"claim-1": enriched}) - - # execute - adj_input = build_adjudication_input([claim_output], evidence_result) - - # print for debugging - print_adjudication_input(adj_input, "Citations Preserved") - - # assert: citations preserved - ds_with_claims = adj_input.sources_with_claims[0] - claim_with_citations = ds_with_claims.enriched_claims[0] - - assert len(claim_with_citations.citations) == 3 - assert claim_with_citations.citations[0].url == "https://example.com/1" - assert claim_with_citations.citations[1].title == "Source 2" - assert claim_with_citations.citations[2].publisher == "Test Publisher" - - print("\n✅ PASSED: Citations preserved correctly") - - -def test_complex_pipeline_flow(): - """ - integration test simulating full pipeline flow with multiple sources and claims. 
- - validates: - - original_text source with 2 claims - - link_context source with 1 claim - - all IDs, sources, and citations preserved correctly - """ - # setup: multiple data sources - ds_original = create_test_data_source( - "msg-001", - "original_text", - "Original message mentioning vaccine and climate change" - ) - ds_link = create_test_data_source( - "link-001", - "link_context", - "Link content about vaccine safety" - ) - - # setup: claims from different sources - claim_vaccine_msg = create_test_extracted_claim( - "claim-vac-msg", - "Vaccine X causes infertility", - "msg-001", - "original_text" - ) - claim_climate_msg = create_test_extracted_claim( - "claim-climate-msg", - "Global warming is accelerating", - "msg-001", - "original_text" - ) - claim_vaccine_link = create_test_extracted_claim( - "claim-vac-link", - "Vaccine X was tested on 50000 participants", - "link-001", - "link_context" - ) - - claim_outputs = [ - ClaimExtractionOutput(data_source=ds_original, claims=[claim_vaccine_msg, claim_climate_msg]), - ClaimExtractionOutput(data_source=ds_link, claims=[claim_vaccine_link]) - ] - - # setup: enriched claims with citations - enriched_vac_msg = create_test_enriched_claim( - "claim-vac-msg", - "Vaccine X causes infertility", - "msg-001", - "original_text", - [ - create_test_citation("https://health.gov/vaccine-study", "Vaccine Study"), - create_test_citation("https://who.int/vaccines", "WHO Report") - ] - ) - enriched_climate = create_test_enriched_claim( - "claim-climate-msg", - "Global warming is accelerating", - "msg-001", - "original_text", - [create_test_citation("https://ipcc.ch/report", "IPCC Report")] - ) - enriched_vac_link = create_test_enriched_claim( - "claim-vac-link", - "Vaccine X was tested on 50000 participants", - "link-001", - "link_context", - [create_test_citation("https://clinicaltrials.gov/study", "Clinical Trial")] - ) - - evidence_result = EvidenceRetrievalResult( - claim_evidence_map={ - "claim-vac-msg": enriched_vac_msg, - 
"claim-climate-msg": enriched_climate, - "claim-vac-link": enriched_vac_link - } - ) - - # execute - adj_input = build_adjudication_input(claim_outputs, evidence_result) - - # print for debugging - print_adjudication_input(adj_input, "Complex Pipeline Flow") - - # assert: correct structure - assert len(adj_input.sources_with_claims) == 2 - - # find original_text source - original_sources = [ - ds for ds in adj_input.sources_with_claims - if ds.data_source.source_type == "original_text" - ] - assert len(original_sources) == 1 - assert len(original_sources[0].enriched_claims) == 2 - - # find link_context source - link_sources = [ - ds for ds in adj_input.sources_with_claims - if ds.data_source.source_type == "link_context" - ] - assert len(link_sources) == 1 - assert len(link_sources[0].enriched_claims) == 1 - - # verify specific claims - original_claim_ids = {claim.id for claim in original_sources[0].enriched_claims} - assert "claim-vac-msg" in original_claim_ids - assert "claim-climate-msg" in original_claim_ids - - link_claim = link_sources[0].enriched_claims[0] - assert link_claim.id == "claim-vac-link" - assert len(link_claim.citations) == 1 - - print("\n✅ PASSED: Complex pipeline flow handled correctly") - - -# ===== RUN ALL TESTS ===== - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) diff --git a/app/ai/pipeline/utils.py b/app/ai/pipeline/utils.py deleted file mode 100644 index 20db483..0000000 --- a/app/ai/pipeline/utils.py +++ /dev/null @@ -1,186 +0,0 @@ -from datetime import datetime, timezone -from typing import List, Optional, Union - -from app.models import ( - DataSourceResult, - ClaimVerdict, - DataSourceWithClaims, - DataSourceWithExtractedClaims, - LLMDataSourceResult, - ExtractedClaim, - EnrichedClaim, -) - - -# date format for fact-checking context: DD-MM-YYYY -DATE_FORMAT = "%d-%m-%Y" - - -# ===== DATE UTILITIES ===== - -def get_current_date() -> str: - """ - Returns the current date in DD-MM-YYYY format using UTC timezone. 
- - Returns: - Formatted date string (e.g., "08-12-2024") - """ - now = datetime.now(timezone.utc) - return now.strftime(DATE_FORMAT) - - -# ===== LLM OUTPUT CONVERSION UTILITIES ===== - -def get_data_source_with_claims( - llm_source_result: LLMDataSourceResult, - sources_with_claims: List[Union[DataSourceWithClaims, DataSourceWithExtractedClaims]], - result_index: int -) -> Optional[Union[DataSourceWithClaims, DataSourceWithExtractedClaims]]: - """ - Matches an LLM data source result back to the original input. - - Works with both DataSourceWithClaims (with citations) and - DataSourceWithExtractedClaims (without citations). - - Uses hybrid matching strategy: - 1. Try to match by data_source_id (if provided by LLM) - 2. Fall back to matching by position/order - - Args: - llm_source_result: LLM output for one data source - sources_with_claims: List of original sources with claims (either type) - result_index: Position of this result in the LLM output list - - Returns: - Matched DataSourceWithClaims or DataSourceWithExtractedClaims, or None if no match found - """ - # Create mapping of data_source_id to original source_with_claims - source_map = { - source_with_claims.data_source.id: source_with_claims - for source_with_claims in sources_with_claims - } - - # Try to match by data_source_id first - if llm_source_result.data_source_id: - source_with_claims = source_map.get(llm_source_result.data_source_id) - if source_with_claims: - return source_with_claims - - print(f"[WARNING] LLM returned unknown data_source_id: {llm_source_result.data_source_id}") - - # Fallback: match by order (position in list) - print(f"[INFO] data_source_id missing for result {result_index}, matching by order") - if result_index < len(sources_with_claims): - return sources_with_claims[result_index] - - print(f"[WARNING] No source at index {result_index}") - return None - - -def get_claim_verdicts( - llm_source_result: LLMDataSourceResult, - source_with_claims: Union[DataSourceWithClaims, 
DataSourceWithExtractedClaims] -) -> List[ClaimVerdict]: - """ - Converts LLM claim verdicts to ClaimVerdict objects with proper IDs. - - Works with both DataSourceWithClaims (with citations) and - DataSourceWithExtractedClaims (without citations). - - Uses hybrid matching strategy for claim IDs: - 1. Try to use claim_id from LLM output (if provided and valid) - 2. Fall back to matching by claim_text - - Args: - llm_source_result: LLM output for one data source - source_with_claims: Original input for this data source (either type) - - Returns: - List of ClaimVerdict objects with proper claim_id populated - """ - # Get claims list regardless of model type - claims = ( - source_with_claims.enriched_claims - if isinstance(source_with_claims, DataSourceWithClaims) - else source_with_claims.extracted_claims - ) - - # Create mappings for claim matching - claim_id_by_id = {claim.id: claim for claim in claims} - claim_id_by_text = {claim.text: claim.id for claim in claims} - - # Convert LLM verdicts to ClaimVerdict objects - claim_verdicts: List[ClaimVerdict] = [] - for llm_verdict in llm_source_result.claim_verdicts: - # Try to get claim_id: first from LLM output, then from claim_text matching - if llm_verdict.claim_id and llm_verdict.claim_id in claim_id_by_id: - # Use claim_id from LLM (most reliable) - claim_id = llm_verdict.claim_id - else: - # Fallback: match by claim_text - claim_id = claim_id_by_text.get(llm_verdict.claim_text, "unknown") - if llm_verdict.claim_id: - print(f"[WARNING] LLM returned unknown claim_id: {llm_verdict.claim_id}, matched by text instead") - - verdict = ClaimVerdict( - claim_id=claim_id, - claim_text=llm_verdict.claim_text, - verdict=llm_verdict.verdict, - justification=llm_verdict.justification, - citations_used=llm_verdict.citations_used - ) - claim_verdicts.append(verdict) - - return claim_verdicts - - -def convert_llm_output_to_data_source_results( - llm_results: List[LLMDataSourceResult], - sources_with_claims: 
List[Union[DataSourceWithClaims, DataSourceWithExtractedClaims]] -) -> List[DataSourceResult]: - """ - Converts LLM adjudication output to DataSourceResult objects. - - Works with both DataSourceWithClaims (with citations) and - DataSourceWithExtractedClaims (without citations). - - This function processes the raw LLM output and matches it back to the original - input sources, creating properly structured DataSourceResult objects with - correct IDs and metadata. - - Args: - llm_results: List of LLM output results (one per data source) - sources_with_claims: List of original sources with their claims (either type) - - Returns: - List of DataSourceResult objects ready to be included in FactCheckResult - """ - data_source_results: List[DataSourceResult] = [] - - # Process each LLM result - for idx, llm_source_result in enumerate(llm_results): - # Match LLM result to original input data source - source_with_claims = get_data_source_with_claims( - llm_source_result=llm_source_result, - sources_with_claims=sources_with_claims, - result_index=idx - ) - if not source_with_claims: - print(f"[WARNING] No source_with_claims match found for result {idx}") - continue # Skip if no match found - - # Convert LLM verdicts to ClaimVerdict objects with proper IDs - claim_verdicts = get_claim_verdicts( - llm_source_result=llm_source_result, - source_with_claims=source_with_claims - ) - - # Create DataSourceResult with info from original input - source_result = DataSourceResult( - data_source_id=source_with_claims.data_source.id, - source_type=source_with_claims.data_source.source_type, - claim_verdicts=claim_verdicts - ) - data_source_results.append(source_result) - - return data_source_results \ No newline at end of file diff --git a/app/ai/tests/fixtures/mock_pipelinesteps.py b/app/ai/tests/fixtures/mock_pipelinesteps.py deleted file mode 100644 index 7912ebe..0000000 --- a/app/ai/tests/fixtures/mock_pipelinesteps.py +++ /dev/null @@ -1,231 +0,0 @@ -""" -Mock pipeline steps 
implementations for testing. - -provides alternative implementations of PipelineSteps that avoid expensive operations -like web browsing, making tests faster and more predictable. -""" - -from typing import List - -from app.models import ( - DataSource, - PipelineConfig, - DataSourceWithExtractedClaims, - FactCheckResult, - AdjudicationInput, - DataSourceResult, - ClaimVerdict, - LLMConfig, -) -from app.ai.context import EvidenceGatherer -from app.ai.context.factcheckapi import GoogleFactCheckGatherer -from app.ai.pipeline.steps import DefaultPipelineSteps -from app.ai.context.web import WebSearchGatherer -from app.ai.pipeline.tests.fixtures.mock_linkexpander import hybrid_expand_link_contexts -from app.config import get_trusted_domains - - -class WithoutBrowsingPipelineSteps(DefaultPipelineSteps): - """ - hybrid pipeline steps implementation that minimizes expensive operations. - - mocks social media URLs (Facebook, Instagram, Twitter, TikTok) to avoid Apify API calls, - while allowing real simple HTTP scraping for generic URLs. only uses GoogleFactCheckGatherer - for evidence gathering (no web search). - - ideal for: - - fast unit tests - - offline development (with some limitations for generic URLs) - - predictable test results for social media content - - avoiding Apify API rate limits and costs - - testing with real HTTP scraping for generic websites - - example: - >>> from app.ai.tests.fixtures.mock_pipelinesteps import WithoutBrowsingPipelineSteps - >>> from app.ai.main_pipeline import run_fact_check_pipeline - >>> steps = WithoutBrowsingPipelineSteps() - >>> result = await run_fact_check_pipeline(sources, config, steps) - """ - - def get_evidence_gatherers(self) -> List[EvidenceGatherer]: - """ - get evidence gatherers for the pipeline. - - returns only GoogleFactCheckGatherer to avoid web browsing. 
- - returns: - list with only GoogleFactCheckGatherer (no WebSearchGatherer) - """ - allowed_domains = get_trusted_domains() - return [ - GoogleFactCheckGatherer(timeout=15.0),WebSearchGatherer(max_results=5, timeout=15.0,allowed_domains=allowed_domains) - ] - - def _expand_data_sources_with_links( - self, - data_sources: List[DataSource], - config: PipelineConfig - ) -> List[DataSource]: - """ - hybrid implementation: mocks social media URLs, uses real scraping for generic URLs. - - social media URLs (Facebook, Instagram, Twitter, TikTok) are mocked to avoid - Apify API calls. generic URLs use real simple HTTP scraping. - - args: - data_sources: list of data sources to process - config: pipeline configuration - - returns: - list of new 'link_context' data sources (mocked + real) - """ - expanded_link_sources: List[DataSource] = [] - - for source in data_sources: - if source.source_type == "original_text": - print(f"\n[HYBRID LINK EXPANSION] Processing original_text source: {source.id}") - print(f" Text preview: {source.original_text[:100]}...") - - try: - # use hybrid link expander (mocks social media, real scraping for generic) - expanded_sources = hybrid_expand_link_contexts(source, config) - - # handle None return - if expanded_sources is None: - print(" Warning: hybrid link expansion returned None") - continue - - if expanded_sources: - print(f" Created {len(expanded_sources)} link_context data source(s):") - for expanded in expanded_sources: - url = expanded.metadata.get("url", "unknown") - success = expanded.metadata.get("success", False) - is_mock = expanded.metadata.get("mock", False) - status = "✓" if success else "✗" - source_type = "[MOCK]" if is_mock else "[REAL]" - print(f" {status} {source_type} {url}") - - expanded_link_sources.extend(expanded_sources) - else: - print(" No links found or expanded") - - except Exception as e: - print(f" Error in hybrid link expansion for source {source.id}: {e}") - import logging - logging.getLogger(__name__).error( 
- f"Hybrid link expansion failed for source {source.id}: {e}", - exc_info=True - ) - - return expanded_link_sources - - def adjudicate_claims_with_search( - self, - sources_with_claims: List[DataSourceWithExtractedClaims], - model: str = "gpt-4o-mini" - ) -> FactCheckResult: - """ - Implementation using OpenAI web search for adjudication. - - This works the same as DefaultPipelineSteps since it doesn't use browser-based - scraping - it relies on OpenAI's web search tool. - - Args: - sources_with_claims: List of data sources with their extracted claims - model: OpenAI model to use (default: gpt-4o-mini) - - Returns: - FactCheckResult with verdicts for all claims - """ - from app.ai.pipeline.adjudication_with_search import adjudicate_claims_with_search - - return adjudicate_claims_with_search( - sources_with_claims=sources_with_claims, - model=model - ) - - # note: all other methods (extract_claims_from_all_sources, gather_evidence, - # handle_no_claims_fallback) are inherited from DefaultPipelineSteps and work as normal - - -class UnverifiableMockPipelineSteps(WithoutBrowsingPipelineSteps): - """ - pipeline steps that returns hard-coded unverifiable results for adjudication. - - inherits mock link expansion from WithoutBrowsingPipelineSteps, but overrides - the adjudicate_claims method to return hard-coded results where all claims - are marked as "Fontes insuficientes para verificar". 
- - ideal for: - - testing the adjudication_with_search fallback logic - - testing what happens when all claims are unverifiable - - integration tests that need predictable unverifiable results - - usage: - >>> from app.ai.tests.fixtures.mock_pipelinesteps import UnverifiableMockPipelineSteps - >>> from app.ai.main_pipeline import run_fact_check_pipeline - >>> steps = UnverifiableMockPipelineSteps() - >>> result = await run_fact_check_pipeline(sources, config, steps) - >>> # adjudication will return all unverifiable, triggering fallback - """ - - def adjudicate_claims( - self, - adjudication_input: AdjudicationInput, - llm_config: LLMConfig - ) -> FactCheckResult: - """ - override: returns hard-coded unverifiable results instead of real adjudication. - - creates a result where every claim gets verdict "Fontes insuficientes para verificar" - with a generic justification. useful for testing the fallback to adjudication_with_search. - - args: - adjudication_input: the adjudication input with sources and claims - llm_config: LLM configuration (ignored in mock) - - returns: - FactCheckResult with all claims marked as unverifiable - """ - print("\n[MOCK ADJUDICATION] Returning hard-coded unverifiable results (triggering fallback)") - - results: List[DataSourceResult] = [] - - for source_with_claims in adjudication_input.sources_with_claims: - data_source = source_with_claims.data_source - enriched_claims = source_with_claims.enriched_claims - - print(f" Source: {data_source.id} with {len(enriched_claims)} claims") - - # create unverifiable verdict for each claim - verdicts: List[ClaimVerdict] = [] - for claim in enriched_claims: - verdict = ClaimVerdict( - claim_id=claim.id, - claim_text=claim.text, - verdict="Fontes insuficientes para verificar", - justification=( - f"Não foram encontradas fontes suficientes para verificar a alegação: '{claim.text}'. " - f"As evidências disponíveis são insuficientes ou contraditórias." 
- ), - citations_used=[] - ) - verdicts.append(verdict) - print(f" - [{claim.id}] {claim.text[:60]}... -> Fontes insuficientes") - - # create result for this data source - source_result = DataSourceResult( - data_source_id=data_source.id, - source_type=data_source.source_type, - claim_verdicts=verdicts - ) - results.append(source_result) - - return FactCheckResult( - results=results, - overall_summary=( - "Todas as alegações não puderam ser verificadas devido à falta de fontes confiáveis. " - "As evidências disponíveis são insuficientes para confirmar ou refutar as alegações." - ), - sources_with_claims=adjudication_input.sources_with_claims - ) diff --git a/app/ai/tests/test_async_pipeline.py b/app/ai/tests/test_async_pipeline.py deleted file mode 100644 index 0605465..0000000 --- a/app/ai/tests/test_async_pipeline.py +++ /dev/null @@ -1,482 +0,0 @@ -""" -unit tests for async pipeline execution utilities. - -tests the fire-and-forget streaming pipeline and its helper functions -to ensure correct structure and error-free execution. 
-""" - -import pytest -from unittest.mock import Mock, patch - -from app.ai.async_code import ( - fire_evidence_jobs_for_claim, - collect_evidence_results, - fire_and_forget_streaming_pipeline, -) -from app.ai.threads.thread_utils import ThreadPoolManager, OperationType -from app.models import ( - DataSource, - ClaimExtractionInput, - ClaimExtractionOutput, - ExtractedClaim, - ClaimSource, - Citation, - EnrichedClaim, -) - - -# ===== FIXTURES ===== - -@pytest.fixture -def mock_thread_pool_manager(): - """create a mock thread pool manager""" - manager = Mock(spec=ThreadPoolManager) - manager.submit = Mock() - manager.wait_next_completed = Mock() - return manager - - -@pytest.fixture -def sample_data_source(): - """create a sample data source for testing""" - return DataSource( - id="test-source-1", - source_type="original_text", - original_text="Sample text for fact checking", - metadata={} - ) - - -@pytest.fixture -def sample_claim(): - """create a sample extracted claim""" - return ExtractedClaim( - id="claim-1", - text="Sample claim text", - source=ClaimSource(source_type="original_text", source_id="test-source-1"), - entities=["Entity1", "Entity2"], - llm_comment="This is a testable claim" - ) - - -@pytest.fixture -def sample_citation(): - """create a sample citation""" - return Citation( - url="https://example.com/article", - title="Test Article", - publisher="Test Publisher", - citation_text="Sample citation text", - source="apify_web_search", - rating=None, - date=None - ) - - -@pytest.fixture -def mock_evidence_gatherer(sample_citation): - """create a mock evidence gatherer""" - gatherer = Mock() - gatherer.source_name = "test_gatherer" - gatherer.gather_sync = Mock(return_value=[sample_citation]) - return gatherer - - -# ===== TESTS FOR fire_evidence_jobs_for_claim ===== - -def test_fire_evidence_jobs_for_claim_structure( - sample_claim, - mock_evidence_gatherer, - mock_thread_pool_manager -): - """test that fire_evidence_jobs_for_claim submits correct 
number of jobs""" - claim_id_to_claim = {} - evidence_jobs_by_claim = {} - - jobs_submitted = fire_evidence_jobs_for_claim( - claim=sample_claim, - evidence_gatherers=[mock_evidence_gatherer], - manager=mock_thread_pool_manager, - claim_id_to_claim=claim_id_to_claim, - evidence_jobs_by_claim=evidence_jobs_by_claim, - ) - - # verify structure - assert jobs_submitted == 1, "should submit 1 job for 1 gatherer" - assert sample_claim.id in claim_id_to_claim, "claim should be tracked" - assert claim_id_to_claim[sample_claim.id] == sample_claim - assert sample_claim.id in evidence_jobs_by_claim - assert "test_gatherer" in evidence_jobs_by_claim[sample_claim.id] - - # verify manager.submit was called - assert mock_thread_pool_manager.submit.call_count == 1 - call_args = mock_thread_pool_manager.submit.call_args - assert call_args[0][0] == OperationType.LINK_EVIDENCE_RETRIEVER - - -def test_fire_evidence_jobs_for_claim_multiple_gatherers( - sample_claim, - mock_thread_pool_manager -): - """test firing jobs for multiple evidence gatherers""" - gatherer1 = Mock() - gatherer1.source_name = "gatherer_1" - gatherer1.gather_sync = Mock(return_value=[]) - - gatherer2 = Mock() - gatherer2.source_name = "gatherer_2" - gatherer2.gather_sync = Mock(return_value=[]) - - claim_id_to_claim = {} - evidence_jobs_by_claim = {} - - jobs_submitted = fire_evidence_jobs_for_claim( - claim=sample_claim, - evidence_gatherers=[gatherer1, gatherer2], - manager=mock_thread_pool_manager, - claim_id_to_claim=claim_id_to_claim, - evidence_jobs_by_claim=evidence_jobs_by_claim, - ) - - # verify structure - assert jobs_submitted == 2, "should submit 2 jobs for 2 gatherers" - assert len(evidence_jobs_by_claim[sample_claim.id]) == 2 - assert "gatherer_1" in evidence_jobs_by_claim[sample_claim.id] - assert "gatherer_2" in evidence_jobs_by_claim[sample_claim.id] - assert mock_thread_pool_manager.submit.call_count == 2 - - -def test_fire_evidence_jobs_for_claim_no_errors( - sample_claim, - 
mock_evidence_gatherer, - mock_thread_pool_manager -): - """test that function completes without errors""" - claim_id_to_claim = {} - evidence_jobs_by_claim = {} - - # should not raise any exceptions - try: - jobs_submitted = fire_evidence_jobs_for_claim( - claim=sample_claim, - evidence_gatherers=[mock_evidence_gatherer], - manager=mock_thread_pool_manager, - claim_id_to_claim=claim_id_to_claim, - evidence_jobs_by_claim=evidence_jobs_by_claim, - ) - assert jobs_submitted >= 0, "function completed without errors" - except Exception as e: - pytest.fail(f"function raised unexpected exception: {e}") - - -# ===== TESTS FOR collect_evidence_results ===== - -def test_collect_evidence_results_structure( - sample_claim, - sample_citation, - mock_thread_pool_manager -): - """test that collect_evidence_results returns correct structure""" - # setup mock to return results - mock_thread_pool_manager.wait_next_completed.side_effect = [ - ("job-1", (sample_claim.id, [sample_citation])), - ] - - claim_id_to_claim = {sample_claim.id: sample_claim} - - result = collect_evidence_results( - manager=mock_thread_pool_manager, - evidence_jobs_submitted=1, - claim_id_to_claim=claim_id_to_claim, - ) - - # verify structure - assert isinstance(result, dict), "result should be a dict" - assert sample_claim.id in result, "result should contain claim id" - assert isinstance(result[sample_claim.id], list), "citations should be a list" - assert len(result[sample_claim.id]) == 1, "should have 1 citation" - assert result[sample_claim.id][0] == sample_citation - - -def test_collect_evidence_results_multiple_gatherers( - sample_claim, - sample_citation, - mock_thread_pool_manager -): - """test collecting results from multiple gatherers for same claim""" - citation2 = Citation( - url="https://example2.com", - title="Test Article 2", - publisher="Publisher 2", - citation_text="Citation 2", - source="google_fact_checking_api", - rating="Verdadeiro", - date=None - ) - - # simulate 2 gatherers completing 
- mock_thread_pool_manager.wait_next_completed.side_effect = [ - ("job-1", (sample_claim.id, [sample_citation])), - ("job-2", (sample_claim.id, [citation2])), - ] - - claim_id_to_claim = {sample_claim.id: sample_claim} - - result = collect_evidence_results( - manager=mock_thread_pool_manager, - evidence_jobs_submitted=2, - claim_id_to_claim=claim_id_to_claim, - ) - - # verify structure - assert len(result[sample_claim.id]) == 2, "should have 2 citations" - assert sample_citation in result[sample_claim.id] - assert citation2 in result[sample_claim.id] - - -def test_collect_evidence_results_handles_exceptions( - sample_claim, - mock_thread_pool_manager -): - """test that function handles exceptions from gatherers gracefully""" - # simulate gatherer failure - mock_thread_pool_manager.wait_next_completed.side_effect = [ - ("job-1", Exception("Gatherer failed")), - ] - - claim_id_to_claim = {sample_claim.id: sample_claim} - - result = collect_evidence_results( - manager=mock_thread_pool_manager, - evidence_jobs_submitted=1, - claim_id_to_claim=claim_id_to_claim, - ) - - # verify structure - should initialize empty list for claim - assert sample_claim.id in result - assert isinstance(result[sample_claim.id], list) - # exception should be handled, no citations added - assert len(result[sample_claim.id]) == 0 - - -def test_collect_evidence_results_no_errors( - sample_claim, - sample_citation, - mock_thread_pool_manager -): - """test that function completes without errors""" - mock_thread_pool_manager.wait_next_completed.return_value = ( - "job-1", - (sample_claim.id, [sample_citation]) - ) - - claim_id_to_claim = {sample_claim.id: sample_claim} - - try: - result = collect_evidence_results( - manager=mock_thread_pool_manager, - evidence_jobs_submitted=1, - claim_id_to_claim=claim_id_to_claim, - ) - assert isinstance(result, dict), "function completed without errors" - except Exception as e: - pytest.fail(f"function raised unexpected exception: {e}") - - -# ===== TESTS FOR 
fire_and_forget_streaming_pipeline ===== - -def test_fire_and_forget_pipeline_structure( - sample_data_source, - sample_claim, - sample_citation, - mock_evidence_gatherer -): - """test that pipeline returns correct structure""" - # create mock extract function - def mock_extract_fn(extraction_input: ClaimExtractionInput): - return ClaimExtractionOutput( - data_source=extraction_input.data_source, - claims=[sample_claim] - ) - - # create mock manager - with patch('app.ai.async_code.ThreadPoolManager') as MockManager: - mock_manager = Mock(spec=ThreadPoolManager) - MockManager.get_instance.return_value = mock_manager - - # simulate claim extraction completion - mock_manager.wait_next_completed.side_effect = [ - # claim extraction completes - ("job-1", ClaimExtractionOutput( - data_source=sample_data_source, - claims=[sample_claim] - )), - # evidence gathering completes - ("job-2", (sample_claim.id, [sample_citation])), - ] - - claim_outputs, enriched_claims = fire_and_forget_streaming_pipeline( - data_sources=[sample_data_source], - extract_fn=mock_extract_fn, - evidence_gatherers=[mock_evidence_gatherer], - manager=mock_manager, - ) - - # verify structure - assert isinstance(claim_outputs, list), "claim_outputs should be a list" - assert len(claim_outputs) == 1, "should have 1 claim output" - assert isinstance(claim_outputs[0], ClaimExtractionOutput) - - assert isinstance(enriched_claims, dict), "enriched_claims should be a dict" - assert sample_claim.id in enriched_claims - assert isinstance(enriched_claims[sample_claim.id], EnrichedClaim) - assert enriched_claims[sample_claim.id].id == sample_claim.id - assert enriched_claims[sample_claim.id].text == sample_claim.text - assert isinstance(enriched_claims[sample_claim.id].citations, list) - - -def test_fire_and_forget_pipeline_multiple_sources( - sample_citation, - mock_evidence_gatherer -): - """test pipeline with multiple data sources""" - source1 = DataSource( - id="source-1", - source_type="original_text", - 
original_text="Text 1", - metadata={} - ) - source2 = DataSource( - id="source-2", - source_type="original_text", - original_text="Text 2", - metadata={} - ) - - claim1 = ExtractedClaim( - id="claim-1", - text="Claim 1", - source=ClaimSource(source_type="original_text", source_id="source-1"), - entities=[], - llm_comment=None - ) - claim2 = ExtractedClaim( - id="claim-2", - text="Claim 2", - source=ClaimSource(source_type="original_text", source_id="source-2"), - entities=[], - llm_comment=None - ) - - def mock_extract_fn(extraction_input: ClaimExtractionInput): - if extraction_input.data_source.id == "source-1": - return ClaimExtractionOutput( - data_source=extraction_input.data_source, - claims=[claim1] - ) - else: - return ClaimExtractionOutput( - data_source=extraction_input.data_source, - claims=[claim2] - ) - - with patch('app.ai.async_code.ThreadPoolManager') as MockManager: - mock_manager = Mock(spec=ThreadPoolManager) - MockManager.get_instance.return_value = mock_manager - - # simulate both extractions and evidence gathering completing - mock_manager.wait_next_completed.side_effect = [ - # claim extractions - ("job-1", ClaimExtractionOutput(data_source=source1, claims=[claim1])), - ("job-2", ClaimExtractionOutput(data_source=source2, claims=[claim2])), - # evidence gathering - ("job-3", (claim1.id, [sample_citation])), - ("job-4", (claim2.id, [sample_citation])), - ] - - claim_outputs, enriched_claims = fire_and_forget_streaming_pipeline( - data_sources=[source1, source2], - extract_fn=mock_extract_fn, - evidence_gatherers=[mock_evidence_gatherer], - manager=mock_manager, - ) - - # verify structure - assert len(claim_outputs) == 2, "should have 2 claim outputs" - assert len(enriched_claims) == 2, "should have 2 enriched claims" - assert claim1.id in enriched_claims - assert claim2.id in enriched_claims - - -def test_fire_and_forget_pipeline_no_errors( - sample_data_source, - sample_claim, - sample_citation, - mock_evidence_gatherer -): - """test that 
pipeline completes without errors""" - def mock_extract_fn(extraction_input: ClaimExtractionInput): - return ClaimExtractionOutput( - data_source=extraction_input.data_source, - claims=[sample_claim] - ) - - with patch('app.ai.async_code.ThreadPoolManager') as MockManager: - mock_manager = Mock(spec=ThreadPoolManager) - MockManager.get_instance.return_value = mock_manager - - mock_manager.wait_next_completed.side_effect = [ - ("job-1", ClaimExtractionOutput( - data_source=sample_data_source, - claims=[sample_claim] - )), - ("job-2", (sample_claim.id, [sample_citation])), - ] - - try: - claim_outputs, enriched_claims = fire_and_forget_streaming_pipeline( - data_sources=[sample_data_source], - extract_fn=mock_extract_fn, - evidence_gatherers=[mock_evidence_gatherer], - manager=mock_manager, - ) - assert isinstance(claim_outputs, list), "pipeline completed without errors" - assert isinstance(enriched_claims, dict), "pipeline completed without errors" - except Exception as e: - pytest.fail(f"pipeline raised unexpected exception: {e}") - - -def test_fire_and_forget_pipeline_empty_claims( - sample_data_source, - mock_evidence_gatherer -): - """test pipeline when no claims are extracted""" - def mock_extract_fn(extraction_input: ClaimExtractionInput): - return ClaimExtractionOutput( - data_source=extraction_input.data_source, - claims=[] # no claims - ) - - with patch('app.ai.async_code.ThreadPoolManager') as MockManager: - mock_manager = Mock(spec=ThreadPoolManager) - MockManager.get_instance.return_value = mock_manager - - mock_manager.wait_next_completed.side_effect = [ - ("job-1", ClaimExtractionOutput( - data_source=sample_data_source, - claims=[] - )), - # no evidence jobs since no claims - ] - - claim_outputs, enriched_claims = fire_and_forget_streaming_pipeline( - data_sources=[sample_data_source], - extract_fn=mock_extract_fn, - evidence_gatherers=[mock_evidence_gatherer], - manager=mock_manager, - ) - - # verify structure - assert len(claim_outputs) == 1 - 
assert len(claim_outputs[0].claims) == 0 - assert len(enriched_claims) == 0, "no enriched claims if no claims extracted" diff --git a/app/ai/threads/README.md b/app/ai/threads/README.md deleted file mode 100644 index 47ecc57..0000000 --- a/app/ai/threads/README.md +++ /dev/null @@ -1,603 +0,0 @@ - -# async parallelization plan for fact-checking pipeline - -## executive summary - -this document outlines the strategy for parallelizing the IO-heavy fact-checking pipeline to achieve ~4x latency reduction (from 420s to 105s) through async concurrency and centralized resource management. - -**status**: planning complete, implementation deferred - -**key improvements**: -- parallel link expansion (10 links: 60s → 15s) -- parallel claim extraction (10 sources: 300s → 75s) -- parallel evidence gathering (15 claims × 3 gatherers: 60s → 15s) -- centralized async resource pool with semaphore limits -- real-time progress tracking and observability - ---- - -## current state analysis - -### sequential execution flow - -``` -link expansion (sequential) -├─ source 1: extract links → expand link 1 (6s) → expand link 2 (6s) → ... = 60s -└─ total: ~60s for 10 links - -claim extraction (sequential) -├─ source 1: extract claims (30s) -├─ source 2: extract claims (30s) -└─ total: ~300s for 10 sources - -evidence gathering (sequential) -├─ claim 1: google api (2s) + web search (2s) = 4s -├─ claim 2: google api (2s) + web search (2s) = 4s -└─ total: ~60s for 15 claims - -TOTAL LATENCY: ~420s -``` - -### bottlenecks - -1. **link expansion**: fetching 10 URLs takes 60s sequentially (6s each) -2. **claim extraction**: 10 LLM calls take 300s sequentially (30s each) -3. **evidence gathering**: 15 claims × 3 gatherers = 45 sequential API calls (60s) - -all three stages are IO-bound and can be parallelized. 
- ---- - -## proposed architecture - -### parallel execution flow - -``` -link expansion (parallel with concurrency=5) -├─ batch 1: [link 1, link 2, link 3, link 4, link 5] → max(6s) = 6s -├─ batch 2: [link 6, link 7, link 8, link 9, link 10] → max(6s) = 6s -└─ total: ~15s (4x faster) - -claim extraction (parallel with concurrency=4) -├─ batch 1: [source 1, source 2, source 3, source 4] → max(30s) = 30s -├─ batch 2: [source 5, source 6, source 7, source 8] → max(30s) = 30s -├─ batch 3: [source 9, source 10] → max(30s) = 30s -└─ total: ~90s (3.3x faster, limited by LLM quota) - -evidence gathering (parallel with nested concurrency) -├─ per claim: [google, web, news] in parallel → max(2s) = 2s -├─ all claims: process 5 claims at once → 15 claims / 5 = 3 batches -└─ total: ~15s (4x faster) - -TOTAL LATENCY: ~105s (4x improvement) -``` - -### resource management - -**centralized async pool manager** (singleton pattern): -- manages httpx.AsyncClient lifecycle -- enforces semaphore limits per operation type -- provides observability hooks for monitoring - -**semaphore limits**: -- link expansion: 5 concurrent fetches -- claim extraction: 4 concurrent LLM calls (rate limit protection) -- evidence gathering: 5 concurrent claims, 3 gatherers per claim - ---- - -## implementation plan - -### phase 1: core async utilities - -**new files**: -- `app/ai/threads/pool_manager.py` - centralized async resource manager -- `app/ai/threads/parallel_helpers.py` - reusable parallel execution utilities -- `app/ai/threads/progress.py` - progress tracking for long operations - -**code structure**: - -```python -# pool_manager.py -class AsyncPoolManager: - """singleton async resource pool with semaphore limits.""" - - _instance: Optional["AsyncPoolManager"] = None - - def __init__(self): - self.http_client: Optional[httpx.AsyncClient] = None - self.link_expansion_semaphore = asyncio.Semaphore(5) - self.claim_extraction_semaphore = asyncio.Semaphore(4) - self.evidence_gathering_semaphore = 
asyncio.Semaphore(5) - - @classmethod - def get_instance(cls) -> "AsyncPoolManager": - if cls._instance is None: - cls._instance = cls() - return cls._instance - - async def initialize(self): - """initialize async resources.""" - if self.http_client is None: - self.http_client = httpx.AsyncClient(timeout=30.0) - - async def cleanup(self): - """cleanup async resources.""" - if self.http_client: - await self.http_client.aclose() - self.http_client = None - - -# parallel_helpers.py -async def map_async_with_concurrency[T, R]( - items: List[T], - async_fn: Callable[[T], Awaitable[R]], - semaphore: asyncio.Semaphore, - description: str = "processing" -) -> List[R]: - """ - map async function over items with semaphore-based concurrency control. - - args: - items: list of items to process - async_fn: async function to apply to each item - semaphore: semaphore for concurrency limit - description: description for progress tracking - - returns: - list of results in same order as input items - """ - async def _bounded_task(item: T) -> R: - async with semaphore: - return await async_fn(item) - - tasks = [_bounded_task(item) for item in items] - - # use asyncio.gather to run all tasks in parallel - results = await asyncio.gather(*tasks, return_exceptions=True) - - # handle exceptions: log and convert to None or re-raise - processed_results = [] - for i, result in enumerate(results): - if isinstance(result, Exception): - logger.error(f"task {i} failed: {result}") - processed_results.append(None) # or re-raise - else: - processed_results.append(result) - - return processed_results - - -# progress.py -class ProgressTracker: - """track progress of parallel operations with real-time updates.""" - - def __init__(self, total: int, description: str): - self.total = total - self.completed = 0 - self.description = description - self.lock = asyncio.Lock() - - async def increment(self, count: int = 1): - async with self.lock: - self.completed += count - self._print_progress() - - def 
_print_progress(self): - pct = (self.completed / self.total) * 100 if self.total > 0 else 0 - print(f"[{self.description}] {self.completed}/{self.total} ({pct:.1f}%)") -``` - -### phase 2: parallelize link expansion - -**modify**: `app/ai/pipeline/link_context_expander.py` - -**changes**: - -```python -async def expand_link_contexts( - data_source: DataSource, - config: PipelineConfig -) -> List[DataSource]: - """expand links from original text into link_context data sources (parallel).""" - - # extract URLs - urls = extract_urls_from_text(data_source.original_text) - if not urls: - return [] - - # get pool manager - pool = AsyncPoolManager.get_instance() - await pool.initialize() - - # parallel fetch with semaphore - async def _fetch_one_url(url: str) -> Optional[DataSource]: - async with pool.link_expansion_semaphore: - return await fetch_and_create_link_context(url, data_source, config) - - # use asyncio.gather for parallel execution - results = await asyncio.gather( - *[_fetch_one_url(url) for url in urls], - return_exceptions=True - ) - - # filter out None and exceptions - expanded_sources = [r for r in results if isinstance(r, DataSource)] - - return expanded_sources -``` - -**expected improvement**: 60s → 15s (4x faster for 10 links with concurrency=5) - -### phase 3: parallelize claim extraction - -**modify**: `app/ai/pipeline/steps.py` (DefaultPipelineSteps) - -**changes**: - -```python -async def extract_claims_from_all_sources( - self, - data_sources: List[DataSource], - llm_config: LLMConfig -) -> List[ClaimExtractionOutput]: - """extract claims from all sources in parallel.""" - - pool = AsyncPoolManager.get_instance() - - async def _extract_one_source(source: DataSource) -> ClaimExtractionOutput: - extraction_input = ClaimExtractionInput(data_source=source) - - # semaphore protects LLM rate limits - async with pool.claim_extraction_semaphore: - return await self._extract_claims(extraction_input, llm_config) - - # parallel execution - claim_outputs = 
await map_async_with_concurrency( - items=data_sources, - async_fn=_extract_one_source, - semaphore=pool.claim_extraction_semaphore, - description="claim extraction" - ) - - return claim_outputs -``` - -**expected improvement**: 300s → 75s (4x faster with concurrency=4) - -### phase 4: parallelize evidence gathering - -**modify**: `app/ai/pipeline/evidence_retrieval.py` - -**changes**: - -```python -async def gather_evidence_async( - retrieval_input: EvidenceRetrievalInput, - gatherers: List[EvidenceGatherer] -) -> EvidenceRetrievalResult: - """gather evidence for all claims in parallel.""" - - pool = AsyncPoolManager.get_instance() - - async def _gather_for_one_claim(claim: ExtractedClaim) -> EnrichedClaim: - # for each claim, run all gatherers in parallel - async def _run_one_gatherer(gatherer: EvidenceGatherer) -> List[Citation]: - return await gatherer.gather(claim) - - # nested parallelism: all gatherers for this claim run concurrently - all_citations_nested = await asyncio.gather( - *[_run_one_gatherer(g) for g in gatherers], - return_exceptions=True - ) - - # flatten citations - all_citations = [] - for cits in all_citations_nested: - if isinstance(cits, list): - all_citations.extend(cits) - - return EnrichedClaim( - id=claim.id, - text=claim.text, - source=claim.source, - entities=claim.entities, - llm_comment=claim.llm_comment, - citations=all_citations - ) - - # parallel across claims with semaphore - enriched_claims = await map_async_with_concurrency( - items=retrieval_input.claims, - async_fn=_gather_for_one_claim, - semaphore=pool.evidence_gathering_semaphore, - description="evidence gathering" - ) - - # build result map - claim_evidence_map = { - claim.id: claim for claim in enriched_claims - } - - return EvidenceRetrievalResult(claim_evidence_map=claim_evidence_map) -``` - -**expected improvement**: 60s → 15s (4x faster with concurrency=5 claims, 3 gatherers each) - -### phase 5: integration and cleanup - -**modify**: `app/ai/main_pipeline.py` - 
-**changes**: - -```python -async def run_fact_check_pipeline( - data_sources: List[DataSource], - config: PipelineConfig, - steps: PipelineSteps, -) -> List[ClaimExtractionOutput]: - """run the fact-checking pipeline with async parallelization.""" - - # initialize async pool - pool = AsyncPoolManager.get_instance() - await pool.initialize() - - try: - # existing pipeline logic (now with parallel steps) - expanded_link_sources = await steps.expand_data_sources_with_links( - data_sources, config - ) - - all_data_sources = list(data_sources) + expanded_link_sources - - claim_outputs = await steps.extract_claims_from_all_sources( - data_sources=all_data_sources, - llm_config=config.claim_extraction_llm_config - ) - - return claim_outputs - - finally: - # cleanup resources - await pool.cleanup() -``` - -**add lifecycle management in FastAPI app**: - -```python -# app/main.py -from contextlib import asynccontextmanager - -@asynccontextmanager -async def lifespan(app: FastAPI): - # startup - pool = AsyncPoolManager.get_instance() - await pool.initialize() - yield - # shutdown - await pool.cleanup() - -app = FastAPI(lifespan=lifespan) -``` - ---- - -## testing strategy - -### unit tests - -1. **pool_manager_test.py**: - - singleton behavior - - semaphore limits enforced - - resource initialization and cleanup - -2. **parallel_helpers_test.py**: - - map_async_with_concurrency with mock async functions - - exception handling (partial failures) - - semaphore respects concurrency limits - -3. **progress_test.py**: - - thread-safe increment operations - - accurate progress calculations - -### integration tests - -1. **parallel_link_expansion_test.py**: - - expand 10 URLs in parallel - - measure latency improvement (should be ~4x faster) - - verify all results are correct - -2. **parallel_claim_extraction_test.py**: - - extract claims from 10 sources in parallel - - verify LLM rate limits are respected - - check result quality matches sequential version - -3. 
**parallel_evidence_gathering_test.py**: - - gather evidence for 15 claims with 3 gatherers - - verify nested parallelism (claims + gatherers) - - measure latency improvement - -### load tests - -1. **stress test with 100 claims**: - - verify semaphores prevent resource exhaustion - - check memory usage stays bounded - - ensure no deadlocks or race conditions - -2. **real-world scenario**: - - run full pipeline with 5 original sources (each with 2 links) - - measure end-to-end latency - - target: <120s (vs current ~420s) - ---- - -## migration strategy - -### rollout phases - -**week 1-2**: implement core utilities (phase 1) -- create pool_manager.py, parallel_helpers.py, progress.py -- write unit tests -- review and merge - -**week 3**: parallelize link expansion (phase 2) -- modify link_context_expander.py -- add integration tests -- measure latency improvements -- feature flag: `PARALLEL_LINK_EXPANSION=true` - -**week 4**: parallelize claim extraction (phase 3) -- modify steps.py -- add integration tests -- feature flag: `PARALLEL_CLAIM_EXTRACTION=true` - -**week 5**: parallelize evidence gathering (phase 4) -- modify evidence_retrieval.py -- add integration tests -- feature flag: `PARALLEL_EVIDENCE_GATHERING=true` - -**week 6**: full integration and cleanup (phase 5) -- modify main_pipeline.py -- add FastAPI lifecycle management -- end-to-end load tests -- remove feature flags, make parallel execution default - -### feature flags - -use environment variables for gradual rollout: - -```python -# config.py -PARALLEL_LINK_EXPANSION = os.getenv("PARALLEL_LINK_EXPANSION", "false").lower() == "true" -PARALLEL_CLAIM_EXTRACTION = os.getenv("PARALLEL_CLAIM_EXTRACTION", "false").lower() == "true" -PARALLEL_EVIDENCE_GATHERING = os.getenv("PARALLEL_EVIDENCE_GATHERING", "false").lower() == "true" -``` - -fallback to sequential execution if flags are disabled. 
- ---- - -## performance expectations - -### latency improvements - -| stage | current (sequential) | proposed (parallel) | speedup | -|-------|---------------------|---------------------|---------| -| link expansion (10 links) | 60s | 15s | 4x | -| claim extraction (10 sources) | 300s | 75s | 4x | -| evidence gathering (15 claims) | 60s | 15s | 4x | -| **total pipeline** | **420s** | **105s** | **4x** | - -### resource usage - -- **memory**: slight increase due to concurrent tasks (~10-20% more) -- **CPU**: minimal (IO-bound operations) -- **network**: concurrent connections limited by semaphores (safe) -- **LLM quota**: protected by claim_extraction_semaphore (max 4 concurrent) - -### observability - -- progress tracking: real-time completion percentages -- error rates: logged per operation type -- latency distribution: P50, P95, P99 per stage -- semaphore saturation: track how often limits are hit - ---- - -## risks and mitigations - -### risk 1: rate limit violations - -**impact**: LLM provider or API throttling errors - -**mitigation**: -- use semaphores to enforce strict concurrency limits -- implement exponential backoff in parallel_helpers.py -- monitor rate limit headers and adjust semaphores dynamically - -### risk 2: resource exhaustion - -**impact**: too many concurrent connections, memory spikes - -**mitigation**: -- centralized pool manager with bounded resources -- semaphore limits tuned conservatively (start with 4-5 concurrent) -- integration tests with stress scenarios (100+ claims) - -### risk 3: partial failures - -**impact**: some tasks fail, others succeed - inconsistent results - -**mitigation**: -- asyncio.gather with return_exceptions=True -- log all exceptions with context (claim ID, source ID) -- return partial results with clear indication of failures -- retry logic for transient errors (network timeouts) - -### risk 4: debugging complexity - -**impact**: harder to trace execution flow with concurrent tasks - -**mitigation**: -- add 
request_id to all log messages -- progress tracker shows which tasks are running -- structured logging with correlation IDs -- keep sequential execution as fallback (feature flags) - ---- - -## future optimizations - -### stream processing (advanced) - -for very large batches (100+ claims), implement streaming: - -```python -async def stream_process_claims( - data_sources: AsyncIterator[DataSource], - llm_config: LLMConfig -) -> AsyncIterator[ClaimExtractionOutput]: - """process claims as they arrive, don't wait for all sources.""" - - async for source in data_sources: - result = await extract_claims(source, llm_config) - yield result -``` - -**benefits**: -- start evidence gathering while still extracting claims -- reduce peak memory usage -- lower time-to-first-result - -### adaptive concurrency - -dynamically adjust semaphore limits based on: -- current system load (CPU, memory) -- rate limit headers from providers -- historical latency data - -### caching layer - -cache expanded link contexts and LLM responses: -- redis for short-term cache (1 hour) -- database for long-term cache (similar claims) -- cache key: hash of claim text + source type - ---- - -## conclusion - -this parallelization plan provides: -- **4x latency reduction** (420s → 105s) -- **bounded resource usage** with semaphore limits -- **incremental rollout** with feature flags -- **strong observability** and error handling -- **maintainable code** with centralized async utilities - -**next steps**: -1. review and approve this plan -2. create app/ai/threads/ directory -3. implement phase 1 (core utilities) -4. proceed with phases 2-5 over 6 weeks - -**status**: ready for implementation when prioritized. diff --git a/app/ai/threads/test/test_thread_utils.py b/app/ai/threads/test/test_thread_utils.py deleted file mode 100644 index b92cc1a..0000000 --- a/app/ai/threads/test/test_thread_utils.py +++ /dev/null @@ -1,1119 +0,0 @@ -""" -simple tests for thread-based job queue system. 
- -run with: python app/ai/threads/test_thread_utils.py -""" - -import asyncio -import time -from concurrent.futures import Future - -import sys -import os -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) - -from app.ai.threads.thread_utils import ( - Job, - OperationType, - ThreadPoolContext, - ThreadPoolManager, - map_threaded, - map_threaded_async, - wait_all, - with_thread_pool, -) - - -def test_operation_type_weights(): - """test that operation types have correct priority weights.""" - print("\n1. testing operation type weights...") - - assert OperationType.CLAIMS_EXTRACTION.weight == 10 - assert OperationType.LINK_CONTEXT_EXPANDING.weight == 5 - assert OperationType.LINK_EVIDENCE_RETRIEVER.weight == 3 - - print(" ✓ operation type weights correct") - - -def test_job_priority_ordering(): - """test that jobs are ordered by priority correctly.""" - print("\n2. testing job priority ordering...") - - # create jobs with different priorities - job1 = Job( - id="1", - operation_type=OperationType.LINK_EVIDENCE_RETRIEVER, # weight 3 - func=lambda: None - ) - job2 = Job( - id="2", - operation_type=OperationType.CLAIMS_EXTRACTION, # weight 10 - func=lambda: None - ) - job3 = Job( - id="3", - operation_type=OperationType.LINK_CONTEXT_EXPANDING, # weight 5 - func=lambda: None - ) - - # higher priority (weight 10) should have lower priority value - assert job2.priority < job3.priority < job1.priority - - print(" ✓ job priority ordering correct (higher weight = processed first)") - - -def test_thread_pool_initialization(): - """test thread pool manager initialization and shutdown.""" - print("\n3. 
testing thread pool initialization...") - - manager = ThreadPoolManager.get_instance(max_workers=5) - assert manager.max_workers == 5 - assert manager._initialized is False - - manager.initialize() - assert manager._initialized is True - assert manager.executor is not None - assert manager.dispatcher_thread is not None - - manager.shutdown() - assert manager._initialized is False - - print(" ✓ thread pool initialization/shutdown works") - - -def test_singleton_pattern(): - """test that ThreadPoolManager is a singleton.""" - print("\n4. testing singleton pattern...") - - # note: need to reset singleton for testing - ThreadPoolManager._instance = None - - manager1 = ThreadPoolManager.get_instance(max_workers=10) - manager2 = ThreadPoolManager.get_instance(max_workers=20) - - assert manager1 is manager2 - assert manager1.max_workers == 10 # first call wins - - # cleanup - manager1.shutdown() - - print(" ✓ singleton pattern works") - - -def test_submit_and_wait(): - """test submitting a job and waiting for result.""" - print("\n5. testing submit and wait...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def slow_task(x: int) -> int: - time.sleep(0.1) - return x * 2 - - future = manager.submit( - OperationType.CLAIMS_EXTRACTION, - slow_task, - 5 - ) - - assert isinstance(future, Future) - result = future.result() - assert result == 10 - - manager.shutdown() - - print(" ✓ submit and wait works") - - -def test_priority_ordering(): - """test that jobs are executed in priority order.""" - print("\n6. 
testing priority-based execution...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=1) # single worker - manager.initialize() - - execution_order = [] - - def track_execution(name: str): - execution_order.append(name) - time.sleep(0.05) - return name - - # submit jobs in reverse priority order - f1 = manager.submit(OperationType.LINK_EVIDENCE_RETRIEVER, track_execution, "low") - f2 = manager.submit(OperationType.LINK_CONTEXT_EXPANDING, track_execution, "medium") - f3 = manager.submit(OperationType.CLAIMS_EXTRACTION, track_execution, "high") - - # wait for all - results = wait_all([f1, f2, f3]) - - # with single worker, should execute in priority order: high, medium, low - # note: first job might start immediately before priority takes effect - print(f" execution order: {execution_order}") - - manager.shutdown() - - print(" ✓ priority ordering verified") - - -def test_map_threaded(): - """test map_threaded helper function.""" - print("\n7. testing map_threaded...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def square(x: int) -> int: - time.sleep(0.05) - return x * x - - items = [1, 2, 3, 4, 5] - start = time.time() - - results = map_threaded( - items=items, - func=square, - operation_type=OperationType.CLAIMS_EXTRACTION, - manager=manager - ) - - elapsed = time.time() - start - - assert results == [1, 4, 9, 16, 25] - - # with 5 workers, 5 items should complete in ~0.05s (parallel) - # instead of ~0.25s (sequential) - print(f" elapsed: {elapsed:.2f}s (should be ~0.05s with parallelism)") - - manager.shutdown() - - print(" ✓ map_threaded works") - -def test_context_manager(): - """test ThreadPoolContext context manager.""" - print("\n10. 
testing context manager...") - - ThreadPoolManager._instance = None - - with ThreadPoolContext(max_workers=5) as manager: - assert manager._initialized is True - - result = manager.submit( - OperationType.CLAIMS_EXTRACTION, - lambda x: x + 1, - 10 - ).result() - - assert result == 11 - - # after context exit, should be shut down - # note: singleton might still exist but should be cleaned up - - print(" ✓ context manager works") - - -def test_with_thread_pool(): - """test with_thread_pool convenience function.""" - print("\n11. testing with_thread_pool...") - - ThreadPoolManager._instance = None - - def process_data(manager: ThreadPoolManager) -> list: - return map_threaded( - items=[1, 2, 3], - func=lambda x: x * 10, - operation_type=OperationType.CLAIMS_EXTRACTION, - manager=manager - ) - - results = with_thread_pool(process_data, max_workers=5) - - assert results == [10, 20, 30] - - print(" ✓ with_thread_pool works") - - -def test_error_handling(): - """test that errors are properly propagated through futures.""" - print("\n12. testing error handling...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def failing_task(): - raise ValueError("intentional error") - - future = manager.submit( - OperationType.CLAIMS_EXTRACTION, - failing_task - ) - - try: - future.result() - assert False, "should have raised ValueError" - except ValueError as e: - assert str(e) == "intentional error" - - manager.shutdown() - - print(" ✓ error handling works") - - -def test_get_status(): - """test get_status method.""" - print("\n13. 
testing get_status...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - status = manager.get_status() - - assert status["max_workers"] == 5 - assert status["initialized"] is True - assert status["queue_size"] == 0 - assert status["running_jobs"] == 0 - - manager.shutdown() - - print(" ✓ get_status works") - - -def test_wait_next_completed(): - """test wait_next_completed for streaming results.""" - print("\n14. testing wait_next_completed (streaming pattern)...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def slow_task(x: int) -> int: - time.sleep(0.05 * x) # different delays - return x * 2 - - # submit 5 jobs - for i in range(1, 6): - manager.submit(OperationType.CLAIMS_EXTRACTION, slow_task, i) - - # collect results as they complete (streaming) - results = [] - for _ in range(5): - job_id, result = manager.wait_next_completed(OperationType.CLAIMS_EXTRACTION) - results.append(result) - print(f" received: {result}") - - # should have all results (not necessarily in order due to different delays) - assert sorted(results) == [2, 4, 6, 8, 10] - - manager.shutdown() - - print(" ✓ wait_next_completed works (streaming pattern)") - - -def test_wait_next_completed_any(): - """test wait_next_completed_any for mixed operation types.""" - print("\n15. 
testing wait_next_completed_any (mixed operations)...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def task_a(x: int) -> str: - time.sleep(0.05) - return f"A{x}" - - def task_b(x: int) -> str: - time.sleep(0.05) - return f"B{x}" - - def task_c(x: int) -> str: - time.sleep(0.05) - return f"C{x}" - - # submit mixed jobs - manager.submit(OperationType.CLAIMS_EXTRACTION, task_a, 1) - manager.submit(OperationType.LINK_CONTEXT_EXPANDING, task_b, 2) - manager.submit(OperationType.CLAIMS_EXTRACTION, task_a, 3) - manager.submit(OperationType.LINK_EVIDENCE_RETRIEVER, task_c, 4) - - # collect results from ANY operation type - results = [] - for _ in range(4): - op_type, job_id, result = manager.wait_next_completed_any() - results.append((op_type.name, result)) - print(f" {op_type.name}: {result}") - - # check we got all 4 results - assert len(results) == 4 - result_values = [r[1] for r in results] - assert sorted(result_values) == ["A1", "A3", "B2", "C4"] - - manager.shutdown() - - print(" ✓ wait_next_completed_any works (mixed operations)") - - -def test_wait_next_completed_timeout(): - """test wait_next_completed timeout behavior.""" - print("\n16. testing wait_next_completed timeout...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - # don't submit any jobs - - try: - manager.wait_next_completed(OperationType.CLAIMS_EXTRACTION, timeout=0.1) - assert False, "should have raised TimeoutError" - except TimeoutError as e: - assert "no completed jobs" in str(e) - - manager.shutdown() - - print(" ✓ timeout handling works") - - -def test_wait_next_completed_error_handling(): - """test wait_next_completed with failing jobs.""" - print("\n17. 
testing wait_next_completed error handling...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def failing_task(x: int): - if x == 3: - raise ValueError(f"task {x} failed") - return x * 2 - - # submit jobs (one will fail) - for i in range(1, 5): - manager.submit(OperationType.CLAIMS_EXTRACTION, failing_task, i) - - # collect results - results = [] - errors = [] - for _ in range(4): - try: - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - raise_on_error=True - ) - results.append(result) - except ValueError as e: - errors.append(str(e)) - - assert len(results) == 3 # 3 succeeded - assert len(errors) == 1 # 1 failed - assert "task 3 failed" in errors[0] - - manager.shutdown() - - print(" ✓ error handling in completion queue works") - - -def test_pipeline_id_job_tracking(): - """test that jobs submitted with pipeline_id have that value set.""" - print("\n18. testing pipeline_id is set on jobs...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def simple_task(x: int) -> int: - return x * 2 - - # submit job with pipeline_id - pipeline_id = "test-pipeline-123" - future = manager.submit( - OperationType.CLAIMS_EXTRACTION, - simple_task, - 5, - pipeline_id=pipeline_id - ) - - # wait for job to complete - result = future.result() - assert result == 10 - - # verify the job has the correct pipeline_id - # get the job from completed_jobs - with manager.running_jobs_lock: - # find the job in completed_jobs - job_found = False - for job_id, job in manager.completed_jobs.items(): - if job.pipeline_id == pipeline_id: - assert job.pipeline_id == pipeline_id - job_found = True - break - - assert job_found, "job with pipeline_id should be in completed_jobs" - - # submit job without pipeline_id - future2 = manager.submit( - OperationType.CLAIMS_EXTRACTION, - simple_task, - 10 - ) - - result2 = 
future2.result() - assert result2 == 20 - - # verify the job has pipeline_id=None - with manager.running_jobs_lock: - job_found_none = False - for job_id, job in manager.completed_jobs.items(): - if job.pipeline_id is None and job.func == simple_task: - job_found_none = True - break - - assert job_found_none, "job without pipeline_id should have None value" - - manager.shutdown() - - print(" ✓ pipeline_id is correctly set on jobs") - - -def test_pipeline_id_isolation(): - """test that wait_next_completed filters by pipeline_id correctly.""" - print("\n19. testing pipeline_id isolation in wait_next_completed...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def task_with_delay(x: int, delay: float) -> str: - time.sleep(delay) - return f"result-{x}" - - # submit jobs for pipeline A (fast jobs) - pipeline_a = "pipeline-A" - for i in range(3): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task_with_delay, - i, - 0.05, # fast - pipeline_id=pipeline_a - ) - - # submit jobs for pipeline B (slower jobs) - pipeline_b = "pipeline-B" - for i in range(3, 6): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task_with_delay, - i, - 0.1, # slower - pipeline_id=pipeline_b - ) - - # submit jobs without pipeline_id (no isolation) - for i in range(6, 9): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task_with_delay, - i, - 0.05 - ) - - # wait for all jobs from pipeline A - results_a = [] - for _ in range(3): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - pipeline_id=pipeline_a - ) - results_a.append(result) - - # verify job belongs to pipeline A - with manager.running_jobs_lock: - job = manager.completed_jobs[job_id] - assert job.pipeline_id == pipeline_a, f"job {job_id} should belong to pipeline A" - - assert len(results_a) == 3 - assert set(results_a) == {"result-0", "result-1", "result-2"} - - # wait for all jobs from pipeline B - results_b = 
[] - for _ in range(3): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - pipeline_id=pipeline_b - ) - results_b.append(result) - - # verify job belongs to pipeline B - with manager.running_jobs_lock: - job = manager.completed_jobs[job_id] - assert job.pipeline_id == pipeline_b, f"job {job_id} should belong to pipeline B" - - assert len(results_b) == 3 - assert set(results_b) == {"result-3", "result-4", "result-5"} - - # wait for jobs without pipeline_id (should get None pipeline jobs) - results_none = [] - for _ in range(3): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - pipeline_id=None # explicitly wait for non-isolated jobs - ) - results_none.append(result) - - assert len(results_none) == 3 - assert set(results_none) == {"result-6", "result-7", "result-8"} - - manager.shutdown() - - print(" ✓ pipeline_id isolation works correctly") - - -def test_pipeline_id_cross_contamination_prevention(): - """test that jobs from different pipelines don't interfere with each other.""" - print("\n20. 
testing prevention of cross-pipeline contamination...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def slow_task(x: int) -> int: - time.sleep(0.1) - return x * 10 - - # submit 5 jobs for request-1 - request_1 = "request-1" - for i in range(1, 6): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - slow_task, - i, - pipeline_id=request_1 - ) - - # submit 5 jobs for request-2 - request_2 = "request-2" - for i in range(10, 15): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - slow_task, - i, - pipeline_id=request_2 - ) - - # collect results for request-1 - results_req1 = [] - for _ in range(5): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=5.0, - pipeline_id=request_1 - ) - results_req1.append(result) - - # verify request-1 got correct results - assert len(results_req1) == 5 - assert set(results_req1) == {10, 20, 30, 40, 50} - - # collect results for request-2 - results_req2 = [] - for _ in range(5): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=5.0, - pipeline_id=request_2 - ) - results_req2.append(result) - - # verify request-2 got correct results (and NOT request-1's results) - assert len(results_req2) == 5 - assert set(results_req2) == {100, 110, 120, 130, 140} - - # verify no overlap - assert set(results_req1).isdisjoint(set(results_req2)) - - manager.shutdown() - - print(" ✓ cross-pipeline contamination is prevented") - - -def test_pipeline_id_with_mixed_operations(): - """test pipeline_id isolation across different operation types.""" - print("\n21. 
testing pipeline_id with mixed operation types...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def task_type_a(x: int) -> str: - time.sleep(0.05) - return f"A-{x}" - - def task_type_b(x: int) -> str: - time.sleep(0.05) - return f"B-{x}" - - pipeline_id = "mixed-pipeline" - - # submit claims extraction jobs for this pipeline - for i in range(3): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task_type_a, - i, - pipeline_id=pipeline_id - ) - - # submit link expansion jobs for this pipeline - for i in range(3): - manager.submit( - OperationType.LINK_CONTEXT_EXPANDING, - task_type_b, - i, - pipeline_id=pipeline_id - ) - - # submit claims extraction jobs for a different pipeline - other_pipeline = "other-pipeline" - for i in range(10, 12): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task_type_a, - i, - pipeline_id=other_pipeline - ) - - # wait for claims extraction jobs from mixed-pipeline - results = [] - for _ in range(3): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - pipeline_id=pipeline_id - ) - results.append(result) - - # verify it's from the correct pipeline - with manager.running_jobs_lock: - job = manager.completed_jobs[job_id] - assert job.pipeline_id == pipeline_id - - assert len(results) == 3 - assert set(results) == {"A-0", "A-1", "A-2"} - - # wait for link expansion jobs from mixed-pipeline - link_results = [] - for _ in range(3): - job_id, result = manager.wait_next_completed( - OperationType.LINK_CONTEXT_EXPANDING, - pipeline_id=pipeline_id - ) - link_results.append(result) - - # verify it's from the correct pipeline - with manager.running_jobs_lock: - job = manager.completed_jobs[job_id] - assert job.pipeline_id == pipeline_id - - assert len(link_results) == 3 - assert set(link_results) == {"B-0", "B-1", "B-2"} - - # wait for claims extraction jobs from other-pipeline - other_results = [] - for _ in range(2): - 
job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - pipeline_id=other_pipeline - ) - other_results.append(result) - - assert len(other_results) == 2 - assert set(other_results) == {"A-10", "A-11"} - - manager.shutdown() - - print(" ✓ pipeline_id isolation works across different operation types") - - -def test_clear_completed_jobs(): - """test clear_completed_jobs removes all jobs for a specific pipeline.""" - print("\n22. testing clear_completed_jobs...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def fast_task(x: int) -> int: - time.sleep(0.05) - return x * 2 - - pipeline_a = "pipeline-A" - pipeline_b = "pipeline-B" - pipeline_c = "pipeline-C" - - # submit 5 jobs for pipeline A (CLAIMS_EXTRACTION) - for i in range(5): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - fast_task, - i, - pipeline_id=pipeline_a - ) - - # submit 3 jobs for pipeline B (CLAIMS_EXTRACTION) - for i in range(10, 13): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - fast_task, - i, - pipeline_id=pipeline_b - ) - - # submit mixed operation types for pipeline C - for i in range(20, 22): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - fast_task, - i, - pipeline_id=pipeline_c - ) - for i in range(22, 25): - manager.submit( - OperationType.LINK_CONTEXT_EXPANDING, - fast_task, - i, - pipeline_id=pipeline_c - ) - - # wait for all jobs to complete - time.sleep(0.5) - - # clear all jobs for pipeline A (specific operation type) - cleared_a = manager.clear_completed_jobs( - pipeline_id=pipeline_a, - operation_type=OperationType.CLAIMS_EXTRACTION - ) - - assert cleared_a == 5, f"should have cleared 5 jobs, got {cleared_a}" - - # verify pipeline A jobs are gone (timeout immediately) - try: - manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=0.01, - pipeline_id=pipeline_a - ) - assert False, "should not find any pipeline A jobs" - except TimeoutError: - 
pass # expected - no jobs left - - # verify pipeline B jobs are still there - results_b = [] - for _ in range(3): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=1.0, - pipeline_id=pipeline_b - ) - results_b.append(result) - - assert len(results_b) == 3 - assert set(results_b) == {20, 22, 24} - - # clear pipeline B jobs (without specifying operation_type) - cleared_b = manager.clear_completed_jobs( - pipeline_id=pipeline_b - ) - - assert cleared_b == 0, "pipeline B jobs were already consumed" - - # clear all jobs for pipeline C (across all operation types) - cleared_c = manager.clear_completed_jobs( - pipeline_id=pipeline_c - ) - - assert cleared_c == 5, f"should have cleared 5 jobs (2 CLAIMS + 3 LINK), got {cleared_c}" - - # verify pipeline C jobs are gone from both operation types - try: - manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=0.01, - pipeline_id=pipeline_c - ) - assert False, "should not find any pipeline C CLAIMS_EXTRACTION jobs" - except TimeoutError: - pass # expected - - try: - manager.wait_next_completed( - OperationType.LINK_CONTEXT_EXPANDING, - timeout=0.01, - pipeline_id=pipeline_c - ) - assert False, "should not find any pipeline C LINK_CONTEXT_EXPANDING jobs" - except TimeoutError: - pass # expected - - manager.shutdown() - - print(" ✓ clear_completed_jobs works correctly") - - -def test_clear_completed_jobs_all_operation_types(): - """test clear_completed_jobs without operation_type clears all types.""" - print("\n23. 
testing clear_completed_jobs for all operation types...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=10) - manager.initialize() - - def task_a(x: int) -> str: - time.sleep(0.05) - return f"A-{x}" - - def task_b(x: int) -> str: - time.sleep(0.05) - return f"B-{x}" - - def task_c(x: int) -> str: - time.sleep(0.05) - return f"C-{x}" - - pipeline_target = "pipeline-target" - pipeline_other = "pipeline-other" - - # submit jobs for target pipeline across 3 different operation types - # 3 CLAIMS_EXTRACTION jobs - for i in range(3): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task_a, - i, - pipeline_id=pipeline_target - ) - - # 2 LINK_CONTEXT_EXPANDING jobs - for i in range(10, 12): - manager.submit( - OperationType.LINK_CONTEXT_EXPANDING, - task_b, - i, - pipeline_id=pipeline_target - ) - - # 4 LINK_EVIDENCE_RETRIEVER jobs - for i in range(20, 24): - manager.submit( - OperationType.LINK_EVIDENCE_RETRIEVER, - task_c, - i, - pipeline_id=pipeline_target - ) - - # submit jobs for other pipeline (should not be affected) - for i in range(100, 102): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task_a, - i, - pipeline_id=pipeline_other - ) - - # wait for all jobs to complete - time.sleep(0.8) - - # clear all jobs for target pipeline WITHOUT specifying operation_type - cleared = manager.clear_completed_jobs(pipeline_id=pipeline_target) - - # should have cleared 3 + 2 + 4 = 9 jobs - assert cleared == 9, f"should have cleared 9 jobs (3+2+4), got {cleared}" - - # verify target pipeline jobs are gone from all operation types - for op_type in [ - OperationType.CLAIMS_EXTRACTION, - OperationType.LINK_CONTEXT_EXPANDING, - OperationType.LINK_EVIDENCE_RETRIEVER - ]: - try: - manager.wait_next_completed( - op_type, - timeout=0.01, - pipeline_id=pipeline_target - ) - assert False, f"should not find any {op_type.name} jobs for target pipeline" - except TimeoutError: - pass # expected - all jobs cleared - - # verify other 
pipeline jobs are still there - results_other = [] - for _ in range(2): - job_id, result = manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=1.0, - pipeline_id=pipeline_other - ) - results_other.append(result) - - assert len(results_other) == 2 - assert set(results_other) == {"A-100", "A-101"} - - manager.shutdown() - - print(" ✓ clear_completed_jobs clears all operation types when not specified") - - -def test_clear_completed_jobs_async_non_blocking(): - """test clear_completed_jobs_async runs in background without blocking.""" - print("\n24. testing clear_completed_jobs_async (non-blocking)...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def task(x: int) -> int: - time.sleep(0.05) - return x * 2 - - pipeline_id = "background-cleanup" - - # submit 10 jobs - for i in range(10): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task, - i, - pipeline_id=pipeline_id - ) - - # wait for jobs to complete - time.sleep(0.8) - - # start cleanup in background - start_time = time.time() - cleanup_future = manager.clear_completed_jobs_async(pipeline_id=pipeline_id) - - # function should return immediately (non-blocking) - call_duration = time.time() - start_time - assert call_duration < 0.1, f"should return immediately, took {call_duration:.3f}s" - - print(f" cleanup submitted in {call_duration*1000:.1f}ms (non-blocking)") - - # server can continue processing here while cleanup runs in background - # simulate server doing other work - time.sleep(0.1) - - # wait for cleanup to complete - cleared = cleanup_future.result(timeout=2.0) - - assert cleared == 10, f"should have cleared 10 jobs, got {cleared}" - - # verify jobs are gone - try: - manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=0.01, - pipeline_id=pipeline_id - ) - assert False, "should not find any jobs" - except TimeoutError: - pass # expected - - manager.shutdown() - - print(" ✓ 
clear_completed_jobs_async runs in background without blocking") - - -def test_clear_completed_jobs_async_fire_and_forget(): - """test clear_completed_jobs_async can be used as fire-and-forget.""" - print("\n25. testing clear_completed_jobs_async (fire-and-forget)...") - - ThreadPoolManager._instance = None - manager = ThreadPoolManager.get_instance(max_workers=5) - manager.initialize() - - def task(x: int) -> int: - time.sleep(0.05) - return x * 2 - - pipeline_id = "fire-and-forget" - - # submit 5 jobs - for i in range(5): - manager.submit( - OperationType.CLAIMS_EXTRACTION, - task, - i, - pipeline_id=pipeline_id - ) - - # wait for jobs to complete - time.sleep(0.5) - - # fire-and-forget cleanup (don't wait for result) - _cleanup_future = manager.clear_completed_jobs_async(pipeline_id=pipeline_id) - - # server continues immediately, doesn't care about cleanup result - print(" server continues processing, cleanup runs in background") - - # give cleanup time to finish (in real scenario, we wouldn't wait) - time.sleep(0.5) - - # verify cleanup happened - try: - manager.wait_next_completed( - OperationType.CLAIMS_EXTRACTION, - timeout=0.01, - pipeline_id=pipeline_id - ) - assert False, "should not find any jobs" - except TimeoutError: - pass # expected - cleanup completed - - manager.shutdown() - - print(" ✓ fire-and-forget cleanup works") - - -def run_all_tests(): - """run all tests.""" - print("=" * 60) - print("running thread pool manager tests") - print("=" * 60) - - # sync tests - test_operation_type_weights() - test_job_priority_ordering() - test_thread_pool_initialization() - test_singleton_pattern() - test_submit_and_wait() - test_priority_ordering() - test_map_threaded() - test_context_manager() - test_with_thread_pool() - test_error_handling() - test_get_status() - test_wait_next_completed() - test_wait_next_completed_any() - test_wait_next_completed_timeout() - test_wait_next_completed_error_handling() - test_pipeline_id_job_tracking() - 
test_pipeline_id_isolation() - test_pipeline_id_cross_contamination_prevention() - test_pipeline_id_with_mixed_operations() - test_clear_completed_jobs() - test_clear_completed_jobs_all_operation_types() - test_clear_completed_jobs_async_non_blocking() - test_clear_completed_jobs_async_fire_and_forget() - - print("\n" + "=" * 60) - print("✓ all tests passed!") - print("=" * 60) - - -if __name__ == "__main__": - run_all_tests() diff --git a/app/ai/threads/thread_utils.py b/app/ai/threads/thread_utils.py deleted file mode 100644 index 2ceca4f..0000000 --- a/app/ai/threads/thread_utils.py +++ /dev/null @@ -1,816 +0,0 @@ -""" -thread-based job queue system for fact-checking pipeline. - -provides priority-based job scheduling using ThreadPoolExecutor and PriorityQueue, -with async bridge for awaitable completion and structured observability. -""" - -import asyncio -import logging -import os -import queue -import threading -import time -import uuid -from concurrent.futures import Future, ThreadPoolExecutor -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Callable, Dict, List, Optional, TypeVar - -logger = logging.getLogger(__name__) - -T = TypeVar("T") -R = TypeVar("R") - -# default max workers optimized for Apify free tier (8GB RAM) -# with complex actors needing 1-2GB, safer to limit to 4 concurrent jobs -# can be overridden via THREAD_POOL_MAX_WORKERS env var -DEFAULT_MAX_WORKERS = int(os.getenv("THREAD_POOL_MAX_WORKERS", "4")) - - -class OperationType(Enum): - """ - operation types for fact-checking pipeline with priority weights. - - higher weight = higher priority in job queue. 
- """ - CLAIMS_EXTRACTION = 10 # highest priority - critical path - ADJUDICATION_WITH_SEARCH = 8 # high priority - final adjudication with real-time search - LINK_EXPANSION_PIPELINE = 6 # high priority - full link expansion + DataSource creation - LINK_CONTEXT_EXPANDING = 5 # medium priority - individual URL scraping - LINK_EVIDENCE_RETRIEVER = 3 # lowest priority - evidence retrieval - - @property - def weight(self) -> int: - """get priority weight for this operation type.""" - return self.value - - -@dataclass(order=True) -class Job: - """ - represents a job in the priority queue. - - jobs are ordered by priority (higher weight first), then by creation time (FIFO). - """ - # priority field for heap ordering (negated for max-heap behavior) - priority: int = field(init=False, compare=True) - - # actual job data (not used in comparison) - id: str = field(compare=False) - operation_type: OperationType = field(compare=False) - func: Callable = field(compare=False) - args: tuple = field(default_factory=tuple, compare=False) - kwargs: dict = field(default_factory=dict, compare=False) - future: Future = field(default_factory=Future, compare=False) - created_at: float = field(default_factory=time.time, compare=False) - pipeline_id: Optional[str] = field(default=None, compare=False) - - def __post_init__(self): - """calculate priority after initialization.""" - # negate weight for max-heap (higher weight = lower priority value = processed first) - # add small timestamp component to break ties with FIFO - self.priority = -self.operation_type.weight - (self.created_at / 1e10) - - -class ThreadPoolManager: - """ - singleton thread pool manager with priority-based job scheduling. - - manages a fixed-size thread pool and dispatches jobs from a priority queue. - threads are pre-allocated and named for observability. 
- """ - - _instance: Optional["ThreadPoolManager"] = None - _lock = threading.Lock() - - def __init__(self, max_workers: int = None): - """ - initialize thread pool manager. - - args: - max_workers: number of worker threads to pre-allocate - (default: 10, optimized for Apify 8GB RAM free tier) - """ - if max_workers is None: - max_workers = DEFAULT_MAX_WORKERS - self.max_workers = max_workers - self.executor: Optional[ThreadPoolExecutor] = None - self.job_queue: queue.PriorityQueue = queue.PriorityQueue() - - # job tracking - self.running_jobs: Dict[str, Job] = {} - self.completed_jobs: Dict[str, Job] = {} - self.running_jobs_lock = threading.Lock() - - # completion queues for consumer pattern (per operation type) - self.completion_queues: Dict[OperationType, queue.Queue] = { - op_type: queue.Queue() for op_type in OperationType - } - # global completion queue for all operations - self.global_completion_queue: queue.Queue = queue.Queue() - - # dispatcher control - self.dispatcher_thread: Optional[threading.Thread] = None - self.dispatcher_running = False - self.shutdown_event = threading.Event() - - self._initialized = False - - @classmethod - def get_instance(cls, max_workers: int = None) -> "ThreadPoolManager": - """ - get or create singleton instance (thread-safe). - - args: - max_workers: number of worker threads (only used on first call) - (default: 10, optimized for Apify 8GB RAM free tier) - - returns: - ThreadPoolManager singleton instance - """ - if cls._instance is None: - with cls._lock: - if cls._instance is None: - cls._instance = cls(max_workers=max_workers) - return cls._instance - - def initialize(self): - """ - initialize thread pool and start dispatcher. - - creates ThreadPoolExecutor with named threads and starts background - dispatcher thread. safe to call multiple times (idempotent). 
- """ - if self._initialized: - return - - with self._lock: - if self._initialized: - return - - # create thread pool with named threads - self.executor = ThreadPoolExecutor( - max_workers=self.max_workers, - thread_name_prefix="FactCheck-Worker" - ) - - # start dispatcher thread - self.dispatcher_running = True - self.shutdown_event.clear() - self.dispatcher_thread = threading.Thread( - target=self._dispatch_loop, - name="FactCheck-Dispatcher", - daemon=True - ) - self.dispatcher_thread.start() - - self._initialized = True - - def shutdown(self, wait: bool = True, timeout: Optional[float] = None): - """ - shutdown thread pool and dispatcher. - - args: - wait: if True, wait for running jobs to complete - timeout: maximum time to wait in seconds (None = wait forever) - """ - if not self._initialized: - return - - # stop dispatcher - self.dispatcher_running = False - self.shutdown_event.set() - - # wait for dispatcher to finish - if self.dispatcher_thread and self.dispatcher_thread.is_alive(): - self.dispatcher_thread.join(timeout=5.0) - - # shutdown executor - if self.executor: - self.executor.shutdown(wait=wait, cancel_futures=not wait) - self.executor = None - - self._initialized = False - - def submit( - self, - operation_type: OperationType, - func: Callable, - *args, - pipeline_id: Optional[str] = None, - **kwargs - ) -> Future: - """ - submit a job to the priority queue. - - args: - operation_type: type of operation (determines priority) - func: function to execute - *args: positional arguments for func - pipeline_id: optional pipeline ID for request isolation - **kwargs: keyword arguments for func - - returns: - Future that will contain the result when job completes - - example: - >>> manager = ThreadPoolManager.get_instance() - >>> manager.initialize() - >>> future = manager.submit( - ... OperationType.CLAIMS_EXTRACTION, - ... extract_claims, - ... text="some text", - ... pipeline_id="request-123" - ... 
) - >>> result = future.result() # blocks until complete - """ - if not self._initialized: - raise RuntimeError("thread pool manager not initialized. call initialize() first") - - # create job - job = Job( - id=str(uuid.uuid4()), - operation_type=operation_type, - func=func, - args=args, - kwargs=kwargs, - pipeline_id=pipeline_id - ) - - # add to priority queue - self.job_queue.put(job) - - return job.future - - async def submit_async( - self, - operation_type: OperationType, - func: Callable, - *args, - pipeline_id: Optional[str] = None, - **kwargs - ) -> Any: - """ - submit a job and await completion (async bridge). - - args: - operation_type: type of operation (determines priority) - func: function to execute (can be sync or async) - *args: positional arguments for func - pipeline_id: optional pipeline ID for request isolation - **kwargs: keyword arguments for func - - returns: - result from func - - example: - >>> manager = ThreadPoolManager.get_instance() - >>> manager.initialize() - >>> result = await manager.submit_async( - ... OperationType.CLAIMS_EXTRACTION, - ... extract_claims, - ... text="some text", - ... pipeline_id="request-123" - ... ) - """ - future = self.submit(operation_type, func, *args, pipeline_id=pipeline_id, **kwargs) - - # bridge sync Future to async - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, future.result) - - def _dispatch_loop(self): - """ - background dispatcher loop. - - continuously pulls jobs from priority queue and submits them to thread pool. - runs in dedicated dispatcher thread. 
- """ - while self.dispatcher_running or not self.job_queue.empty(): - try: - # get next job from priority queue (1 second timeout) - job = self.job_queue.get(timeout=1.0) - - # track running job - with self.running_jobs_lock: - self.running_jobs[job.id] = job - - # submit to thread pool - self.executor.submit(self._execute_job, job) - - # logger.debug can be slow, use print for dispatcher debugging - # print(f"[DISPATCHER] Dispatched {job.operation_type.name} job {job.id}") - - except queue.Empty: - # no jobs available, continue loop - continue - except Exception as e: - logger.error(f"dispatcher error: {e}", exc_info=True) - - def _execute_job(self, job: Job): - """ - execute a job and set result/exception on future. - - args: - job: job to execute - """ - start_time = time.time() - - try: - # execute function - result = job.func(*job.args, **job.kwargs) - - # set result on future - job.future.set_result(result) - - # put result in completion queues for consumer pattern - self.completion_queues[job.operation_type].put((job.id, result)) - self.global_completion_queue.put((job.operation_type, job.id, result)) - - except Exception as e: - # set exception on future - job.future.set_exception(e) - - # put exception in completion queues for consumer pattern - self.completion_queues[job.operation_type].put((job.id, e)) - self.global_completion_queue.put((job.operation_type, job.id, e)) - - # only print on error for debugging - elapsed = time.time() - start_time - print(f"[THREAD ERROR] Job {job.id} ({job.operation_type.name}) FAILED in {elapsed:.2f}s: {type(e).__name__}: {e}") - import traceback - traceback.print_exc() - - finally: - # move from running to completed - with self.running_jobs_lock: - if job.id in self.running_jobs: - del self.running_jobs[job.id] - self.completed_jobs[job.id] = job - - def get_status(self) -> dict: - """ - get current status of thread pool and job queue. 
- - returns: - dict with queue_size, running_jobs, completed_jobs, max_workers - """ - with self.running_jobs_lock: - return { - "queue_size": self.job_queue.qsize(), - "running_jobs": len(self.running_jobs), - "completed_jobs": len(self.completed_jobs), - "max_workers": self.max_workers, - "initialized": self._initialized, - } - - def wait_next_completed( - self, - operation_type: OperationType, - timeout: Optional[float] = None, - raise_on_error: bool = True, - pipeline_id: Optional[str] = None, - ) -> Any: - """ - wait for and return result of next completed job of given operation type. - - useful for processing results as they arrive instead of waiting for all. - - args: - operation_type: operation type to wait for - timeout: max time to wait in seconds (None = wait forever) - raise_on_error: if True, raise exception if job failed; if False, return exception object - pipeline_id: optional pipeline ID for request isolation. if provided, only jobs - with matching pipeline_id will be returned. jobs with non-matching - pipeline_id will be put back in the queue for other waiters. - - returns: - tuple of (job_id, result) where result is the job's return value or exception - - raises: - TimeoutError: if timeout exceeded with no completion (via queue.Empty) - Exception: if job failed and raise_on_error is True - - example: - >>> # submit 10 claim extraction jobs for pipeline "req-123" - >>> for claim in claims: - ... manager.submit( - ... OperationType.CLAIMS_EXTRACTION, - ... extract, - ... claim, - ... pipeline_id="req-123" - ... ) - >>> - >>> # process results as they complete (streaming pattern) - >>> for _ in range(10): - ... job_id, result = manager.wait_next_completed( - ... OperationType.CLAIMS_EXTRACTION, - ... pipeline_id="req-123" - ... ) - ... 
print(f"job {job_id} completed: {result}") - """ - # track non-matching jobs to put back - non_matching_jobs = [] - - try: - while True: - try: - job_id, result = self.completion_queues[operation_type].get( - block=True, - timeout=timeout - ) - - # if pipeline_id filtering is enabled, check if job matches - if pipeline_id is not None: - # look up the job to check its pipeline_id - with self.running_jobs_lock: - job = self.completed_jobs.get(job_id) - - # if job doesn't match our pipeline_id, save it for later - if job is None or job.pipeline_id != pipeline_id: - non_matching_jobs.append((job_id, result)) - print(f"trying to match job of type: {operation_type}") - continue # try next job - - # job matches (or no filtering) - return it - # if result is an exception, handle based on raise_on_error - if isinstance(result, Exception): - if raise_on_error: - raise result - return job_id, result - - return job_id, result - - except queue.Empty: - raise TimeoutError( - f"no completed jobs of type {operation_type.name} " - f"{'for pipeline ' + pipeline_id if pipeline_id else ''} " - f"within {timeout}s" - ) - finally: - # put non-matching jobs back into queue for other waiters - for job_id, result in non_matching_jobs: - self.completion_queues[operation_type].put((job_id, result)) - - def clear_completed_jobs( - self, - pipeline_id: str, - operation_type: Optional[OperationType] = None, - ) -> int: - """ - remove all completed jobs for a specific pipeline_id. - - this is a non-blocking cleanup operation that drains the completion queue - for the specified pipeline without waiting for or processing results. - - args: - pipeline_id: pipeline ID to filter by (required) - operation_type: optional type of jobs to clear. if None, clears all operation types. 
- - returns: - number of jobs cleared - - example: - >>> # clear all completed jobs for request-123 (all operation types) - >>> manager = ThreadPoolManager.get_instance() - >>> cleared = manager.clear_completed_jobs(pipeline_id="request-123") - >>> print(f"cleared {cleared} jobs") - - >>> # clear only claims extraction jobs for request-123 - >>> cleared = manager.clear_completed_jobs( - ... pipeline_id="request-123", - ... operation_type=OperationType.CLAIMS_EXTRACTION - ... ) - >>> print(f"cleared {cleared} jobs") - """ - cleared_count = 0 - - # if operation_type is None, clear from all operation types - if operation_type is None: - operation_types = list(OperationType) - else: - operation_types = [operation_type] - - # drain jobs from each operation type - for op_type in operation_types: - while True: - try: - # use very short timeout for non-blocking behavior - _job_id, _result = self.wait_next_completed( - operation_type=op_type, - timeout=0.001, # 1ms timeout = effectively non-blocking - raise_on_error=False, # don't raise on job errors - pipeline_id=pipeline_id - ) - cleared_count += 1 - except TimeoutError: - # no more jobs available for this operation type - break - - return cleared_count - - def clear_completed_jobs_async( - self, - pipeline_id: str, - operation_type: Optional[OperationType] = None, - ) -> Future: - """ - non-blocking background cleanup of completed jobs for a specific pipeline_id. - - this submits the cleanup as a background task and returns immediately, - allowing the server to continue responding while cleanup happens in the background. - - args: - pipeline_id: pipeline ID to filter by (required) - operation_type: optional type of jobs to clear. if None, clears all operation types. 
- - returns: - Future that will contain the number of jobs cleared when complete - - example: - >>> # submit cleanup in background and continue - >>> manager = ThreadPoolManager.get_instance() - >>> future = manager.clear_completed_jobs_async(pipeline_id="request-123") - >>> # server continues, cleanup happens in background - >>> # optionally check result later - >>> if future.done(): - ... cleared = future.result() - ... print(f"cleared {cleared} jobs") - """ - # use threading to avoid blocking the calling thread - import threading - - def cleanup_task(): - """background cleanup task""" - return self.clear_completed_jobs( - pipeline_id=pipeline_id, - operation_type=operation_type - ) - - # create future to track completion - future = Future() - - def run_cleanup(): - """wrapper that sets result on future""" - try: - result = cleanup_task() - future.set_result(result) - except Exception as e: - future.set_exception(e) - - # run in background thread - cleanup_thread = threading.Thread( - target=run_cleanup, - daemon=True, # daemon thread won't block shutdown - name=f"cleanup-{pipeline_id}" - ) - cleanup_thread.start() - - return future - - def wait_next_completed_any( - self, - timeout: Optional[float] = None, - raise_on_error: bool = True, - ) -> tuple: - """ - wait for and return result of next completed job of any operation type. - - useful for processing results from mixed operation types as they arrive. 
- - args: - timeout: max time to wait in seconds (None = wait forever) - raise_on_error: if True, raise exception if job failed; if False, return exception object - - returns: - tuple of (operation_type, job_id, result) - - raises: - TimeoutError: if timeout exceeded with no completion (via queue.Empty) - Exception: if job failed and raise_on_error is True - - example: - >>> # submit mixed jobs - >>> manager.submit(OperationType.CLAIMS_EXTRACTION, extract, claim1) - >>> manager.submit(OperationType.LINK_CONTEXT_EXPANDING, expand, link1) - >>> manager.submit(OperationType.CLAIMS_EXTRACTION, extract, claim2) - >>> - >>> # process ANY result as it completes - >>> for _ in range(3): - ... op_type, job_id, result = manager.wait_next_completed_any() - ... print(f"{op_type.name} job {job_id}: {result}") - """ - try: - operation_type, job_id, result = self.global_completion_queue.get( - block=True, - timeout=timeout - ) - - # if result is an exception, handle based on raise_on_error - if isinstance(result, Exception): - if raise_on_error: - raise result - return operation_type, job_id, result - - return operation_type, job_id, result - - except queue.Empty: - raise TimeoutError( - f"no completed jobs of any type within {timeout}s" - ) - - -# helper functions - - -def map_threaded( - items: List[T], - func: Callable[[T], R], - operation_type: OperationType, - manager: Optional[ThreadPoolManager] = None, -) -> List[R]: - """ - map function over items using thread pool (blocking). - - args: - items: list of items to process - func: function to apply to each item - operation_type: operation type (determines priority) - manager: thread pool manager (uses singleton if None) - - returns: - list of results in same order as input items - - example: - >>> def process_link(url: str) -> str: - ... return fetch_url(url) - >>> - >>> results = map_threaded( - ... items=urls, - ... func=process_link, - ... operation_type=OperationType.LINK_CONTEXT_EXPANDING - ... 
) - """ - if manager is None: - manager = ThreadPoolManager.get_instance() - - if not manager._initialized: - raise RuntimeError("thread pool manager not initialized") - - # submit all jobs - futures = [ - manager.submit(operation_type, func, item) - for item in items - ] - - # wait for all to complete and collect results - results = [future.result() for future in futures] - - return results - - -async def map_threaded_async( - items: List[T], - func: Callable[[T], R], - operation_type: OperationType, - manager: Optional[ThreadPoolManager] = None, -) -> List[R]: - """ - map function over items using thread pool (async). - - args: - items: list of items to process - func: function to apply to each item - operation_type: operation type (determines priority) - manager: thread pool manager (uses singleton if None) - - returns: - list of results in same order as input items - - example: - >>> results = await map_threaded_async( - ... items=urls, - ... func=fetch_url, - ... operation_type=OperationType.LINK_CONTEXT_EXPANDING - ... ) - """ - if manager is None: - manager = ThreadPoolManager.get_instance() - - if not manager._initialized: - raise RuntimeError("thread pool manager not initialized") - - # submit all jobs and await completion - results = await asyncio.gather(*[ - manager.submit_async(operation_type, func, item) - for item in items - ]) - - return list(results) - - -def wait_all(futures: List[Future], timeout: Optional[float] = None) -> List[Any]: - """ - wait for all futures to complete and return results. 
- - args: - futures: list of futures to wait for - timeout: maximum time to wait in seconds (None = wait forever) - - returns: - list of results in same order as futures - - raises: - TimeoutError: if timeout is exceeded - """ - results = [] - start_time = time.time() - - for future in futures: - if timeout is not None: - elapsed = time.time() - start_time - remaining = timeout - elapsed - if remaining <= 0: - raise TimeoutError(f"wait_all timed out after {timeout}s") - result = future.result(timeout=remaining) - else: - result = future.result() - - results.append(result) - - return results - - -class ThreadPoolContext: - """ - context manager for thread pool lifecycle. - - ensures proper initialization and cleanup of ThreadPoolManager. - """ - - def __init__(self, max_workers: int = 25): - """ - initialize context manager. - - args: - max_workers: number of worker threads - """ - self.max_workers = max_workers - self.manager: Optional[ThreadPoolManager] = None - - def __enter__(self) -> ThreadPoolManager: - """initialize pool on context entry.""" - self.manager = ThreadPoolManager.get_instance(max_workers=self.max_workers) - self.manager.initialize() - return self.manager - - def __exit__(self, _exc_type, _exc_val, _exc_tb): - """cleanup pool on context exit.""" - if self.manager: - self.manager.shutdown(wait=True) - return False - - -async def __aenter__(self) -> ThreadPoolManager: - """initialize pool on async context entry.""" - self.manager = ThreadPoolManager.get_instance(max_workers=self.max_workers) - self.manager.initialize() - return self.manager - - -async def __aexit__(self, _exc_type, _exc_val, _exc_tb): - """cleanup pool on async context exit.""" - if self.manager: - self.manager.shutdown(wait=True) - return False - - -# convenience function -def with_thread_pool( - func: Callable[[ThreadPoolManager], R], - max_workers: int = 25 -) -> R: - """ - execute function with automatic pool lifecycle management. 
- - args: - func: function that takes ThreadPoolManager as argument - max_workers: number of worker threads - - returns: - result from func - - example: - >>> def process_data(manager: ThreadPoolManager): - ... return map_threaded( - ... items=data, - ... func=process_item, - ... operation_type=OperationType.CLAIMS_EXTRACTION, - ... manager=manager - ... ) - >>> - >>> results = with_thread_pool(process_data) - """ - with ThreadPoolContext(max_workers=max_workers) as manager: - return func(manager) diff --git a/app/ai/utils.py b/app/ai/utils.py deleted file mode 100644 index a126ae7..0000000 --- a/app/ai/utils.py +++ /dev/null @@ -1,175 +0,0 @@ -from app.models import FactCheckResult, VerdictTypeEnum -from app.ai.threads.thread_utils import ThreadPoolManager, OperationType -from app.observability.logger import get_logger, PipelineStep -from app.ai.log_utils import log_adjudication_output - - -def _chose_fact_checking_result( - original_result: FactCheckResult, - manager: ThreadPoolManager, - pipeline_id: str -) -> FactCheckResult: - """ - determines if the fact check result returned to the user will be the one from - the regular step or the fallback from adjudication with search. 
- - This is done by checking if the fallback result has found reliable fact-checking sources - for claims the main model does not have or if the fallback model and the main model disagree - on the number of false veredicts, favoring the one with more false veredicts to be conservative with fact-checking - - args: - original_result: the fact check result from normal adjudication - manager: thread pool manager to wait for adjudication_with_search job - pipeline_id: pipeline identifier to filter jobs - - returns: - either the original result or the adjudication_with_search result - """ - logger = get_logger(__name__, PipelineStep.ADJUDICATION) - try: - - total_num_claims = sum( - 1 - for result in original_result.results - for verdict in result.claim_verdicts - ) - - insufficient_old = sum( - 1 if verdict.verdict == "Fontes insuficientes para verificar" else 0 - for result in original_result.results - for verdict in result.claim_verdicts - ) - - number_of_original_false_claims = sum( - 1 if (verdict.verdict == VerdictTypeEnum.FALSO) or (verdict.verdict == VerdictTypeEnum.FORA_DE_CONTEXTO) else 0 - for result in original_result.results - for verdict in result.claim_verdicts - ) - - # All claims from the main fact-checking were already verified to be false, there is no condition where the fallback would be preffered - if total_num_claims > 0 and total_num_claims == number_of_original_false_claims: - return original_result - - # wait for adjudication_with_search job to complete (20 second timeout) - job_id, search_result = manager.wait_next_completed( - operation_type=OperationType.ADJUDICATION_WITH_SEARCH, - timeout=20.0, - raise_on_error=False, # don't raise on error, we'll check the result - pipeline_id=pipeline_id - ) - - # check if result is valid - if isinstance(search_result, Exception): - logger.warning( - f"adjudication_with_search job failed: {type(search_result).__name__}: {search_result}" - ) - # if original adjudication also failed, raise error - if 
len(original_result.results) == 0: - logger.error("both normal adjudication and fallback failed - raising error") - raise RuntimeError( - f"adjudication failed and fallback also failed. " - f"normal adjudication returned no results and adjudication_with_search failed: {search_result}" - ) from search_result - logger.info("using original insufficient sources result") - return original_result - elif search_result is None or not isinstance(search_result, FactCheckResult): - logger.warning( - f"adjudication_with_search returned invalid result: {type(search_result)}" - ) - # if original adjudication also failed, raise error - if len(original_result.results) == 0: - logger.error("both normal adjudication and fallback failed - raising error") - raise RuntimeError( - f"adjudication failed and fallback returned invalid result. " - f"normal adjudication returned no results and adjudication_with_search returned: {type(search_result)}" - ) - logger.info("using original insufficient sources result") - return original_result - elif len(search_result.results) == 0: - logger.warning("adjudication_with_search returned empty results") - # if original adjudication also failed, raise error - if len(original_result.results) == 0: - logger.error("both normal adjudication and fallback failed - raising error") - raise RuntimeError( - "adjudication failed and fallback returned empty results. " - "both normal adjudication and adjudication_with_search returned no results." 
- ) - logger.info("using original insufficient sources result") - return original_result - else: - # both results are valid - compare - - - number_of_fallback_false_claims = sum( - 1 if (verdict.verdict == VerdictTypeEnum.FALSO) or (verdict.verdict == VerdictTypeEnum.FORA_DE_CONTEXTO) else 0 - for result in search_result.results - for verdict in result.claim_verdicts - ) - - logger.info( - f"comparing results - original false claims: {number_of_original_false_claims}, " - f"fallback false claims: {number_of_fallback_false_claims}" - ) - - if number_of_fallback_false_claims >= number_of_original_false_claims: - logger.info( - f"[FALLBACK] using adjudication_with_search result (more false claims): " - f"{len(search_result.results)} results, " - f"{sum(len(r.claim_verdicts) for r in search_result.results)} verdicts" - ) - - # log both outputs for comparison - logger.info("[ORIGINAL OUTPUT - Insufficient Sources]") - log_adjudication_output(original_result) - logger.info("[FALLBACK OUTPUT - Adjudication with Search]") - log_adjudication_output(search_result) - - return search_result - else: - logger.info( - f"[ORIGINAL] using original result (fallback has fewer or equal false claims): " - f"{len(original_result.results)} results, " - f"{sum(len(r.claim_verdicts) for r in original_result.results)} verdicts" - ) - - # log both outputs for comparison - logger.info("[ORIGINAL OUTPUT - Selected]") - log_adjudication_output(original_result) - logger.info("[FALLBACK OUTPUT - Not Selected]") - log_adjudication_output(search_result) - - return original_result - - except TimeoutError: - logger.warning( - "adjudication_with_search job did not complete within 20 seconds" - ) - # if original adjudication failed (empty results), we can't return empty result - if len(original_result.results) == 0: - logger.error("both normal adjudication and fallback failed - raising error") - raise RuntimeError( - "adjudication failed and fallback did not complete in time. 
" - "normal adjudication returned no results and adjudication_with_search timed out after 20s." - ) - logger.info("using original insufficient sources result") - return original_result - except Exception as e: - logger.error( - f"error while waiting for adjudication_with_search: {type(e).__name__}: {e}", - exc_info=True - ) - - # if this is already a RuntimeError we raised earlier, just re-raise it - # (don't double-wrap our own error messages) - if isinstance(e, RuntimeError): - raise - - # for other exceptions, check if we need to raise or return original - if len(original_result.results) == 0: - logger.error("both normal adjudication and fallback failed - raising error") - raise RuntimeError( - f"adjudication failed and fallback also failed. " - f"normal adjudication returned no results and adjudication_with_search error: {e}" - ) from e - logger.info("using original insufficient sources result") - return original_result \ No newline at end of file diff --git a/app/api/endpoints/__init__.py b/app/api/endpoints/__init__.py index 33bd6a8..a103798 100644 --- a/app/api/endpoints/__init__.py +++ b/app/api/endpoints/__init__.py @@ -1,6 +1,5 @@ # Importar apenas scraping por enquanto para evitar dependências -from . import scraping, test, text, research +from . import scraping, text # from . 
import text # Descomentar quando precisar do pipeline completo -__all__ = ["scraping", "test", "text", "research"] - +__all__ = ["scraping", "text"] diff --git a/app/api/endpoints/research.py b/app/api/endpoints/research.py deleted file mode 100644 index f9ae9ce..0000000 --- a/app/api/endpoints/research.py +++ /dev/null @@ -1,167 +0,0 @@ -import time -import logging -from typing import Optional - -from fastapi import APIRouter -from pydantic import BaseModel, Field - -from app.ai.context.web import searchGoogleClaim -from app.ai.context.web.apify_utils import scrapeGenericSimple - -router = APIRouter() -logger = logging.getLogger(__name__) - - -class ClaimSearchRequest(BaseModel): - """request model for claim research""" - claim: str = Field(..., description="claim or statement to research", min_length=3) - max_results: Optional[int] = Field(default=10, description="maximum number of search results", ge=1, le=50) - enrich_with_content: Optional[bool] = Field( - default=False, - description="try to fetch full content from top result (no apify credits used)" - ) - - -class SearchResult(BaseModel): - """individual search result""" - title: str - url: str - description: str - position: int - domain: str - full_content: Optional[str] = None # enriched content from simple scraping - content_length: Optional[int] = None # length of full_content if available - scraping_success: Optional[bool] = None # whether content enrichment worked - - -class ClaimSearchResponse(BaseModel): - """response model for claim research""" - success: bool - claim: str - results: list[SearchResult] - total_results: int - processing_time_ms: int - metadata: Optional[dict] = None - error: Optional[str] = None - - -@router.post("/search-claim", response_model=ClaimSearchResponse) -async def search_claim(request: ClaimSearchRequest) -> ClaimSearchResponse: - """ - search google for information about a claim or statement. - helps with fact-checking by providing relevant search results. 
- - if enrich_with_content=true, tries to scrape full content from top result - (uses simple http scraping, no browser, no apify credits). - - examples: - - "vaccines cause autism" - - "earth is flat" - - "coffee prevents cancer" - """ - start_time = time.time() - - max_results:int - if request.max_results is None: - max_results = 5 - else: - max_results = request.max_results - - try: - result = await searchGoogleClaim(claim=request.claim, maxResults=max_results) - processing_time = int((time.time() - start_time) * 1000) - - if result["success"]: - search_results = result["results"] - - # enrich first result with full content if requested - if request.enrich_with_content and len(search_results) > 0: - logger.info(f"enriching top result with full content for claim: {request.claim}") - top_result = search_results[0] - top_url = top_result.get("url", "") - - if top_url: - # try simple scraping (no browser, no apify) - scrape_result = await scrapeGenericSimple(url=top_url, maxChars=10000) - - if scrape_result["success"]: - top_result["full_content"] = scrape_result["content"] - top_result["content_length"] = len(scrape_result["content"]) - top_result["scraping_success"] = True - logger.info(f"successfully enriched with {top_result['content_length']} chars") - else: - top_result["full_content"] = None - top_result["content_length"] = 0 - top_result["scraping_success"] = False - logger.warning(f"failed to enrich content: {scrape_result.get('error')}") - - return ClaimSearchResponse( - success=True, - claim=result["claim"], - results=[ - SearchResult(**r) for r in search_results - ], - total_results=result["total_results"], - processing_time_ms=processing_time, - metadata=result.get("metadata"), - error=None - ) - else: - return ClaimSearchResponse( - success=False, - claim=request.claim, - results=[], - total_results=0, - processing_time_ms=processing_time, - metadata=None, - error=result.get("error", "unknown error") - ) - - except Exception as e: - processing_time = 
int((time.time() - start_time) * 1000) - logger.error(f"claim search failed for '{request.claim}': {e}") - - return ClaimSearchResponse( - success=False, - claim=request.claim, - results=[], - total_results=0, - processing_time_ms=processing_time, - metadata=None, - error=str(e) - ) - - -@router.get("/research-status") -async def research_status(): - """check google search and serper fallback configuration and availability""" - import os - api_key = os.getenv("GOOGLE_SEARCH_API_KEY") - cse_cx = os.getenv("GOOGLE_CSE_CX") - serper_key = os.getenv("SERPER_API_KEY") - google_configured = bool(api_key and cse_cx) - serper_configured = bool(serper_key) - is_configured = google_configured or serper_configured - - return { - "research_available": is_configured, - "google_search_configured": google_configured, - "serper_fallback_configured": serper_configured, - "api_key_status": "configured" if api_key else "missing", - "cse_cx_status": "configured" if cse_cx else "missing", - "serper_key_status": "configured" if serper_key else "missing", - "search_engine": "google" if google_configured else ("serper-fallback" if serper_configured else "none"), - "supported_features": { - "claim_search": "available", - "fact_checking_support": "provides search results for verification", - "multi_language": "supports portuguese (pt) and other languages", - "serper_fallback": "enabled" if serper_configured else "disabled (set SERPER_API_KEY)" - }, - "api": "google-custom-search", - "note": ( - "google search ready" + (" with serper fallback" if serper_configured else "") - if google_configured - else "set GOOGLE_SEARCH_API_KEY and GOOGLE_CSE_CX in environment" - ) - } - diff --git a/app/api/endpoints/scraping.py b/app/api/endpoints/scraping.py index 5954d42..ca5615f 100644 --- a/app/api/endpoints/scraping.py +++ b/app/api/endpoints/scraping.py @@ -6,7 +6,7 @@ from fastapi import APIRouter from pydantic import BaseModel, Field -from app.ai.context.web.apify_utils import scrapeGenericUrl 
+from app.agentic_ai.context.web.apify_utils import scrapeGenericUrl router = APIRouter() logger = logging.getLogger(__name__) diff --git a/app/api/endpoints/test.py b/app/api/endpoints/test.py deleted file mode 100644 index f38b8c8..0000000 --- a/app/api/endpoints/test.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Test endpoints for development and debugging. - -provides alternative endpoints with different configurations for testing purposes. -""" -import time -import asyncio -import traceback -from fastapi import APIRouter, HTTPException -from app.models.api import Request, AnalysisResponse -from app.clients import send_analytics_payload -from app.api import request_to_data_sources -from app.api.mapper import request_to_data_sources,fact_check_result_to_response, sanitize_request, sanitize_response -from app.ai import run_fact_check_pipeline -from app.config.gemini_models import get_gemini_default_pipeline_config -from app.ai.tests.fixtures.mock_pipelinesteps import WithoutBrowsingPipelineSteps, UnverifiableMockPipelineSteps -from app.observability.logger.logger import get_logger -from app.observability.analytics import AnalyticsCollector -from app.utils.id_generator import generate_message_id - - -router = APIRouter() -logger = get_logger(__name__) - - -@router.post("/text-without-browser", response_model=AnalysisResponse) -async def analyze_text_without_browser(request: Request) -> AnalysisResponse: - """ - fact-check content without browser-based scraping (Apify). - - uses hybrid approach: - - mocks social media URLs (Facebook, Instagram, Twitter, TikTok) to avoid Apify - - uses real simple HTTP scraping for generic URLs - - only uses GoogleFactCheckGatherer for evidence (no web search) - - ideal for: - - development and testing - - avoiding Apify API costs - - faster response times - - offline testing (with limitations) - - accepts an array of content items, each with textContent and type. - returns detailed analysis with verdict, rationale, and citations. 
- """ - start_time = time.time() - msg_id = generate_message_id() - - logger.info(f"[{msg_id}] received /text-without-browser request with {len(request.content)} content item(s)") - - try: - # step 0: sanitize request to remove PII - logger.info(f"[{msg_id}] sanitizing request to remove PII") - sanitized_request = sanitize_request(request) - - analytics = AnalyticsCollector(msg_id) - # log request details - for idx, item in enumerate(sanitized_request.content): - content_preview = item.textContent[:100] if item.textContent else "None" - logger.info(f"[{msg_id}] content[{idx}]: type={item.type}, text_length={len(item.textContent or '')}, preview='{content_preview}...'") - - # step 1: convert API request to internal DataSource format - logger.info(f"[{msg_id}] converting request to data sources") - data_sources = request_to_data_sources(sanitized_request) - analytics.populate_from_data_sources(data_sources) - logger.info(f"[{msg_id}] created {len(data_sources)} data source(s)") - - # step 2: get pipeline configuration - logger.info(f"[{msg_id}] initializing gemini pipeline config (no-browser mode)") - config = get_gemini_default_pipeline_config() - - # step 3: use WithoutBrowsingPipelineSteps (hybrid mock/real) - pipeline_steps = WithoutBrowsingPipelineSteps() - logger.info(f"[{msg_id}] using WithoutBrowsingPipelineSteps (mocks social media, real simple scraping)") - - # step 4: run the async fact-checking pipeline - logger.info(f"[{msg_id}] starting fact-check pipeline") - pipeline_start = time.time() - fact_check_result = await run_fact_check_pipeline( - data_sources, - config, - pipeline_steps, - analytics, - message_id=msg_id - ) - pipeline_duration = (time.time() - pipeline_start) * 1000 - logger.info(f"[{msg_id}] pipeline completed in {pipeline_duration:.0f}ms") - - # log pipeline results - total_claims = sum(len(ds_result.claim_verdicts) for ds_result in fact_check_result.results) - logger.info(f"[{msg_id}] extracted {total_claims} claim(s) from 
{len(fact_check_result.results)} data source(s)") - - # build response - logger.info(f"[{msg_id}] building response") - response = fact_check_result_to_response(msg_id, fact_check_result) - - # sanitize response to remove PII - logger.info(f"[{msg_id}] sanitizing response to remove PII") - sanitized_response = sanitize_response(response) - - analytics.set_final_response(sanitized_response.rationale) - # only send analytics if claims were extracted - if analytics.has_extracted_claims(): - logger.info(f"[{msg_id}] sending analytics payload (claims found)") - asyncio.create_task(send_analytics_payload(analytics)) - else: - logger.info(f"[{msg_id}] skipping analytics payload (no claims extracted)") - - total_duration = (time.time() - start_time) * 1000 - logger.info(f"[{msg_id}] request completed successfully in {total_duration:.0f}ms") - - return sanitized_response - - except Exception as e: - total_duration = (time.time() - start_time) * 1000 - error_type = type(e).__name__ - logger.error(f"[{msg_id}] request failed after {total_duration:.0f}ms: {error_type}: {str(e)}") - logger.error(f"[{msg_id}] traceback:\n{traceback.format_exc()}") - raise HTTPException( - status_code=500, - detail=f"Error processing request: {str(e)}" - ) from e - -@router.post("/text-adjudication-search", response_model=AnalysisResponse) -async def analyze_text_with_adjundication_search(request: Request) -> AnalysisResponse: - """ - test endpoint for adjudication_with_search fallback logic. 
- - uses UnverifiableMockPipelineSteps which: - - mocks link expansion (no browser scraping) - - returns hard-coded unverifiable results for normal adjudication - - triggers the adjudication_with_search fallback with OpenAI web search - - ideal for: - - testing the fallback mechanism when all sources are insufficient - - integration testing of adjudication_with_search - - verifying that OpenAI web search provides better results - """ - start_time = time.time() - msg_id = generate_message_id() - - logger.info(f"[{msg_id}] received /text-adjudication-search request with {len(request.content)} content item(s)") - - try: - # step 0: sanitize request to remove PII - logger.info(f"[{msg_id}] sanitizing request to remove PII") - sanitized_request = sanitize_request(request) - - analytics = AnalyticsCollector(msg_id) - # log request details - for idx, item in enumerate(sanitized_request.content): - content_preview = item.textContent[:100] if item.textContent else "None" - logger.info(f"[{msg_id}] content[{idx}]: type={item.type}, text_length={len(item.textContent or '')}, preview='{content_preview}...'") - - # step 1: convert API request to internal DataSource format - logger.info(f"[{msg_id}] converting request to data sources") - data_sources = request_to_data_sources(sanitized_request) - analytics.populate_from_data_sources(data_sources) - logger.info(f"[{msg_id}] created {len(data_sources)} data source(s)") - - # step 2: get pipeline configuration - logger.info(f"[{msg_id}] initializing gemini pipeline config (adjudication-with-search mode)") - config = get_gemini_default_pipeline_config() - - # step 3: use UnverifiableMockPipelineSteps to trigger fallback - pipeline_steps = UnverifiableMockPipelineSteps() - logger.info(f"[{msg_id}] using UnverifiableMockPipelineSteps (will trigger adjudication_with_search fallback)") - logger.info(f"[{msg_id}] UnverifiableMockPipelineSteps.adjudicate_claims will return all unverifiable") - - # step 4: run the async fact-checking 
pipeline - logger.info(f"[{msg_id}] starting fact-check pipeline (adjudication will return unverifiable)") - pipeline_start = time.time() - fact_check_result = await run_fact_check_pipeline( - data_sources, - config, - pipeline_steps, - analytics, - message_id=msg_id - ) - - pipeline_duration = (time.time() - pipeline_start) * 1000 - logger.info(f"[{msg_id}] pipeline completed in {pipeline_duration:.0f}ms") - - # log pipeline results - total_claims = sum(len(ds_result.claim_verdicts) for ds_result in fact_check_result.results) - logger.info(f"[{msg_id}] extracted {total_claims} claim(s) from {len(fact_check_result.results)} data source(s)") - - # build response - logger.info(f"[{msg_id}] building response") - response = fact_check_result_to_response(msg_id, fact_check_result) - - # sanitize response to remove PII - logger.info(f"[{msg_id}] sanitizing response to remove PII") - sanitized_response = sanitize_response(response) - - analytics.set_final_response(sanitized_response.rationale) - # only send analytics if claims were extracted - if analytics.has_extracted_claims(): - logger.info(f"[{msg_id}] sending analytics payload (claims found)") - asyncio.create_task(send_analytics_payload(analytics)) - else: - logger.info(f"[{msg_id}] skipping analytics payload (no claims extracted)") - - total_duration = (time.time() - start_time) * 1000 - logger.info(f"[{msg_id}] request completed successfully in {total_duration:.0f}ms") - - return sanitized_response - - except Exception as e: - total_duration = (time.time() - start_time) * 1000 - error_type = type(e).__name__ - logger.error(f"[{msg_id}] request failed after {total_duration:.0f}ms: {error_type}: {str(e)}") - logger.error(f"[{msg_id}] traceback:\n{traceback.format_exc()}") - raise HTTPException( - status_code=500, - detail=f"Error processing request: {str(e)}" - ) from e \ No newline at end of file diff --git a/app/clients/memorystore.py b/app/clients/memorystore.py new file mode 100644 index 0000000..3c54550 --- 
/dev/null +++ b/app/clients/memorystore.py @@ -0,0 +1,130 @@ +""" +async redis client for GCP Memorystore with circuit breaker. + +all public methods return None/False on errors — never raises. +if REDIS_HOST is not set, caching is silently disabled. +""" + +import logging +import os +import time +from typing import Optional + +import redis.asyncio as aioredis + +logger = logging.getLogger(__name__) + +_redis_client: Optional[aioredis.Redis] = None + +# circuit breaker state +_consecutive_failures: int = 0 +_circuit_open_until: float = 0.0 +_FAILURE_THRESHOLD: int = 3 +_RECOVERY_TIMEOUT: float = 60.0 + + +def _circuit_is_open() -> bool: + global _circuit_open_until + if _consecutive_failures < _FAILURE_THRESHOLD: + return False + if time.monotonic() >= _circuit_open_until: + # allow a single probe + return False + return True + + +def _record_success() -> None: + global _consecutive_failures, _circuit_open_until + _consecutive_failures = 0 + _circuit_open_until = 0.0 + + +def _record_failure() -> None: + global _consecutive_failures, _circuit_open_until + _consecutive_failures += 1 + if _consecutive_failures >= _FAILURE_THRESHOLD: + _circuit_open_until = time.monotonic() + _RECOVERY_TIMEOUT + logger.warning( + "redis circuit breaker OPEN after %d failures, retrying in %.0fs", + _consecutive_failures, + _RECOVERY_TIMEOUT, + ) + + +def get_redis_client() -> Optional[aioredis.Redis]: + """return a singleton async redis client, or None if not configured.""" + global _redis_client + + host = os.getenv("REDIS_HOST", "").strip() + if not host: + return None + + if _redis_client is not None: + return _redis_client + + port = int(os.getenv("REDIS_PORT", "6379")) + password = os.getenv("REDIS_PASSWORD", "") or None + db = int(os.getenv("REDIS_DB", "0")) + + _redis_client = aioredis.Redis( + host=host, + port=port, + password=password, + db=db, + socket_timeout=0.5, + socket_connect_timeout=0.5, + decode_responses=False, + ) + logger.info("redis client created for %s:%d 
db=%d", host, port, db) + return _redis_client + + +async def safe_get(key: str) -> Optional[bytes]: + """get a value from redis. returns None on any error or if disabled.""" + if _circuit_is_open(): + return None + + client = get_redis_client() + if client is None: + return None + + try: + value = await client.get(key) + _record_success() + return value + except Exception as e: + _record_failure() + logger.warning("redis GET failed for key=%s: %s", key, e) + return None + + +async def safe_set(key: str, value: bytes, ex: int) -> bool: + """set a value in redis with TTL. returns False on any error or if disabled.""" + if _circuit_is_open(): + return False + + client = get_redis_client() + if client is None: + return False + + try: + await client.set(key, value, ex=ex) + _record_success() + return True + except Exception as e: + _record_failure() + logger.warning("redis SET failed for key=%s: %s", key, e) + return False + + +def reset_circuit_breaker() -> None: + """reset circuit breaker state — useful for tests.""" + global _consecutive_failures, _circuit_open_until + _consecutive_failures = 0 + _circuit_open_until = 0.0 + + +def reset_client() -> None: + """reset the singleton client — useful for tests.""" + global _redis_client + _redis_client = None diff --git a/app/clients/tests/__init__.py b/app/clients/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/clients/tests/test_memorystore.py b/app/clients/tests/test_memorystore.py new file mode 100644 index 0000000..b293a2a --- /dev/null +++ b/app/clients/tests/test_memorystore.py @@ -0,0 +1,366 @@ +""" +tests for memorystore: singleton client, safe_get/safe_set, +and circuit breaker behavior. 
+""" + +import time +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +import app.clients.memorystore as memorystore +from app.clients.memorystore import ( + get_redis_client, + safe_get, + safe_set, + reset_circuit_breaker, + reset_client, + _record_failure, + _record_success, + _circuit_is_open, + _FAILURE_THRESHOLD, + _RECOVERY_TIMEOUT, +) + + +@pytest.fixture(autouse=True) +def _clean_state(): + """reset module-level state before and after each test.""" + reset_circuit_breaker() + reset_client() + yield + reset_circuit_breaker() + reset_client() + + +# ── get_redis_client ───────────────────────────────────────────────── + +class TestGetRedisClient: + def test_returns_none_when_host_not_set(self, monkeypatch): + monkeypatch.delenv("REDIS_HOST", raising=False) + assert get_redis_client() is None + + def test_returns_none_for_blank_host(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", " ") + assert get_redis_client() is None + + def test_creates_client_when_host_set(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + client = get_redis_client() + assert client is not None + + def test_singleton_returns_same_instance(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + c1 = get_redis_client() + c2 = get_redis_client() + assert c1 is c2 + + def test_custom_port_and_db(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "10.0.0.5") + monkeypatch.setenv("REDIS_PORT", "6380") + monkeypatch.setenv("REDIS_DB", "2") + client = get_redis_client() + pool = client.connection_pool + kwargs = pool.connection_kwargs + assert kwargs["host"] == "10.0.0.5" + assert kwargs["port"] == 6380 + assert kwargs["db"] == 2 + + def test_password_none_when_empty(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + monkeypatch.delenv("REDIS_PASSWORD", raising=False) + client = get_redis_client() + kwargs = client.connection_pool.connection_kwargs + assert kwargs["password"] is None + + def 
test_password_set_when_provided(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + monkeypatch.setenv("REDIS_PASSWORD", "secret123") + client = get_redis_client() + kwargs = client.connection_pool.connection_kwargs + assert kwargs["password"] == "secret123" + + def test_reset_client_allows_new_instance(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + c1 = get_redis_client() + reset_client() + monkeypatch.setenv("REDIS_PORT", "6380") + c2 = get_redis_client() + assert c1 is not c2 + + +# ── circuit breaker ────────────────────────────────────────────────── + +class TestCircuitBreaker: + def test_circuit_closed_initially(self): + assert _circuit_is_open() is False + + def test_circuit_stays_closed_under_threshold(self): + for _ in range(_FAILURE_THRESHOLD - 1): + _record_failure() + assert _circuit_is_open() is False + + def test_circuit_opens_at_threshold(self): + for _ in range(_FAILURE_THRESHOLD): + _record_failure() + assert _circuit_is_open() is True + + def test_circuit_closes_after_recovery_timeout(self, monkeypatch): + for _ in range(_FAILURE_THRESHOLD): + _record_failure() + assert _circuit_is_open() is True + + # fast-forward past recovery timeout + future = time.monotonic() + _RECOVERY_TIMEOUT + 1 + monkeypatch.setattr(time, "monotonic", lambda: future) + assert _circuit_is_open() is False + + def test_success_resets_circuit(self): + for _ in range(_FAILURE_THRESHOLD): + _record_failure() + assert _circuit_is_open() is True + + _record_success() + assert _circuit_is_open() is False + + def test_reset_circuit_breaker_clears_state(self): + for _ in range(_FAILURE_THRESHOLD): + _record_failure() + assert _circuit_is_open() is True + + reset_circuit_breaker() + assert _circuit_is_open() is False + + def test_failures_accumulate_across_calls(self): + _record_failure() + _record_failure() + assert _circuit_is_open() is False + _record_failure() # hits threshold + assert _circuit_is_open() is True + + def 
test_success_after_partial_failures_resets_count(self): + _record_failure() + _record_failure() + _record_success() + # counter reset, so 3 more failures needed + _record_failure() + _record_failure() + assert _circuit_is_open() is False + + +# ── safe_get ───────────────────────────────────────────────────────── + +class TestSafeGet: + @pytest.mark.asyncio + async def test_returns_none_when_no_host(self, monkeypatch): + monkeypatch.delenv("REDIS_HOST", raising=False) + result = await safe_get("some_key") + assert result is None + + @pytest.mark.asyncio + async def test_returns_value_on_success(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.return_value = b"cached_data" + memorystore._redis_client = mock_client + + result = await safe_get("my_key") + assert result == b"cached_data" + mock_client.get.assert_called_once_with("my_key") + + @pytest.mark.asyncio + async def test_returns_none_on_key_not_found(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.return_value = None + memorystore._redis_client = mock_client + + result = await safe_get("missing_key") + assert result is None + + @pytest.mark.asyncio + async def test_returns_none_on_exception(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.side_effect = ConnectionError("connection refused") + memorystore._redis_client = mock_client + + result = await safe_get("key") + assert result is None + + @pytest.mark.asyncio + async def test_records_failure_on_exception(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.side_effect = TimeoutError("timed out") + memorystore._redis_client = mock_client + + assert memorystore._consecutive_failures == 0 + await safe_get("key") + assert memorystore._consecutive_failures == 1 + + @pytest.mark.asyncio + async def 
test_records_success_on_hit(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.return_value = b"data" + memorystore._redis_client = mock_client + + # inject a prior failure + _record_failure() + assert memorystore._consecutive_failures == 1 + + await safe_get("key") + assert memorystore._consecutive_failures == 0 + + @pytest.mark.asyncio + async def test_skips_call_when_circuit_open(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + memorystore._redis_client = mock_client + + for _ in range(_FAILURE_THRESHOLD): + _record_failure() + + result = await safe_get("key") + assert result is None + mock_client.get.assert_not_called() + + +# ── safe_set ───────────────────────────────────────────────────────── + +class TestSafeSet: + @pytest.mark.asyncio + async def test_returns_false_when_no_host(self, monkeypatch): + monkeypatch.delenv("REDIS_HOST", raising=False) + result = await safe_set("key", b"val", ex=60) + assert result is False + + @pytest.mark.asyncio + async def test_returns_true_on_success(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + memorystore._redis_client = mock_client + + result = await safe_set("key", b"value", ex=300) + assert result is True + mock_client.set.assert_called_once_with("key", b"value", ex=300) + + @pytest.mark.asyncio + async def test_returns_false_on_exception(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.set.side_effect = ConnectionError("connection refused") + memorystore._redis_client = mock_client + + result = await safe_set("key", b"val", ex=60) + assert result is False + + @pytest.mark.asyncio + async def test_records_failure_on_exception(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.set.side_effect = OSError("broken pipe") + 
memorystore._redis_client = mock_client + + await safe_set("key", b"val", ex=60) + assert memorystore._consecutive_failures == 1 + + @pytest.mark.asyncio + async def test_records_success_clears_failures(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + memorystore._redis_client = mock_client + + _record_failure() + _record_failure() + assert memorystore._consecutive_failures == 2 + + await safe_set("key", b"val", ex=60) + assert memorystore._consecutive_failures == 0 + + @pytest.mark.asyncio + async def test_skips_call_when_circuit_open(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + memorystore._redis_client = mock_client + + for _ in range(_FAILURE_THRESHOLD): + _record_failure() + + result = await safe_set("key", b"val", ex=60) + assert result is False + mock_client.set.assert_not_called() + + +# ── circuit breaker + safe_get/safe_set integration ────────────────── + +class TestCircuitBreakerIntegration: + @pytest.mark.asyncio + async def test_three_get_failures_open_circuit(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.side_effect = ConnectionError("refused") + memorystore._redis_client = mock_client + + for _ in range(_FAILURE_THRESHOLD): + await safe_get("key") + + # circuit is now open — next call should not reach redis + mock_client.get.reset_mock() + result = await safe_get("key") + assert result is None + mock_client.get.assert_not_called() + + @pytest.mark.asyncio + async def test_three_set_failures_open_circuit(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.set.side_effect = ConnectionError("refused") + memorystore._redis_client = mock_client + + for _ in range(_FAILURE_THRESHOLD): + await safe_set("key", b"v", ex=60) + + # circuit open — blocks get too + mock_client.get = AsyncMock(return_value=b"data") + result = await 
safe_get("key") + assert result is None + mock_client.get.assert_not_called() + + @pytest.mark.asyncio + async def test_circuit_allows_probe_after_timeout(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.side_effect = ConnectionError("refused") + memorystore._redis_client = mock_client + + for _ in range(_FAILURE_THRESHOLD): + await safe_get("key") + assert _circuit_is_open() is True + + # fast-forward past recovery + future = time.monotonic() + _RECOVERY_TIMEOUT + 1 + monkeypatch.setattr(time, "monotonic", lambda: future) + + # circuit allows a probe — redis call happens again + mock_client.get.side_effect = None + mock_client.get.return_value = b"recovered" + result = await safe_get("key") + assert result == b"recovered" + # success resets circuit + assert memorystore._consecutive_failures == 0 + + @pytest.mark.asyncio + async def test_mixed_get_set_failures_accumulate(self, monkeypatch): + monkeypatch.setenv("REDIS_HOST", "localhost") + mock_client = AsyncMock() + mock_client.get.side_effect = ConnectionError("refused") + mock_client.set.side_effect = ConnectionError("refused") + memorystore._redis_client = mock_client + + await safe_get("k") # failure 1 + await safe_set("k", b"v", ex=60) # failure 2 + assert _circuit_is_open() is False + await safe_get("k") # failure 3 — opens circuit + assert _circuit_is_open() is True diff --git a/app/clients/tests/test_web_search_cache.py b/app/clients/tests/test_web_search_cache.py new file mode 100644 index 0000000..0b5bcda --- /dev/null +++ b/app/clients/tests/test_web_search_cache.py @@ -0,0 +1,227 @@ +""" +tests for web_search_cache: normalization, key building, serialization, +and cached_custom_search integration with mock redis. 
+""" + +import json +import zlib +from unittest.mock import AsyncMock, patch + +import pytest + +from app.clients.web_search_cache import ( + normalize_query, + hash_domains, + build_cache_key, + serialize, + deserialize, + cached_custom_search, +) + + +# ── normalize_query ────────────────────────────────────────────────── + +class TestNormalizeQuery: + def test_lowercase_and_strip(self): + assert normalize_query(" Climate Change ") == "climate change" + + def test_collapse_whitespace(self): + assert normalize_query("hello world\t\nfoo") == "hello world foo" + + def test_empty_string(self): + assert normalize_query("") == "" + + def test_already_normalized(self): + assert normalize_query("already clean") == "already clean" + + def test_mixed_case_and_spaces(self): + assert normalize_query(" A B c ") == "a b c" + + +# ── hash_domains ───────────────────────────────────────────────────── + +class TestHashDomains: + def test_none_returns_nodomain(self): + assert hash_domains(None) == "nodomain" + + def test_empty_list_returns_nodomain(self): + assert hash_domains([]) == "nodomain" + + def test_blank_entries_returns_nodomain(self): + assert hash_domains(["", " "]) == "nodomain" + + def test_deterministic(self): + h1 = hash_domains(["a.com", "b.com"]) + h2 = hash_domains(["a.com", "b.com"]) + assert h1 == h2 + + def test_order_independent(self): + h1 = hash_domains(["b.com", "a.com"]) + h2 = hash_domains(["a.com", "b.com"]) + assert h1 == h2 + + def test_case_independent(self): + h1 = hash_domains(["A.COM"]) + h2 = hash_domains(["a.com"]) + assert h1 == h2 + + def test_hash_length(self): + h = hash_domains(["example.com"]) + assert len(h) == 12 + + +# ── build_cache_key ────────────────────────────────────────────────── + +class TestBuildCacheKey: + def test_short_query_inline(self): + key = build_cache_key("climate change", None) + assert key == "web_search:v1:climate_change:nodomain" + + def test_long_query_hashed(self): + long_q = "a " * 60 # > 100 chars + key = 
build_cache_key(long_q, None) + parts = key.split(":") + assert parts[0] == "web_search" + assert parts[1] == "v1" + assert len(parts[2]) == 64 # sha256 hex + assert parts[3] == "nodomain" + + def test_with_domains(self): + key = build_cache_key("test", ["a.com", "b.com"]) + assert key.startswith("web_search:v1:test:") + assert key.split(":")[-1] != "nodomain" + + def test_different_queries_different_keys(self): + k1 = build_cache_key("query one", None) + k2 = build_cache_key("query two", None) + assert k1 != k2 + + def test_same_query_different_case_same_key(self): + k1 = build_cache_key("Hello World", None) + k2 = build_cache_key("hello world", None) + assert k1 == k2 + + +# ── serialize / deserialize ────────────────────────────────────────── + +class TestSerialization: + def test_roundtrip(self): + data = [ + {"title": "Test", "link": "https://example.com", "snippet": "...", "displayLink": "example.com"}, + {"title": "Another", "link": "https://other.com", "snippet": "x", "displayLink": "other.com"}, + ] + compressed = serialize(data) + assert isinstance(compressed, bytes) + result = deserialize(compressed) + assert result == data + + def test_empty_list_roundtrip(self): + data = [] + assert deserialize(serialize(data)) == [] + + def test_corrupted_data_returns_none(self): + assert deserialize(b"not valid zlib data") is None + + def test_compressed_is_smaller(self): + data = [{"title": f"Item {i}", "link": f"https://example.com/{i}", "snippet": "a" * 200, "displayLink": "example.com"} for i in range(10)] + raw_json = json.dumps(data).encode() + compressed = serialize(data) + assert len(compressed) < len(raw_json) + + def test_unicode_roundtrip(self): + data = [{"title": "Notícia sobre saúde", "link": "https://ex.com", "snippet": "à é ü ñ", "displayLink": "ex.com"}] + assert deserialize(serialize(data)) == data + + +# ── cached_custom_search (integration with mock redis) ─────────────── + +@pytest.fixture +def sample_results(): + return [ + {"title": "Result 1", 
"link": "https://a.com", "snippet": "snip 1", "displayLink": "a.com"}, + {"title": "Result 2", "link": "https://b.com", "snippet": "snip 2", "displayLink": "b.com"}, + ] + + +@pytest.fixture +def mock_search_fn(sample_results): + fn = AsyncMock(return_value=sample_results) + return fn + + +class TestCachedCustomSearch: + @pytest.mark.asyncio + @patch("app.clients.web_search_cache.safe_get", new_callable=AsyncMock, return_value=None) + @patch("app.clients.web_search_cache.safe_set", new_callable=AsyncMock, return_value=True) + async def test_cache_miss_calls_original(self, mock_set, mock_get, mock_search_fn, sample_results): + result = await cached_custom_search( + "test query", num=10, domains=None, timeout=15.0, + original_search_fn=mock_search_fn, + ) + assert result == sample_results + mock_search_fn.assert_called_once_with("test query", num=10, domains=None, timeout=15.0) + mock_set.assert_called_once() + + @pytest.mark.asyncio + @patch("app.clients.web_search_cache.safe_set", new_callable=AsyncMock) + @patch("app.clients.web_search_cache.safe_get", new_callable=AsyncMock) + async def test_cache_hit_skips_original(self, mock_get, mock_set, mock_search_fn, sample_results): + # simulate cached compressed data + mock_get.return_value = serialize(sample_results) + + result = await cached_custom_search( + "test query", num=10, domains=None, timeout=15.0, + original_search_fn=mock_search_fn, + ) + assert result == sample_results + mock_search_fn.assert_not_called() + mock_set.assert_not_called() + + @pytest.mark.asyncio + @patch("app.clients.web_search_cache.safe_set", new_callable=AsyncMock, return_value=True) + @patch("app.clients.web_search_cache.safe_get", new_callable=AsyncMock, return_value=None) + async def test_redis_unavailable_on_get_falls_through(self, mock_get, mock_set, mock_search_fn, sample_results): + # safe_get returns None (redis unavailable) — should call original + result = await cached_custom_search( + "test query", num=10, domains=None, 
timeout=15.0, + original_search_fn=mock_search_fn, + ) + assert result == sample_results + mock_search_fn.assert_called_once() + + @pytest.mark.asyncio + @patch("app.clients.web_search_cache.safe_set", new_callable=AsyncMock, return_value=False) + @patch("app.clients.web_search_cache.safe_get", new_callable=AsyncMock, return_value=None) + async def test_redis_error_on_set_still_returns_result(self, mock_get, mock_set, mock_search_fn, sample_results): + # safe_set returns False (redis error) — result should still be returned + result = await cached_custom_search( + "test query", num=10, domains=None, timeout=15.0, + original_search_fn=mock_search_fn, + ) + assert result == sample_results + + @pytest.mark.asyncio + @patch("app.clients.web_search_cache.safe_set", new_callable=AsyncMock) + @patch("app.clients.web_search_cache.safe_get", new_callable=AsyncMock) + async def test_corrupted_cache_treated_as_miss(self, mock_get, mock_set, mock_search_fn, sample_results): + # return corrupted data — should fall through to original + mock_get.return_value = b"corrupted data" + + result = await cached_custom_search( + "test query", num=10, domains=None, timeout=15.0, + original_search_fn=mock_search_fn, + ) + assert result == sample_results + mock_search_fn.assert_called_once() + + @pytest.mark.asyncio + @patch("app.clients.web_search_cache.safe_set", new_callable=AsyncMock, return_value=True) + @patch("app.clients.web_search_cache.safe_get", new_callable=AsyncMock, return_value=None) + async def test_empty_results_not_cached(self, mock_get, mock_set): + empty_fn = AsyncMock(return_value=[]) + result = await cached_custom_search( + "test", num=10, domains=None, timeout=15.0, + original_search_fn=empty_fn, + ) + assert result == [] + mock_set.assert_not_called() diff --git a/app/clients/web_search_cache.py b/app/clients/web_search_cache.py new file mode 100644 index 0000000..c99d8f9 --- /dev/null +++ b/app/clients/web_search_cache.py @@ -0,0 +1,112 @@ +""" +caching layer for 
web search queries using Redis (GCP Memorystore). + +normalizes queries, builds deterministic cache keys, and stores results +as zlib-compressed JSON to minimize memory usage. +""" + +import hashlib +import json +import logging +import os +import re +import zlib +from typing import Callable, Awaitable, Optional + +from app.clients.memorystore import safe_get, safe_set + +logger = logging.getLogger(__name__) + +_WHITESPACE_RE = re.compile(r"\s+") +_KEY_PREFIX = "web_search:v1" +_MAX_INLINE_QUERY_LEN = 100 + + +def normalize_query(query: str) -> str: + """lowercase, strip, and collapse whitespace.""" + return _WHITESPACE_RE.sub(" ", query.strip().lower()) + + +def hash_domains(domains: list[str] | None) -> str: + """deterministic hash for a domain list (order-independent).""" + if not domains: + return "nodomain" + cleaned = sorted(d.strip().lower() for d in domains if d and d.strip()) + if not cleaned: + return "nodomain" + raw = ",".join(cleaned) + return hashlib.sha256(raw.encode()).hexdigest()[:12] + + +def build_cache_key(query: str, domains: list[str] | None) -> str: + """build a deterministic redis key from query and domains.""" + nq = normalize_query(query) + if len(nq) > _MAX_INLINE_QUERY_LEN: + query_part = hashlib.sha256(nq.encode()).hexdigest() + else: + # replace spaces with underscores for readability + query_part = nq.replace(" ", "_") + domain_part = hash_domains(domains) + return f"{_KEY_PREFIX}:{query_part}:{domain_part}" + + +def serialize(results: list[dict]) -> bytes: + """json + zlib compress.""" + raw = json.dumps(results, separators=(",", ":"), ensure_ascii=False) + return zlib.compress(raw.encode("utf-8"), level=6) + + +def deserialize(data: bytes) -> Optional[list[dict]]: + """zlib decompress + json parse. 
returns None on corruption.""" + try: + raw = zlib.decompress(data) + return json.loads(raw) + except Exception: + logger.warning("cache deserialization failed, treating as miss") + return None + + +def _get_ttl_seconds() -> int: + """read TTL from env (in minutes), default 60.""" + minutes = int(os.getenv("WEB_SEARCH_CACHE_TTL_MINUTES", "60")) + return max(minutes, 1) * 60 + + +async def cached_custom_search( + query: str, + *, + num: int, + domains: list[str] | None, + timeout: float, + original_search_fn: Callable[..., Awaitable[list[dict]]], +) -> list[dict]: + """ + cache-through wrapper for _custom_search(). + + on cache hit returns deserialized results directly. + on miss calls original_search_fn and caches the result. + any redis error silently falls through to the original function. + """ + key = build_cache_key(query, domains) + + # try cache + cached = await safe_get(key) + if cached is not None: + results = deserialize(cached) + if results is not None: + logger.debug("cache HIT for key=%s (%d results)", key, len(results)) + return results + + # cache miss — call original + logger.debug("cache MISS for key=%s", key) + results = await original_search_fn( + query, num=num, domains=domains, timeout=timeout, + ) + + # best-effort cache store + if results: + ttl = _get_ttl_seconds() + compressed = serialize(results) + await safe_set(key, compressed, ex=ttl) + + return results diff --git a/app/main.py b/app/main.py index d99cef0..1aa6823 100644 --- a/app/main.py +++ b/app/main.py @@ -1,6 +1,6 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -from app.api.endpoints import scraping, research, text, test +from app.api.endpoints import scraping, text from app.core.config import get_settings settings = get_settings() @@ -19,11 +19,9 @@ allow_headers=["*"], ) -# rotas de scraping, research e fact-checking +# rotas de scraping e fact-checking app.include_router(scraping.router, tags=["scraping"]) -app.include_router(research.router, 
tags=["research"]) app.include_router(text.router, tags=["fact-checking"]) -app.include_router(test.router, tags=["testing"]) @app.get("/") async def root(): diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..c58d375 --- /dev/null +++ b/conftest.py @@ -0,0 +1,7 @@ +import os +import sys + + +ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) +if ROOT_DIR not in sys.path: + sys.path.insert(0, ROOT_DIR) diff --git a/requirements.txt b/requirements.txt index 4c5c91c..daa1758 100644 --- a/requirements.txt +++ b/requirements.txt @@ -103,6 +103,7 @@ python-multipart==0.0.6 pytz==2025.2 PyYAML==6.0.3 readability-lxml==0.8.4.1 +redis>=5.0.0,<6.0.0 regex==2025.11.3 requests==2.32.5 requests-file==3.0.1 diff --git a/scripts/image_claim_extraction.py b/scripts/image_claim_extraction.py deleted file mode 100644 index 9e09d26..0000000 --- a/scripts/image_claim_extraction.py +++ /dev/null @@ -1,417 +0,0 @@ -# -*- coding: utf-8 -*- -""" -test script for comparing two approaches to image claim extraction: -1. direct claim extraction with image input -2. 
image transcription followed by text-based claim extraction -""" - -import os -import base64 -import json -from pathlib import Path -from typing import List - -from langchain_openai import ChatOpenAI -from langchain_core.messages import HumanMessage - -# import claim extractor -import sys -sys.path.append(str(Path(__file__).parent.parent)) - -from app.ai.pipeline.claim_extractor import extract_claims -from app.ai.pipeline.prompts import IMAGE_CLAIM_EXTRACTION_SYSTEM_PROMPT, IMAGE_CLAIM_EXTRACTION_USER_PROMPT -from app.models import ClaimExtractionInput, DataSource, LLMConfig - - -# ===== IMAGE TRANSCRIPTION PROMPT ===== - -IMAGE_TRANSCRIPTION_PROMPT = """Você receberá uma imagem enviada pelo usuário, seu objetivo é transcrever a imagem enviada para o fact-checking de fake news seguindo as tarefas adiantes: - -TAREFA 1: Você deve transcrever todo o texto de uma imagem, focando não apenas no texto mas em como ele está visualmente disposto (letras grandes, pequenas, CAPS LOCK, negrito, itálico, cores). Ex: A imagem tem um título "Político perdeu tudo" em negrito e CAPS LOCK com letras grandes. - -TAREFA 2: Foque em transcrever elementos visuais/não-textuais da imagem de forma a explicitar pessoas, especialmente figuras famosas, históricas, importantes, políticos ou celebridades, caso essas figuras estejam presentes, apenas mencione o NOME delas, não qualquer status dela como sua posição, emprego, se está vivo ou não. Também busque descrever entidades humanas e não humanas centrais à imagem. - -Não dê tanto importância a detalhes cotidianos e comuns da paisagem, apenas em detalhes anormais que possam auxiliar no processo de fact-checking. 
- -Exemplos de descrições detalhadas que ajudam no fact-checking de fake news: - -"A imagem mostra o político Abraham Lincoln numa pose constrangedora, sendo zombado por uma multidão" - -Exemplo de uma descrição que não ajuda no fact-checking: - -"A imagem mostra um homem de terno e cabelo branco, numa festa, com convidados de smoking." - -Retornar no seguinte formato: - -"Descrição da imagem: [sua descrição detalhada aqui]""" - - -# ===== DIRECT IMAGE CLAIM EXTRACTION PROMPT ===== - -IMAGE_VISION_CLAIM_EXTRACTION_SYSTEM_PROMPT = """Você é um especialista em extração de alegações para um sistema de checagem de fatos. - -IMPORTANTE: Você receberá uma IMAGEM como input visual. Sua tarefa é analisar a imagem de forma holística e identificar TODAS as alegações verificáveis presentes nela. - -## Como Analisar a Imagem: - -**PASSO 1 - Identifique Pessoas e Personagens na imagem:** -- Procure por pessoas famosas, políticos, celebridades, figuras históricas ou personagens conhecidos (incluindo personagens fictícios de filmes, séries, livros, etc.) presentes na imagem -- Identifique-os pelo NOME sempre que possível -- NÃO mencione status, cargo, posição ou se estão vivos/mortos - apenas o NOME - -**PASSO 2 - Leia Todo o Texto:** -- Leia todo o texto presente na imagem -- Preste atenção em títulos, legendas, manchetes, citações -- Note a formatação visual: texto em CAPS LOCK, negrito, tamanhos diferentes, cores destacadas - -**PASSO 3 - Interpretação Holística:** -- NÃO analise apenas o texto OU apenas a imagem separadamente -- CONECTE os elementos visuais + texto + pessoas/personagens identificados -- Interprete a MENSAGEM COMPLETA que a mídia está comunicando -- Considere o contexto: é uma notícia? Um meme? Uma charge? Uma montagem? 
-- Identifique a NARRATIVA ou ALEGAÇÃO que a combinação de todos esses elementos está fazendo sobre o mundo real - -## O que Extrair: - -**Extraia alegações que:** -- Podem ser verificadas como verdadeiras ou falsas com base em evidências -- Fazem afirmações sobre o mundo real (fatos, eventos, pessoas, sociedade) -- Resultam da interpretação HOLÍSTICA da imagem (visual + texto + contexto) -- Contêm entidades nomeadas, eventos ou detalhes específicos -- São opiniões que contêm alegações verificáveis sobre fatos do mundo -- São perguntas que contêm alegações implícitas verificáveis - -**Exemplos de boas alegações extraídas holisticamente (com NOMES explícitos):** -- Imagem mostra Joãozinho com texto "Roubou milhões": extrair "Joãozinho roubou milhões" -- Foto de Michael Jackson com legenda "Morreu ontem": extrair "Michael Jackson morreu ontem" -- Meme com Homer Simpson dizendo "Vacinas causam autismo": extrair "Vacinas causam autismo" (atribuindo ao contexto do meme, não ao personagem) -- Charge mostrando Pedro esmagando trabalhadores com legenda sobre reforma trabalhista: extrair "Pedro implementou políticas trabalhistas prejudiciais aos trabalhadores" - -**Exemplos ERRADOS (sem nomes explícitos):** -- ❌ "O político roubou milhões" (falta o nome - deve ser "Joãozinho roubou milhões") -- ❌ "A celebridade morreu ontem" (falta o nome - deve ser "Michael Jackson morreu ontem") -- ❌ "Um presidente fez declaração polêmica" (falta o nome específico) - -**NÃO extraia:** -- Descrições puramente visuais sem alegação factual ("A imagem mostra uma pessoa") -- Perguntas sem alegações implícitas ("O que você acha?") -- Cumprimentos ou conversa trivial -- Elementos visuais sem conexão com alegações do mundo real -- Alegações genéricas sobre "um político" ou "uma celebridade" quando você consegue identificar a pessoa - -## Casos Especiais - Memes, Charges e Montagens: - -Quando a imagem for um meme, charge, quadrinho ou montagem: - -1. 
**Identifique a mensagem central**: O que a imagem está AFIRMANDO sobre o mundo? -2. **Use pistas contextuais**: - - Texto sobreposto ou legendas - - Personagens ou pessoas identificáveis - - Símbolos ou metáforas visuais - - Referências a eventos conhecidos -3. **Extraia a alegação factual implícita**: Se a charge "critica a corrupção do governo X", extraia "O governo X é corrupto" -4. **Detecte manipulação visual**: Se a imagem sugere ou afirma ser real mas parece editada/manipulada, extraia alegação sobre autenticidade - -## Diretrizes de Normalização: - -1. **Alegações autocontidas**: Cada alegação deve ser compreensível sem ver a imagem - - Original na imagem: "Ele roubou tudo" - - Normalizada: "O político [nome] roubou dinheiro público" - -2. **NOMEIE pessoas famosas explicitamente**: SEMPRE que identificar uma pessoa famosa, político, celebridade ou personagem na imagem, você DEVE incluir o NOME COMPLETO dessa pessoa na alegação extraída - - ERRADO: "O político roubou dinheiro público" - - CORRETO: "O político Joãozinho roubou dinheiro público" - - ERRADO: "A celebridade morreu ontem" - - CORRETO: "A celebridade Michael Jackson morreu ontem" - - Se você NÃO conseguir identificar o nome da pessoa, NÃO extraia alegações genéricas sobre "um político" ou "uma celebridade" - -3. **Substitua pronomes**: Use nomes específicos das pessoas/entidades identificadas - - Original: "Essa vacina causa problemas" - - Normalizada: "A vacina COVID-19 causa problemas de saúde" - -4. **Preserve contexto crítico**: Se a imagem mostra data, local, números específicos, inclua na alegação - -5. **Múltiplas alegações**: Uma imagem pode conter várias alegações - extraia cada uma separadamente - -6. **Preserve o idioma**: Mantenha o idioma do texto na imagem (português → alegações em português) - -7. **Entidades**: Identifique e liste entidades principais (pessoas, lugares, organizações, produtos, datas, números) - -8. 
**Análise LLM**: Para cada alegação, explique brevemente por que ela é verificável e como foi extraída da imagem - -## Formato de Saída: - -Retorne um objeto JSON com array "claims". Cada alegação deve ter: -- text: O texto normalizado e autocontido da alegação -- entities: Array de entidades principais mencionadas -- llm_comment: Análise de por que é verificável e como foi extraída da imagem - -Se nenhuma alegação verificável for encontrada, retorne array vazio. - -IMPORTANTE: Você está recebendo uma IMAGEM VISUAL. Analise-a completamente: -1. Identifique pessoas/personagens famosos PELO NOME -2. Leia todo o texto -3. Conecte visual + texto + pessoas para extrair alegações holísticas sobre o mundo real - -CRÍTICO: Qualquer pessoa famosa, político, celebridade ou personagem identificado na imagem DEVE ser NOMEADO EXPLICITAMENTE nas alegações extraídas. Não use termos genéricos como "o político" ou "a celebridade" - use o NOME da pessoa. - -Nota: NÃO inclua campos 'id' ou 'source' - serão adicionados automaticamente.""" - - -# ===== UTILITY FUNCTIONS ===== - -def load_images_from_folder(folder_path: str) -> List[tuple[str, str]]: - """ - load all image files from folder and convert to base64. 
- - args: - folder_path: path to folder containing images - - returns: - list of tuples (filename, base64_data) - """ - image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'} - images = [] - - folder = Path(folder_path) - if not folder.exists(): - print(f"folder {folder_path} does not exist") - return images - - for file_path in folder.iterdir(): - if file_path.suffix.lower() in image_extensions: - try: - with open(file_path, 'rb') as img_file: - base64_data = base64.b64encode(img_file.read()).decode('utf-8') - images.append((file_path.name, base64_data)) - print(f"loaded image: {file_path.name}") - except Exception as e: - print(f"error loading {file_path.name}: {e}") - - return images - - -def call_gpt4o_with_image(image_base64: str, prompt: str, model_name: str = "gpt-5-nano") -> str: - """ - call OpenAI gpt-4o model with image input. - - args: - image_base64: base64 encoded image data - prompt: text prompt to send with image - model_name: model name to use (default: gpt-4o) - - returns: - model response text - """ - model = ChatOpenAI(model=model_name, temperature=0.0) - - message = HumanMessage( - content=[ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, - }, - ], - ) - - response = model.invoke([message]) - return response.content - - -def call_gpt4o_for_claim_extraction_with_image( - image_base64: str, - model_name: str = "gpt-5-nano" -) -> dict: - """ - call gpt-4o with image using the holistic vision-based claim extraction prompt. 
- - args: - image_base64: base64 encoded image data - model_name: model name to use - - returns: - dict with extracted claims - """ - # use the new holistic vision prompt that explicitly handles image input - full_prompt = IMAGE_VISION_CLAIM_EXTRACTION_SYSTEM_PROMPT - - # create model with structured output - model = ChatOpenAI(model=model_name, temperature=0.0) - - # define schema for structured output - from pydantic import BaseModel, Field - - class ExtractedClaim(BaseModel): - text: str = Field(..., description="The normalized claim text") - entities: List[str] = Field(default_factory=list, description="Named entities in the claim") - llm_comment: str = Field(None, description="LLM's analysis of why this is fact-checkable") - - class ClaimOutput(BaseModel): - claims: List[ExtractedClaim] = Field( - default_factory=list, - description="List of extracted claims" - ) - - structured_model = model.with_structured_output(ClaimOutput, method="json_mode") - - message = HumanMessage( - content=[ - {"type": "text", "text": full_prompt}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, - }, - ], - ) - - response = structured_model.invoke([message]) - - # convert to dict - return { - "claims": [ - { - "text": claim.text, - "entities": claim.entities, - "llm_comment": claim.llm_comment - } - for claim in response.claims - ] - } - - -# ===== MAIN TEST FUNCTIONS ===== - -def test_approach_1_direct_extraction(image_base64: str, image_name: str): - """ - approach 1: direct claim extraction with image input. 
- uses holistic vision-based prompt that: - - identifies famous people, politicians, celebrities, and fictional characters - - reads all text in the image - - interprets the image holistically (visual + text + people) - - extracts claims about the media combining all elements - """ - print(f"\n{'='*80}") - print(f"APPROACH 1: Direct Image Claim Extraction (Holistic Vision)") - print(f"Image: {image_name}") - print(f"{'='*80}\n") - - try: - result = call_gpt4o_for_claim_extraction_with_image(image_base64) - - print(f"Extracted {len(result['claims'])} claims:\n") - for i, claim in enumerate(result['claims'], 1): - print(f"Claim {i}:") - print(f" Text: {claim['text']}") - print(f" Entities: {', '.join(claim['entities']) if claim['entities'] else 'None'}") - print(f" LLM Comment: {claim['llm_comment']}") - print() - - except Exception as e: - print(f"Error in approach 1: {e}") - import traceback - traceback.print_exc() - - -def test_approach_2_transcribe_then_extract(image_base64: str, image_name: str): - """ - approach 2: first transcribe image, then extract claims from transcription. 
- step 1: use transcription prompt with image - step 2: use text-based claim extraction (without image) - """ - print(f"\n{'='*80}") - print(f"APPROACH 2: Transcribe Image Then Extract Claims") - print(f"Image: {image_name}") - print(f"{'='*80}\n") - - try: - # step 1: transcribe image - print("Step 1: Transcribing image...") - transcription = call_gpt4o_with_image(image_base64, IMAGE_TRANSCRIPTION_PROMPT) - print(f"Transcription:\n{transcription}\n") - - # step 2: extract claims from transcription - print("Step 2: Extracting claims from transcription...") - - # create data source with transcribed text - data_source = DataSource( - id="test-image-transcription", - source_type="image", - original_text=transcription, - locale="pt-BR" - ) - - extraction_input = ClaimExtractionInput(data_source=data_source) - - # configure llm with gpt-4o - llm = ChatOpenAI(model="gpt-4o", temperature=0.0) - llm_config = LLMConfig(llm=llm) - - # extract claims - result = extract_claims(extraction_input, llm_config) - - print(f"Extracted {len(result.claims)} claims:\n") - for i, claim in enumerate(result.claims, 1): - print(f"Claim {i}:") - print(f" ID: {claim.id}") - print(f" Text: {claim.text}") - print(f" Entities: {', '.join(claim.entities) if claim.entities else 'None'}") - print(f" LLM Comment: {claim.llm_comment}") - print(f" Source: {claim.source.source_type} (ID: {claim.source.source_id})") - print() - - except Exception as e: - print(f"Error in approach 2: {e}") - import traceback - traceback.print_exc() - - -# ===== MAIN SCRIPT ===== - -def main(): - """ - main function to run both test approaches on all images. 
- """ - print("="*80) - print("IMAGE CLAIM EXTRACTION TEST SCRIPT") - print("="*80) - - # get script directory - script_dir = Path(__file__).parent - images_folder = script_dir / "images" - - print(f"\nLoading images from: {images_folder}") - - # load images - images = load_images_from_folder(str(images_folder)) - - if not images: - print("\nNo images found in the images folder.") - print("Please add some test images to scripts/images/ and run again.") - return - - print(f"\nFound {len(images)} image(s)\n") - - # test each image with both approaches - for image_name, image_base64 in images: - print(f"\n{'#'*80}") - print(f"# Processing: {image_name}") - print(f"{'#'*80}") - - # approach 1: direct extraction - test_approach_1_direct_extraction(image_base64, image_name) - - # approach 2: transcribe then extract - test_approach_2_transcribe_then_extract(image_base64, image_name) - - print(f"\n{'#'*80}") - print(f"# Finished processing: {image_name}") - print(f"{'#'*80}\n") - - print("\n" + "="*80) - print("TEST COMPLETE") - print("="*80) - - -if __name__ == "__main__": - main() diff --git a/scripts/playground/google/google_search_cli.py b/scripts/playground/google/google_search_cli.py deleted file mode 100644 index 605c596..0000000 --- a/scripts/playground/google/google_search_cli.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 -""" -google_search_cli.py — interactive CLI for Google Custom Search API. - -usage: - python scripts/playground/google/google_search_cli.py - -configuration (edit in code below): - SITE_FILTER — restrict results to a domain, e.g. "g1.globo.com" - set to None to search the open web - NUM_RESULTS — number of results per query (max 10) - DATE_RESTRICT — relative date window, e.g. 
"m3" (last 3 months) or None -""" - -import asyncio -import os -import sys -from pathlib import Path - -# allow imports from project root -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) - -from scripts.playground.common import ( - Colors, - print_header, - print_section, - print_success, - print_error, - print_warning, - print_info, - with_spinner, -) -from app.ai.context.web.google_search import google_search, GoogleSearchError - - -# ─── configuration (edit here) ──────────────────────────────────────────────── - -# estadao.com.br -# folha.uol.com.br -# g1.globo.com -# aosfatos.org -SITE_FILTER: str | None = "aosfatos.org" # e.g. "g1.globo.com" or None for open web -NUM_RESULTS: int = 10 # 1–10 -DATE_RESTRICT: str | None = None # e.g. "d7", "m3", "y1" or None - - -# ─── helpers ────────────────────────────────────────────────────────────────── - -def _check_env() -> bool: - missing = [v for v in ("GOOGLE_SEARCH_API_KEY", "GOOGLE_CSE_CX") if not os.environ.get(v)] - if missing: - print_error(f"missing environment variables: {', '.join(missing)}") - print_info("set them before running:\n export GOOGLE_SEARCH_API_KEY=...\n export GOOGLE_CSE_CX=...") - return False - return True - - -def _print_config() -> None: - print_section("active configuration") - rows = { - "site filter": SITE_FILTER or "(none — open web)", - "results": NUM_RESULTS, - "date restrict": DATE_RESTRICT or "(none)", - } - max_k = max(len(k) for k in rows) - for k, v in rows.items(): - print(f" {Colors.BOLD}{k.ljust(max_k)}{Colors.END} {v}") - - -def _print_results(items: list, query: str) -> None: - if not items: - print_warning("no results found") - return - - print_success(f"{len(items)} result(s) for: {Colors.BOLD}{query}{Colors.END}") - - for i, item in enumerate(items, 1): - title = item.get("title", "") - link = item.get("link", "") - snippet = item.get("snippet", "").replace("\n", " ") - domain = item.get("displayLink", "") - date = item.get("pagemap", 
{}).get("metatags", [{}])[0].get("article:published_time", "") - - print(f"\n {Colors.BOLD}{Colors.CYAN}{i}.{Colors.END} {Colors.BOLD}{title}{Colors.END}") - if domain: - print(f" {Colors.YELLOW}{domain}{Colors.END}", end="") - if date: - print(f" · {date[:10]}", end="") - print() - if snippet: - # wrap snippet at ~80 chars - words, line, lines = snippet.split(), "", [] - for word in words: - if len(line) + len(word) + 1 > 78: - lines.append(line) - line = word - else: - line = f"{line} {word}".strip() - if line: - lines.append(line) - for l in lines: - print(f" {l}") - print(f" {Colors.CYAN}{link}{Colors.END}") - - -async def _run_query(query: str) -> None: - try: - items = await google_search( - query, - num=NUM_RESULTS, - site_search=SITE_FILTER, - site_search_filter="i" if SITE_FILTER else None, - date_restrict=DATE_RESTRICT, - sort="date", - ) - _print_results(items, query) - except GoogleSearchError as e: - print_error(str(e)) - - -# ─── main loop ──────────────────────────────────────────────────────────────── - -def main() -> None: - print_header("Google Custom Search — interactive CLI") - - if not _check_env(): - sys.exit(1) - - _print_config() - - print_info("\ntype a query and press Enter · empty line to quit\n") - - while True: - try: - raw = input(f"{Colors.BOLD}search>{Colors.END} ").strip() - except (EOFError, KeyboardInterrupt): - print() - break - - if not raw: - break - - with_spinner(lambda: asyncio.run(_run_query(raw)), "searching...") - print() - - print_info("bye") - - -if __name__ == "__main__": - main() diff --git a/scripts/playground/google_factcheck_cli.py b/scripts/playground/google_factcheck_cli.py index ce572d2..fe3951b 100755 --- a/scripts/playground/google_factcheck_cli.py +++ b/scripts/playground/google_factcheck_cli.py @@ -38,7 +38,7 @@ Colors, ) -from app.ai.context.factcheckapi import GoogleFactCheckGatherer +from app.agentic_ai.context.factcheckapi import GoogleFactCheckGatherer from app.models import ExtractedClaim, ClaimSource