From f5d08b950ad3223cf145821c1a4b6469a7b966fc Mon Sep 17 00:00:00 2001
From: NeurArk
Date: Tue, 27 May 2025 17:12:49 +0200
Subject: [PATCH] feat: add analytics visualizations

---
Review notes (below the "---" marker, so not part of the commit message):

- create_document_scatter(): the size is now read as
  `(doc.get("metadata") or {}).get("size") or 0`. With the previous
  `doc.get("metadata", {}).get("size", 0)`, a document whose "metadata" or
  "size" is present but None raises inside the loop, and the surrounding
  `except Exception` then returns an empty frame for every document.
- Both chart helpers log their fallback with logger.exception() instead of
  logger.error(), so the traceback is kept; the message text is unchanged.
- app.py: `import logging` is added next to the standard-library imports
  instead of after the third-party `import pandas as pd`.
- All edits replace lines one-for-one, so the hunk headers and the diffstat
  are unchanged. The post-image blob id on app.py's "index" line predates
  these edits.
- NOTE(review): this copy of the patch had lost its line breaks and
  indentation. Blank context lines and the indentation of context lines were
  restored from the hunk line counts and conventional formatting; check the
  TODO.md hunk, the "Analytics" tab hunk and the second test hunk against the
  tree before applying.

 TODO.md                |  8 ++---
 app.py                 | 76 +++++++++++++++++++++++++++++++++++++-----
 tests/unit/test_app.py | 33 +++++++++++++++++-
 3 files changed, 103 insertions(+), 14 deletions(-)

diff --git a/TODO.md b/TODO.md
index 56fe9f4..38276f2 100644
--- a/TODO.md
+++ b/TODO.md
@@ -355,10 +355,10 @@ core/
   - [x] Basic document type breakdown (PDF, DOCX, etc.)
   - [x] Simple stats in sidebar or tab
 
-- [ ] **Optional: Simple Visualization**
-  - [ ] Basic bar chart of document types
-  - [ ] Simple scatter plot if time permits
-  - [ ] Use Gradio's built-in plot component
+- [x] **Optional: Simple Visualization**
+  - [x] Basic bar chart of document types
+  - [x] Simple scatter plot if time permits
+  - [x] Use Gradio's built-in plot component
 
 ### Acceptance Criteria:
 - [x] Stats display without errors
diff --git a/app.py b/app.py
index fd9c68b..e219d75 100644
--- a/app.py
+++ b/app.py
@@ -6,6 +6,8 @@
 from typing import List
+import logging
 
 import gradio as gr
+import pandas as pd
 
 from config.logging import setup_logging
 from core.document_processor import DocumentProcessor
@@ -17,6 +19,7 @@
 
 
 setup_logging()
+logger = logging.getLogger(__name__)
 
 doc_processor = DocumentProcessor()
 embedder = EmbeddingService()
@@ -106,6 +109,47 @@ def get_system_stats() -> str:
         return "📊 Statistics unavailable"
 
 
+def create_document_type_chart() -> pd.DataFrame:
+    """Return data for bar chart of document types."""
+    try:
+        stats = vector_store.get_statistics()
+        return pd.DataFrame(
+            {
+                "Type": ["PDF", "DOCX", "TXT"],
+                "Count": [
+                    stats.get("pdf_count", 0),
+                    stats.get("docx_count", 0),
+                    stats.get("txt_count", 0),
+                ],
+            }
+        )
+    except Exception as exc:  # pragma: no cover - simple fallback
+        logger.exception("Failed to create bar chart: %s", exc)
+        return pd.DataFrame({"Type": [], "Count": []})
+
+
+def create_document_scatter() -> pd.DataFrame:
+    """Return data for scatter plot of file size vs chunk count."""
+    try:
+        docs = vector_store.get_all_documents()
+        data = {
+            "Chunks": [],
+            "Size KB": [],
+            "Type": [],
+            "Filename": [],
+        }
+        for doc in docs:
+            data["Chunks"].append(doc.get("chunk_count", 0))
+            size = (doc.get("metadata") or {}).get("size") or 0
+            data["Size KB"].append(round(size / 1024, 2))
+            data["Type"].append(doc.get("file_type", ""))
+            data["Filename"].append(doc.get("filename", ""))
+        return pd.DataFrame(data)
+    except Exception as exc:  # pragma: no cover - simple fallback
+        logger.exception("Failed to create scatter plot: %s", exc)
+        return pd.DataFrame({"Chunks": [], "Size KB": [], "Type": [], "Filename": []})
+
+
 css = """
 #chatbot {
     border-radius: 10px;
@@ -163,21 +207,36 @@ def get_system_stats() -> str:
 
     with gr.Tab("Analytics"):
         stats_display = gr.Markdown(get_system_stats())
+        bar_chart = gr.BarPlot(
+            value=create_document_type_chart(),
+            x="Type",
+            y="Count",
+            title="Documents by Type",
+        )
+        scatter_plot = gr.ScatterPlot(
+            value=create_document_scatter(),
+            x="Chunks",
+            y="Size KB",
+            color="Type",
+            tooltip="Filename",
+            title="Size vs Chunks",
+        )
         refresh_stats = gr.Button("Refresh Stats")
 
-        refresh_stats.click(
-            fn=get_system_stats,
-            outputs=[stats_display]
-        )
+        refresh_stats.click(fn=get_system_stats, outputs=[stats_display])
+        refresh_stats.click(fn=create_document_type_chart, outputs=[bar_chart])
+        refresh_stats.click(fn=create_document_scatter, outputs=[scatter_plot])
 
-    def respond(user_message: str, chat_history: List[List[str]]) -> tuple[str, List[List[str]]]:
+    def respond(
+        user_message: str, chat_history: List[List[str]]
+    ) -> tuple[str, List[List[str]]]:
         bot_message = chat_response(user_message, chat_history)
         chat_history.append([user_message, bot_message])
         return "", chat_history
 
-    file_upload.change(fn=process_file, inputs=[file_upload], outputs=[upload_status]).then(
-        fn=get_document_list, outputs=[doc_list]
-    )
+    file_upload.change(
+        fn=process_file, inputs=[file_upload], outputs=[upload_status]
+    ).then(fn=get_document_list, outputs=[doc_list])
 
     msg.submit(respond, [msg, chatbot], [msg, chatbot])
     submit.click(respond, [msg, chatbot], [msg, chatbot])
@@ -190,4 +249,3 @@ def respond(user_message: str, chat_history: List[List[str]]) -> tuple[str, List
 
 if __name__ == "__main__":
     app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
-
diff --git a/tests/unit/test_app.py b/tests/unit/test_app.py
index 4b83a28..3ae1ec0 100644
--- a/tests/unit/test_app.py
+++ b/tests/unit/test_app.py
@@ -2,6 +2,7 @@
 
 from types import SimpleNamespace
 from unittest.mock import Mock, patch
+import pandas as pd
 
 import app
 from core.models.document import Document, DocumentChunk
@@ -10,7 +11,9 @@
 @patch("app.vector_store")
 @patch("app.embedder")
 @patch("app.doc_processor")
-def test_process_file_success(mock_processor: Mock, mock_embedder: Mock, mock_store: Mock) -> None:
+def test_process_file_success(
+    mock_processor: Mock, mock_embedder: Mock, mock_store: Mock
+) -> None:
     app.uploaded_files.clear()
     file_obj = SimpleNamespace(name="/tmp/test.txt")
 
@@ -89,3 +92,31 @@ def test_get_system_stats_error(mock_store: Mock) -> None:
     stats = app.get_system_stats()
     assert "unavailable" in stats.lower()
 
+
+@patch("app.vector_store")
+def test_create_document_type_chart(mock_store: Mock) -> None:
+    mock_store.get_statistics.return_value = {
+        "pdf_count": 2,
+        "docx_count": 1,
+        "txt_count": 0,
+    }
+
+    df = app.create_document_type_chart()
+    assert isinstance(df, pd.DataFrame)
+    assert df.loc[df["Type"] == "PDF", "Count"].iloc[0] == 2
+
+
+@patch("app.vector_store")
+def test_create_document_scatter(mock_store: Mock) -> None:
+    mock_store.get_all_documents.return_value = [
+        {
+            "filename": "doc.pdf",
+            "file_type": "pdf",
+            "chunk_count": 3,
+            "metadata": {"size": 2048},
+        }
+    ]
+
+    df = app.create_document_scatter()
+    assert isinstance(df, pd.DataFrame)
+    assert df["Chunks"].iloc[0] == 3