From 46916ffde3ea5f6fc1274126302304f20827e96a Mon Sep 17 00:00:00 2001 From: NeurArk Date: Tue, 27 May 2025 16:51:51 +0200 Subject: [PATCH] feat: add basic analytics tab --- TODO.md | 20 +++++++++---------- app.py | 30 ++++++++++++++++++++++++++++ core/vector_store.py | 31 +++++++++++++++++++++++++++++ tests/unit/test_app.py | 26 ++++++++++++++++++++++++ tests/unit/test_vector_store.py | 35 +++++++++++++++++++++++++++++++++ 5 files changed, 132 insertions(+), 10 deletions(-) diff --git a/TODO.md b/TODO.md index 342a89c..56fe9f4 100644 --- a/TODO.md +++ b/TODO.md @@ -349,11 +349,11 @@ core/ **Goal**: Add simple statistics to show system capabilities ### Tasks: -- [ ] **Simple Stats Display** - - [ ] Show number of documents uploaded - - [ ] Display total chunks in database - - [ ] Basic document type breakdown (PDF, DOCX, etc.) - - [ ] Simple stats in sidebar or tab +- [x] **Simple Stats Display** + - [x] Show number of documents uploaded + - [x] Display total chunks in database + - [x] Basic document type breakdown (PDF, DOCX, etc.) 
def get_system_stats() -> str:
    """Render vector-store statistics as Markdown for the Analytics tab.

    Returns:
        A Markdown string with the document count, chunk count, store size
        and the PDF/DOCX/TXT breakdown. If the store raises, or hands back
        a payload containing an ``"error"`` entry, the fallback text
        ``"📊 Statistics unavailable"`` is returned instead.
    """
    unavailable = "📊 Statistics unavailable"
    try:
        stats = vector_store.get_statistics()

        # get_statistics() signals failure by returning {"error": ...}
        # rather than raising. Without this guard every counter below would
        # fall back to 0 and a broken store would look like an empty one.
        if "error" in stats:
            return unavailable

        # Lines are emitted flush-left: Markdown treats text indented by
        # four or more spaces as a code block, which would disable the
        # bold markers.
        lines = [
            "📊 **System Statistics**",
            "",
            f"• Documents: {stats.get('total_documents', 0)}",
            f"• Total Chunks: {stats.get('total_chunks', 0)}",
            f"• Vector Store Size: {stats.get('collection_size', 0)}",
            "",
            "**Document Types:**",
            f"• PDF: {stats.get('pdf_count', 0)}",
            f"• DOCX: {stats.get('docx_count', 0)}",
            f"• TXT: {stats.get('txt_count', 0)}",
        ]
        return "\n".join(lines)
    except Exception:  # pragma: no cover - simple fallback
        # UI boundary: never let a statistics failure break the interface.
        return unavailable
def _get_collection_size(self) -> int:
    """Return the on-disk size of the vector store in bytes.

    Recursively walks ``self.chroma_manager.persist_directory`` and sums
    the size of every regular file found.

    Returns:
        Total size in bytes. Files that cannot be inspected are skipped;
        if the walk itself fails, the error is logged and the total
        gathered so far (possibly 0) is returned.
    """
    total = 0
    try:
        for path in self.chroma_manager.persist_directory.rglob("*"):
            try:
                if path.is_file():
                    total += path.stat().st_size
            except OSError:
                # A file can disappear or become unreadable between being
                # listed and being stat'ed; skip it instead of abandoning
                # the whole scan with a truncated total.
                continue
    except Exception as exc:  # pragma: no cover - simple helper
        logger.error("Failed to calculate collection size: %s", exc)
    return total


def get_statistics(self) -> Dict[str, Any]:
    """Return extended statistics for analytics display.

    Starts from ``self.get_stats()`` and adds ``pdf_count``,
    ``docx_count``, ``txt_count`` and ``collection_size``.

    Returns:
        The enriched statistics dict, or ``{"error": <message>}`` if any
        step raises (the failure is logged, not propagated).
    """
    from collections import Counter

    try:
        stats = self.get_stats()

        # One pass over the documents. The type is normalised so that
        # "PDF" or ".pdf" land in the same bucket as "pdf", and a missing
        # or None value becomes "" (counted nowhere).
        type_counts = Counter(
            str(doc.get("file_type") or "").lower().lstrip(".")
            for doc in self.get_all_documents()
        )
        for kind in ("pdf", "docx", "txt"):
            stats[f"{kind}_count"] = type_counts[kind]

        stats["collection_size"] = self._get_collection_size()
        return stats
    except Exception as exc:  # pragma: no cover - wrapper
        logger.error("Failed to get statistics: %s", exc)
        return {"error": str(exc)}
"Total Chunks: 5" in stats + assert "PDF: 1" in stats + mock_store.get_statistics.assert_called_once() + + +@patch("app.vector_store") +def test_get_system_stats_error(mock_store: Mock) -> None: + mock_store.get_statistics.side_effect = Exception("fail") + stats = app.get_system_stats() + assert "unavailable" in stats.lower() + diff --git a/tests/unit/test_vector_store.py b/tests/unit/test_vector_store.py index b186132..248db02 100644 --- a/tests/unit/test_vector_store.py +++ b/tests/unit/test_vector_store.py @@ -81,3 +81,38 @@ def test_delete_and_get_chunks(vector_store, sample_document, sample_chunks): assert retrieved[0].id == ids[0] assert vector_store.delete_document(sample_document.id) is True assert vector_store.get_all_documents() == [] + + +def test_get_statistics(vector_store, sample_document, sample_chunks): + vector_store.store_document(sample_document, sample_chunks) + + doc2 = Document( + id="doc_456", + filename="other.docx", + file_type="docx", + file_size=2000, + content="More text", + ) + chunks2 = [ + DocumentChunk( + id=f"chunk_b{i}", + document_id="doc_456", + content=f"Chunk text {i}", + chunk_index=i, + start_char=i * 50, + end_char=(i + 1) * 50, + embedding=[0.1] * 3072, + ) + for i in range(2) + ] + + vector_store.store_document(doc2, chunks2) + + stats = vector_store.get_statistics() + + assert stats["total_documents"] == 2 + assert stats["total_chunks"] == len(sample_chunks) + len(chunks2) + assert stats["pdf_count"] == 1 + assert stats["docx_count"] == 1 + assert stats["txt_count"] == 0 + assert "collection_size" in stats