Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -349,25 +349,25 @@ core/
**Goal**: Add simple statistics to show system capabilities

### Tasks:
- [ ] **Simple Stats Display**
- [ ] Show number of documents uploaded
- [ ] Display total chunks in database
- [ ] Basic document type breakdown (PDF, DOCX, etc.)
- [ ] Simple stats in sidebar or tab
- [x] **Simple Stats Display**
- [x] Show number of documents uploaded
- [x] Display total chunks in database
- [x] Basic document type breakdown (PDF, DOCX, etc.)
- [x] Simple stats in sidebar or tab

- [ ] **Optional: Simple Visualization**
- [ ] Basic bar chart of document types
- [ ] Simple scatter plot if time permits
- [ ] Use Gradio's built-in plot component

### Acceptance Criteria:
- [ ] Stats display without errors
- [ ] Information is accurate
- [ ] Doesn't slow down main chat interface
- [x] Stats display without errors
- [x] Information is accurate
- [x] Doesn't slow down main chat interface

### Definition of Done:
- [ ] Basic stats working
- [ ] No impact on main functionality
- [x] Basic stats working
- [x] No impact on main functionality

---

Expand Down
30 changes: 30 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,27 @@ def clear_all_documents() -> str:
return f"❌ Error clearing documents: {exc}"


def get_system_stats() -> str:
    """Build the Markdown summary shown in the Analytics tab.

    Reads the statistics dictionary from the module-level ``vector_store``
    and renders the document, chunk and per-file-type counts. Keys that are
    missing from the dictionary are displayed as ``0``.

    Returns:
        The formatted Markdown text, or ``"📊 Statistics unavailable"`` when
        the store raises or reports a failure through an ``"error"`` entry.
    """
    fallback = "📊 Statistics unavailable"
    try:
        stats = vector_store.get_statistics()

        # A failed lookup may come back as a payload carrying an "error"
        # key instead of an exception; rendering that payload would show
        # a misleading report full of zeros.
        if "error" in stats:
            return fallback

        # Leading and trailing empty entries keep the surrounding newlines
        # of the rendered block.
        lines = [
            "",
            "📊 **System Statistics**",
            "",
            f"• Documents: {stats.get('total_documents', 0)}",
            f"• Total Chunks: {stats.get('total_chunks', 0)}",
            f"• Vector Store Size: {stats.get('collection_size', 0)}",
            "",
            "**Document Types:**",
            f"• PDF: {stats.get('pdf_count', 0)}",
            f"• DOCX: {stats.get('docx_count', 0)}",
            f"• TXT: {stats.get('txt_count', 0)}",
            "",
        ]
        return "\n".join(lines)
    except Exception:  # pragma: no cover - simple fallback
        return fallback


css = """
#chatbot {
border-radius: 10px;
Expand Down Expand Up @@ -140,6 +161,15 @@ def clear_all_documents() -> str:
refresh_btn = gr.Button("Refresh List", size="sm")
clear_docs_btn = gr.Button("Clear All Documents", variant="stop", size="sm")

with gr.Tab("Analytics"):
stats_display = gr.Markdown(get_system_stats())
refresh_stats = gr.Button("Refresh Stats")

refresh_stats.click(
fn=get_system_stats,
outputs=[stats_display]
)

def respond(user_message: str, chat_history: List[List[str]]) -> tuple[str, List[List[str]]]:
bot_message = chat_response(user_message, chat_history)
chat_history.append([user_message, bot_message])
Expand Down
31 changes: 31 additions & 0 deletions core/vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,37 @@ def get_stats(self) -> Dict[str, Any]:
stats["persist_directory"] = str(self.chroma_manager.persist_directory)
return stats

def _get_collection_size(self) -> int:
    """Return the size on disk of the vector store in bytes.

    Sums the sizes of all regular files found recursively under
    ``self.chroma_manager.persist_directory``.

    Returns:
        Total size in bytes. A file that cannot be stat'ed (for example
        because it was removed while the directory was being walked) is
        left out of the sum. If the directory itself cannot be traversed,
        the failure is logged and the bytes counted so far are returned.
    """
    total = 0
    try:
        for path in self.chroma_manager.persist_directory.rglob("*"):
            try:
                if path.is_file():
                    total += path.stat().st_size
            except OSError:
                # One vanished or unreadable file must not discard the
                # sizes of every file that follows it in the walk.
                continue
    except Exception as exc:  # pragma: no cover - simple helper
        logger.error("Failed to calculate collection size: %s", exc)
    return total

def get_statistics(self) -> Dict[str, Any]:
    """Collect the base stats plus per-type counts and on-disk size.

    Returns:
        The dictionary from ``get_stats()`` extended with ``pdf_count``,
        ``docx_count``, ``txt_count`` and ``collection_size``. On any
        failure the error is logged and a dictionary holding only an
        ``error`` message is returned instead.
    """
    try:
        result = self.get_stats()
        all_docs = self.get_all_documents()
        # One counter entry per supported file type, added in a fixed order.
        for kind in ("pdf", "docx", "txt"):
            result[f"{kind}_count"] = sum(
                1 for entry in all_docs if entry.get("file_type") == kind
            )
        result["collection_size"] = self._get_collection_size()
        return result
    except Exception as exc:  # pragma: no cover - wrapper
        logger.error("Failed to get statistics: %s", exc)
        return {"error": str(exc)}

def clear(self) -> None:
"""Remove all documents from the vector store."""
try:
Expand Down
26 changes: 26 additions & 0 deletions tests/unit/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,29 @@ def test_clear_all_documents(mock_store: Mock) -> None:
assert app.uploaded_files == {}
mock_store.clear.assert_called_once()


@patch("app.vector_store")
def test_get_system_stats(mock_store: Mock) -> None:
    """Every value in the statistics payload appears in the rendered text."""
    mock_store.get_statistics.return_value = {
        "total_documents": 2,
        "total_chunks": 5,
        "collection_size": 123,
        "pdf_count": 1,
        "docx_count": 1,
        "txt_count": 0,
    }

    stats = app.get_system_stats()

    assert "Documents: 2" in stats
    assert "Total Chunks: 5" in stats
    # The size and the remaining type counts are supplied above, so check
    # that they are rendered too rather than silently dropped.
    assert "Vector Store Size: 123" in stats
    assert "PDF: 1" in stats
    assert "DOCX: 1" in stats
    assert "TXT: 0" in stats
    mock_store.get_statistics.assert_called_once()


@patch("app.vector_store")
def test_get_system_stats_error(mock_store: Mock) -> None:
    """A store that raises yields the 'unavailable' fallback text."""
    mock_store.get_statistics.side_effect = Exception("fail")

    rendered = app.get_system_stats().lower()

    assert "unavailable" in rendered

35 changes: 35 additions & 0 deletions tests/unit/test_vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,38 @@ def test_delete_and_get_chunks(vector_store, sample_document, sample_chunks):
assert retrieved[0].id == ids[0]
assert vector_store.delete_document(sample_document.id) is True
assert vector_store.get_all_documents() == []


def test_get_statistics(vector_store, sample_document, sample_chunks):
    """Statistics reflect two stored documents of different file types."""
    vector_store.store_document(sample_document, sample_chunks)

    # Second document with a different file type so the per-type counts
    # can be told apart.
    doc2 = Document(
        id="doc_456",
        filename="other.docx",
        file_type="docx",
        file_size=2000,
        content="More text",
    )
    chunks2 = [
        DocumentChunk(
            id=f"chunk_b{i}",
            document_id="doc_456",
            content=f"Chunk text {i}",
            chunk_index=i,
            start_char=i * 50,
            end_char=(i + 1) * 50,
            embedding=[0.1] * 3072,
        )
        for i in range(2)
    ]

    vector_store.store_document(doc2, chunks2)

    stats = vector_store.get_statistics()

    assert stats["total_documents"] == 2
    assert stats["total_chunks"] == len(sample_chunks) + len(chunks2)
    assert stats["pdf_count"] == 1
    assert stats["docx_count"] == 1
    assert stats["txt_count"] == 0
    # Key presence alone would accept any value; the size is a byte count,
    # so it must be a non-negative integer.
    assert isinstance(stats["collection_size"], int)
    assert stats["collection_size"] >= 0