Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -355,10 +355,10 @@ core/
- [x] Basic document type breakdown (PDF, DOCX, etc.)
- [x] Simple stats in sidebar or tab

- [ ] **Optional: Simple Visualization**
- [ ] Basic bar chart of document types
- [ ] Simple scatter plot if time permits
- [ ] Use Gradio's built-in plot component
- [x] **Optional: Simple Visualization**
- [x] Basic bar chart of document types
- [x] Simple scatter plot if time permits
- [x] Use Gradio's built-in plot component

### Acceptance Criteria:
- [x] Stats display without errors
Expand Down
76 changes: 67 additions & 9 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from typing import List

import gradio as gr
import pandas as pd
import logging

from config.logging import setup_logging
from core.document_processor import DocumentProcessor
Expand All @@ -17,6 +19,7 @@

setup_logging()

logger = logging.getLogger(__name__)

doc_processor = DocumentProcessor()
embedder = EmbeddingService()
Expand Down Expand Up @@ -106,6 +109,47 @@ def get_system_stats() -> str:
return "📊 Statistics unavailable"


def create_document_type_chart() -> pd.DataFrame:
    """Build the data frame behind the document-type bar chart.

    Reads ``vector_store.get_statistics()`` and returns one row per document
    type (PDF, DOCX, TXT) in a ``Type`` / ``Count`` frame. A counter that is
    absent from the statistics is reported as 0.

    Returns:
        The populated frame, or an empty frame with the same two columns when
        anything goes wrong (the error is logged, not raised).
    """
    # Display label -> statistics key; insertion order fixes the row order.
    counter_keys = {"PDF": "pdf_count", "DOCX": "docx_count", "TXT": "txt_count"}
    try:
        stats = vector_store.get_statistics()
        counts = [stats.get(key, 0) for key in counter_keys.values()]
        return pd.DataFrame({"Type": list(counter_keys), "Count": counts})
    except Exception as exc:  # pragma: no cover - simple fallback
        logger.error("Failed to create bar chart: %s", exc)
        return pd.DataFrame({"Type": [], "Count": []})


def create_document_scatter() -> pd.DataFrame:
    """Return data for the scatter plot of file size vs chunk count.

    Builds one row per document returned by ``vector_store.get_all_documents()``
    with the columns ``Chunks``, ``Size KB``, ``Type`` and ``Filename``.
    ``Size KB`` is the document's ``metadata["size"]`` divided by 1024 and
    rounded to two decimals (the size is presumably stored in bytes -- confirm
    against the vector store).

    A document whose ``metadata`` or ``size`` is missing *or* ``None`` is
    plotted with a size of 0 instead of raising, so a single incomplete record
    cannot blank the whole plot.

    Returns:
        The populated frame, or an empty frame with the same columns when
        anything goes wrong (the error is logged, not raised).
    """
    try:
        docs = vector_store.get_all_documents()
        data = {
            "Chunks": [],
            "Size KB": [],
            "Type": [],
            "Filename": [],
        }
        for doc in docs:
            data["Chunks"].append(doc.get("chunk_count", 0))
            # ``or`` also covers keys that are present but set to None, which
            # the plain ``.get(key, default)`` form lets through.
            metadata = doc.get("metadata") or {}
            size = metadata.get("size") or 0
            data["Size KB"].append(round(size / 1024, 2))
            data["Type"].append(doc.get("file_type", ""))
            data["Filename"].append(doc.get("filename", ""))
        return pd.DataFrame(data)
    except Exception as exc:  # pragma: no cover - simple fallback
        logger.error("Failed to create scatter plot: %s", exc)
        return pd.DataFrame({"Chunks": [], "Size KB": [], "Type": [], "Filename": []})


css = """
#chatbot {
border-radius: 10px;
Expand Down Expand Up @@ -163,21 +207,36 @@ def get_system_stats() -> str:

with gr.Tab("Analytics"):
stats_display = gr.Markdown(get_system_stats())
bar_chart = gr.BarPlot(
value=create_document_type_chart(),
x="Type",
y="Count",
title="Documents by Type",
)
scatter_plot = gr.ScatterPlot(
value=create_document_scatter(),
x="Chunks",
y="Size KB",
color="Type",
tooltip="Filename",
title="Size vs Chunks",
)
refresh_stats = gr.Button("Refresh Stats")

refresh_stats.click(
fn=get_system_stats,
outputs=[stats_display]
)
refresh_stats.click(fn=get_system_stats, outputs=[stats_display])
refresh_stats.click(fn=create_document_type_chart, outputs=[bar_chart])
refresh_stats.click(fn=create_document_scatter, outputs=[scatter_plot])

def respond(user_message: str, chat_history: List[List[str]]) -> tuple[str, List[List[str]]]:
def respond(
    user_message: str, chat_history: List[List[str]]
) -> tuple[str, List[List[str]]]:
    """Answer one chat turn and record it in the history.

    Calls ``chat_response`` with the history as it was *before* this turn,
    then appends the ``[user_message, reply]`` pair to ``chat_history`` in
    place.

    Returns:
        An empty string followed by the updated history, in that order.
    """
    reply = chat_response(user_message, chat_history)
    turn = [user_message, reply]
    chat_history.append(turn)
    return "", chat_history

file_upload.change(fn=process_file, inputs=[file_upload], outputs=[upload_status]).then(
fn=get_document_list, outputs=[doc_list]
)
file_upload.change(
fn=process_file, inputs=[file_upload], outputs=[upload_status]
).then(fn=get_document_list, outputs=[doc_list])

msg.submit(respond, [msg, chatbot], [msg, chatbot])
submit.click(respond, [msg, chatbot], [msg, chatbot])
Expand All @@ -190,4 +249,3 @@ def respond(user_message: str, chat_history: List[List[str]]) -> tuple[str, List

if __name__ == "__main__":
app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)

33 changes: 32 additions & 1 deletion tests/unit/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from types import SimpleNamespace
from unittest.mock import Mock, patch
import pandas as pd

import app
from core.models.document import Document, DocumentChunk
Expand All @@ -10,7 +11,9 @@
@patch("app.vector_store")
@patch("app.embedder")
@patch("app.doc_processor")
def test_process_file_success(mock_processor: Mock, mock_embedder: Mock, mock_store: Mock) -> None:
def test_process_file_success(
mock_processor: Mock, mock_embedder: Mock, mock_store: Mock
) -> None:
app.uploaded_files.clear()
file_obj = SimpleNamespace(name="/tmp/test.txt")

Expand Down Expand Up @@ -89,3 +92,31 @@ def test_get_system_stats_error(mock_store: Mock) -> None:
stats = app.get_system_stats()
assert "unavailable" in stats.lower()


@patch("app.vector_store")
def test_create_document_type_chart(mock_store: Mock) -> None:
    """The bar-chart frame carries the PDF count reported by the store."""
    fake_stats = {
        "pdf_count": 2,
        "docx_count": 1,
        "txt_count": 0,
    }
    mock_store.get_statistics.return_value = fake_stats

    chart = app.create_document_type_chart()

    assert isinstance(chart, pd.DataFrame)
    pdf_rows = chart["Type"] == "PDF"
    assert chart.loc[pdf_rows, "Count"].iloc[0] == 2


@patch("app.vector_store")
def test_create_document_scatter(mock_store: Mock) -> None:
    """The scatter frame exposes the chunk count of the stored document."""
    stored_doc = {
        "filename": "doc.pdf",
        "file_type": "pdf",
        "chunk_count": 3,
        "metadata": {"size": 2048},
    }
    mock_store.get_all_documents.return_value = [stored_doc]

    scatter = app.create_document_scatter()

    assert isinstance(scatter, pd.DataFrame)
    first_chunks = scatter["Chunks"].iloc[0]
    assert first_chunks == 3