Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from langchain_core.vectorstores import VectorStore
from tqdm import tqdm

from document_qa.grobid_processors import GrobidProcessor
from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError
from document_qa.langchain import ChromaAdvancedRetrieval


Expand Down Expand Up @@ -219,7 +219,7 @@ def __init__(self,
self.data_storage = data_storage

if grobid_url:
self.grobid_processor = GrobidProcessor(grobid_url)
self.grobid_processor = GrobidProcessor(grobid_url, ping_server=False)

def query_document(
self,
Expand Down Expand Up @@ -376,6 +376,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
filename = Path(pdf_file_path).stem
coordinates = True # if chunk_size == -1 else False
structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
if not structure:
raise GrobidServiceError("Grobid did not return a response.")
Comment thread
lfoppiano marked this conversation as resolved.

biblio = structure['biblio']
biblio['filename'] = filename.replace(" ", "_")
Expand Down
36 changes: 25 additions & 11 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
from grobid_client.grobid_client import GrobidClient


class GrobidServiceError(RuntimeError):
    """Signals a failure of the Grobid service while processing a document.

    Carries an optional HTTP ``status_code`` so callers can surface the
    server response that triggered the failure.
    """

    def __init__(self, message="Grobid service error", status_code=None):
        # Stash the status first, then delegate the message to RuntimeError.
        self.status_code = status_code
        super().__init__(message)


def get_span_start(type, title=None):
    """Return the opening ``<span>`` tag for a labelled annotation.

    ``type`` becomes an extra CSS class on the span; when ``title`` is
    given it is emitted as a ``title`` attribute (tooltip text).
    """
    title_attr = f' title="{title}"' if title is not None else ""
    return f'<span class="label {type}"{title_attr}>'
Expand Down Expand Up @@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True):
self.grobid_client = grobid_client

def process_structure(self, input_path, coordinates=False):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
try:
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
except Exception as exc:
raise GrobidServiceError("Grobid service did not respond.") from exc
Comment thread
lfoppiano marked this conversation as resolved.

if status != 200:
return
raise GrobidServiceError(
f"Grobid service returned status {status}.",
status_code=status
)
Comment on lines 107 to +125

document_object = self.parse_grobid_xml(text, coordinates=coordinates)
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
Expand Down Expand Up @@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False):
try:
year = dateparser.parse(doc_biblio.header.date).year
biblio["publication_year"] = year
except:
except Exception:
pass

output_data['biblio'] = biblio
Expand Down
42 changes: 25 additions & 17 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from tempfile import NamedTemporaryFile

import dotenv
import streamlit as st
from grobid_quantities.quantities import QuantitiesAPI
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from streamlit_pdf_viewer import pdf_viewer

from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
from document_qa.ner_client_generic import NERClientGeneric

dotenv.load_dotenv(override=True)

import streamlit as st
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations

API_MODELS = {
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
Expand Down Expand Up @@ -314,19 +313,28 @@ def play_old_messages(container):
st.stop()

with left_column:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
try:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
Comment thread
lfoppiano marked this conversation as resolved.
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
except GrobidServiceError as exc:
message = str(exc).strip() or "Grobid is not responding"
status = f" (status {exc.status_code})" if exc.status_code else ""
st.session_state['doc_id'] = None
st.session_state['loaded_embeddings'] = False
st.session_state['uploaded'] = False
st.error(f"{message} Please try later.")
st.stop()


def rgb_to_hex(rgb):
Expand Down