From 6a83c20bcc14ea72161ddb16edc8c8ff170ee009 Mon Sep 17 00:00:00 2001 From: sanakhamassi Date: Thu, 23 Apr 2026 19:40:05 +0100 Subject: [PATCH 1/3] Add error handling for Grobid service when not responding --- document_qa/document_qa_engine.py | 4 +++- document_qa/grobid_processors.py | 36 +++++++++++++++++++++---------- streamlit_app.py | 29 +++++++++++++++---------- 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py index 7560ecf..8dd7f86 100644 --- a/document_qa/document_qa_engine.py +++ b/document_qa/document_qa_engine.py @@ -15,7 +15,7 @@ from langchain_core.vectorstores import VectorStore from tqdm import tqdm -from document_qa.grobid_processors import GrobidProcessor +from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError from document_qa.langchain import ChromaAdvancedRetrieval @@ -376,6 +376,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, filename = Path(pdf_file_path).stem coordinates = True # if chunk_size == -1 else False structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates) + if not structure: + raise GrobidServiceError("Grobid did not return a response.") biblio = structure['biblio'] biblio['filename'] = filename.replace(" ", "_") diff --git a/document_qa/grobid_processors.py b/document_qa/grobid_processors.py index 0aae0ee..55ec695 100644 --- a/document_qa/grobid_processors.py +++ b/document_qa/grobid_processors.py @@ -9,6 +9,14 @@ from grobid_client.grobid_client import GrobidClient +class GrobidServiceError(RuntimeError): + """Raised when the Grobid service fails to process a document.""" + + def __init__(self, message="Grobid service error", status_code=None): + super().__init__(message) + self.status_code = status_code + + def get_span_start(type, title=None): title_ = ' title="' + title + '"' if title is not None else "" return '' @@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True): self.grobid_client = grobid_client def process_structure(self, input_path, coordinates=False): - pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument", - input_path, - consolidate_header=True, - consolidate_citations=False, - segment_sentences=False, - tei_coordinates=coordinates, - include_raw_citations=False, - include_raw_affiliations=False, - generateIDs=True) + try: + pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument", + input_path, + consolidate_header=True, + consolidate_citations=False, + segment_sentences=False, + tei_coordinates=coordinates, + include_raw_citations=False, + include_raw_affiliations=False, + generateIDs=True) + except Exception as exc: + raise GrobidServiceError("Grobid service did not respond.") from exc if status != 200: - return + raise GrobidServiceError( + f"Grobid service returned status {status}.", + status_code=status + ) document_object = self.parse_grobid_xml(text, coordinates=coordinates) document_object['filename'] = Path(pdf_file).stem.replace(".tei", "") @@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False): try: year = dateparser.parse(doc_biblio.header.date).year biblio["publication_year"] = year - except: + except Exception: pass output_data['biblio'] = biblio diff --git a/streamlit_app.py b/streamlit_app.py index c01ad2b..a10dc7d 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -4,20 +4,19 @@ from tempfile import NamedTemporaryFile import dotenv +import streamlit as st from grobid_quantities.quantities import QuantitiesAPI from langchain.memory import ConversationBufferMemory from langchain_openai import ChatOpenAI from streamlit_pdf_viewer import pdf_viewer from document_qa.custom_embeddings import ModalEmbeddings +from document_qa.document_qa_engine import DocumentQAEngine, DataStorage +from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError from document_qa.ner_client_generic import NERClientGeneric dotenv.load_dotenv(override=True) -import streamlit as st -from document_qa.document_qa_engine import DocumentQAEngine, DataStorage -from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations - API_MODELS = { "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"], "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"] @@ -320,13 +319,21 @@ def play_old_messages(container): tmp_file.write(bytearray(binary)) st.session_state['binary'] = binary - st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings( - tmp_file.name, - chunk_size=chunk_size, - perc_overlap=0.1 - ) - st.session_state['loaded_embeddings'] = True - st.session_state.messages = [] + try: + st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings( + tmp_file.name, + chunk_size=chunk_size, + perc_overlap=0.1 + ) + st.session_state['loaded_embeddings'] = True + st.session_state.messages = [] + except GrobidServiceError as exc: + status = f" (status {exc.status_code})" if exc.status_code else "" + st.session_state['doc_id'] = None + st.session_state['loaded_embeddings'] = False + st.session_state['uploaded'] = False + st.error(f"Grobid is not responding{status}. Please try later.") + st.stop() def rgb_to_hex(rgb): From 5670b4b5d078fc093407341ed6500ef82f6538ba Mon Sep 17 00:00:00 2001 From: sanakhamassi Date: Thu, 30 Apr 2026 10:52:34 +0100 Subject: [PATCH 2/3] Update GrobidProcessor initialization and enhance error handling in Streamlit app --- document_qa/document_qa_engine.py | 2 +- streamlit_app.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py index 8dd7f86..f42e94e 100644 --- a/document_qa/document_qa_engine.py +++ b/document_qa/document_qa_engine.py @@ -219,7 +219,7 @@ def __init__(self, self.data_storage = data_storage if grobid_url: - self.grobid_processor = GrobidProcessor(grobid_url) + self.grobid_processor = GrobidProcessor(grobid_url, ping_server=False) def query_document( self, diff --git a/streamlit_app.py b/streamlit_app.py index a10dc7d..b98673d 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -313,13 +313,13 @@ def play_old_messages(container): st.stop() with left_column: - with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'): - binary = uploaded_file.getvalue() - tmp_file = NamedTemporaryFile() - tmp_file.write(bytearray(binary)) - st.session_state['binary'] = binary + try: + with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'): + binary = uploaded_file.getvalue() + tmp_file = NamedTemporaryFile() + tmp_file.write(bytearray(binary)) + st.session_state['binary'] = binary - try: st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings( tmp_file.name, chunk_size=chunk_size, @@ -327,13 +327,13 @@ def play_old_messages(container): ) st.session_state['loaded_embeddings'] = True st.session_state.messages = [] - except GrobidServiceError as exc: - status = f" (status {exc.status_code})" if exc.status_code else "" - st.session_state['doc_id'] = None - st.session_state['loaded_embeddings'] = False - st.session_state['uploaded'] = False - st.error(f"Grobid is not responding{status}. Please try later.") - st.stop() + except GrobidServiceError as exc: + status = f" (status {exc.status_code})" if exc.status_code else "" + st.session_state['doc_id'] = None + st.session_state['loaded_embeddings'] = False + st.session_state['uploaded'] = False + st.error(f"Grobid is not responding{status}. Please try later.") + st.stop() def rgb_to_hex(rgb): From cf2978129ce192929b07bdf6617637fbfb61333a Mon Sep 17 00:00:00 2001 From: sanakhamassi Date: Thu, 30 Apr 2026 11:14:38 +0100 Subject: [PATCH 3/3] Fix missing space in error message --- streamlit_app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/streamlit_app.py b/streamlit_app.py index b98673d..bad17f1 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -328,11 +328,12 @@ def play_old_messages(container): st.session_state['loaded_embeddings'] = True st.session_state.messages = [] except GrobidServiceError as exc: + message = str(exc).strip() or "Grobid is not responding" status = f" (status {exc.status_code})" if exc.status_code else "" st.session_state['doc_id'] = None st.session_state['loaded_embeddings'] = False st.session_state['uploaded'] = False - st.error(f"Grobid is not responding{status}. Please try later.") + st.error(f"{message} Please try later.") st.stop()