Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from langchain_core.vectorstores import VectorStore
from tqdm import tqdm

from document_qa.grobid_processors import GrobidProcessor
from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError
from document_qa.langchain import ChromaAdvancedRetrieval


Expand Down Expand Up @@ -219,7 +219,7 @@ def __init__(self,
self.data_storage = data_storage

if grobid_url:
self.grobid_processor = GrobidProcessor(grobid_url)
self.grobid_processor = GrobidProcessor(grobid_url, ping_server=False)

def query_document(
self,
Expand Down Expand Up @@ -376,6 +376,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
filename = Path(pdf_file_path).stem
coordinates = True # if chunk_size == -1 else False
structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
if not structure:
raise GrobidServiceError("Grobid did not return a response.")
Comment thread
lfoppiano marked this conversation as resolved.

biblio = structure['biblio']
biblio['filename'] = filename.replace(" ", "_")
Expand Down
36 changes: 25 additions & 11 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
from grobid_client.grobid_client import GrobidClient


class GrobidServiceError(RuntimeError):
    """Signals a failure of the Grobid service while processing a document.

    Carries an optional HTTP ``status_code`` so callers can surface the
    server response that triggered the failure.
    """

    def __init__(self, message="Grobid service error", status_code=None):
        # Stash the status first, then delegate the message to RuntimeError.
        self.status_code = status_code
        super().__init__(message)


def get_span_start(type, title=None):
    """Return the opening ``<span>`` tag for a labelled annotation.

    ``type`` becomes an extra CSS class on the span; when ``title`` is
    given it is emitted as a ``title`` attribute (tooltip text).
    """
    title_attr = f' title="{title}"' if title is not None else ""
    return f'<span class="label {type}"{title_attr}>'
Expand Down Expand Up @@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True):
self.grobid_client = grobid_client

def process_structure(self, input_path, coordinates=False):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
try:
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
except Exception as exc:
raise GrobidServiceError("Grobid service did not respond.") from exc
Comment thread
lfoppiano marked this conversation as resolved.

if status != 200:
return
raise GrobidServiceError(
f"Grobid service returned status {status}.",
status_code=status
)
Comment on lines 107 to +125

document_object = self.parse_grobid_xml(text, coordinates=coordinates)
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
Expand Down Expand Up @@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False):
try:
year = dateparser.parse(doc_biblio.header.date).year
biblio["publication_year"] = year
except:
except Exception:
pass

output_data['biblio'] = biblio
Expand Down
42 changes: 25 additions & 17 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from tempfile import NamedTemporaryFile

import dotenv
import streamlit as st
from grobid_quantities.quantities import QuantitiesAPI
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from streamlit_pdf_viewer import pdf_viewer

from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
from document_qa.ner_client_generic import NERClientGeneric

dotenv.load_dotenv(override=True)

import streamlit as st
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations

API_MODELS = {
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
Expand Down Expand Up @@ -314,19 +313,28 @@ def play_old_messages(container):
st.stop()

with left_column:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
try:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
Comment thread
lfoppiano marked this conversation as resolved.
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
except GrobidServiceError as exc:
message = str(exc).strip() or "Grobid is not responding"
status = f" (status {exc.status_code})" if exc.status_code else ""
st.session_state['doc_id'] = None
st.session_state['loaded_embeddings'] = False
st.session_state['uploaded'] = False
st.error(f"{message} Please try later.")
st.stop()


def rgb_to_hex(rgb):
Expand Down