5 changes: 2 additions & 3 deletions src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java
@@ -19,11 +19,10 @@ public class DatastetAnalyzer implements org.grobid.core.analyzers.Analyzer {

public static DatastetAnalyzer getInstance() {
if (instance == null) {
//double check idiom
// synchronized (instanceController) {
synchronized (DatastetAnalyzer.class) {
if (instance == null)
getNewInstance();
// }
}
}
return instance;
}
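
Note on the double-checked locking above: it is only safe under the Java memory model if the instance field is declared volatile, which is not visible in this hunk. A minimal sketch of the full idiom, with the field declaration assumed and construction inlined rather than going through getNewInstance():

public class DatastetAnalyzer implements org.grobid.core.analyzers.Analyzer {
    // volatile is required so a partially constructed instance is never published
    private static volatile DatastetAnalyzer instance = null;

    public static DatastetAnalyzer getInstance() {
        if (instance == null) {                       // first check, no lock taken
            synchronized (DatastetAnalyzer.class) {
                if (instance == null) {               // second check, under the lock
                    instance = new DatastetAnalyzer();
                }
            }
        }
        return instance;
    }
}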
@@ -187,7 +187,7 @@ public void ensureCustomizationReady() {
LOGGER.debug("Calling: " + url.toString());
//System.out.println("Calling: " + url.toString());
// load the dataset customisation
File cutomisationFile = new File("resources/config/customisation-software.json");
File cutomisationFile = new File("resources/config/customisation-dataset.json");
cutomisationFile = new File(cutomisationFile.getAbsolutePath());

String json = FileUtils.readFileToString(cutomisationFile, "UTF-8");
27 changes: 13 additions & 14 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -78,18 +78,15 @@ public class DatasetParser extends AbstractParser {

public static DatasetParser getInstance(DatastetConfiguration configuration) {
if (instance == null) {
getNewInstance(configuration);
synchronized (DatasetParser.class) {
if (instance == null) {
instance = new DatasetParser(configuration);
}
}
}
return instance;
}

/**
* Create a new instance.
*/
private static synchronized void getNewInstance(DatastetConfiguration configuration) {
instance = new DatasetParser(configuration);
}

protected DatasetParser(GrobidModel model) {
super(model);
}
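
The same caveat applies to this inlined singleton: double-checked locking needs a volatile instance field, and since construction takes a DatastetConfiguration the holder-class idiom is not an option here. A sketch under the assumption that the field is declared in this class; note that the configuration passed by the first caller wins for all later callers:

private static volatile DatasetParser instance = null;

public static DatasetParser getInstance(DatastetConfiguration configuration) {
    if (instance == null) {
        synchronized (DatasetParser.class) {
            if (instance == null) {
                // only the first caller's configuration is ever used
                instance = new DatasetParser(configuration);
            }
        }
    }
    return instance;
}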
@@ -1607,6 +1604,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, titleId);
localSequence.setRelevantSectionsNamedDatasets(false);
localSequence.setRelevantSectionsImplicitDatasets(false);
selectedSequences.add(localSequence);
}

} catch (XPathExpressionException e) {
@@ -2004,12 +2002,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do

Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target));
if (referenceInformation != null) {
BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight());
String refTextClean = refText.replaceAll("[\\[\\], ]+", "");

biblioRefMap.put(refTextClean, biblioItem);
BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem(doc, (org.w3c.dom.Element) referenceInformation.getRight());

Integer refKey = biblioComponentWrapper.getRefKey(target);
biblioRefMap.put(String.valueOf(refKey), biblioItem);
BiblioComponent biblioComponent = new BiblioComponent(
biblioItem, refKey
);
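
Keying biblioRefMap by the numeric ref key instead of the bracket-stripped callout text avoids collisions, because the old regex collapses grouped callouts into ambiguous strings. A small standalone demonstration of the failure mode (not taken from the codebase):

public class RefKeyDemo {
    public static void main(String[] args) {
        // the old text-derived key cannot distinguish a grouped callout
        // from a single one once brackets, commas and spaces are stripped
        System.out.println("[1, 2]".replaceAll("[\\[\\], ]+", "")); // prints 12
        System.out.println("[12]".replaceAll("[\\[\\], ]+", ""));   // prints 12
    }
}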
@@ -2074,14 +2070,17 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do


// TODO make sure that selectedSequences == allSentences above in the processPDF?
List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList();
List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).collect(Collectors.toList());
List<DataseerResults> dataseerClassificationResults = classifyWithDataseerClassifier(allSentences);

for (int i = 0; i < entities.size(); i++) {
List<Dataset> localDatasets = entities.get(i);
if (CollectionUtils.isEmpty(localDatasets)) {
continue;
}
if (i >= dataseerClassificationResults.size()) {
break;
}
for (Dataset localDataset : localDatasets) {
if (localDataset == null) {
continue;
@@ -2242,7 +2241,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do

// mark datasets present in Data Availability section(s)
if (CollectionUtils.isNotEmpty(availabilitySequences)) {
List<LayoutToken> availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).toList();
List<LayoutToken> availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).collect(Collectors.toList());
entities = markDAS(entities, availabilityTokens);
}

57 changes: 29 additions & 28 deletions src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -92,7 +92,7 @@ public static Element getFirstDirectChild(Element parent, String name) {

public static Element getLastDirectChild(Element parent, String name) {
NodeList children = parent.getChildNodes();
for(int j=children.getLength()-1; j>0; j--) {
for(int j=children.getLength()-1; j>=0; j--) {
Node child = children.item(j);
if (child instanceof Element && name.equals(child.getNodeName()))
return (Element) child;
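
The previous bound j > 0 never examined index 0, so a matching element that happened to be the first child was missed when no later sibling matched. A quick standalone check (assumes XMLUtilities is on the classpath):

import javax.xml.parsers.DocumentBuilderFactory;
import org.grobid.core.utilities.XMLUtilities;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

public class LastChildCheck {
    public static void main(String[] args) throws Exception {
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().newDocument();
        Element parent = doc.createElement("div");
        parent.appendChild(doc.createElement("p")); // the only child sits at index 0
        // with j > 0 this returned null; with j >= 0 the element is found
        System.out.println(XMLUtilities.getLastDirectChild(parent, "p"));
    }
}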
@@ -114,13 +114,13 @@ public static String getText(Element element) {
return found ? buf.toString() : null;
}

public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Element biblStructElement) {
public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.dom.Element biblStructElement) {
BiblStructSaxHandler handler = new BiblStructSaxHandler();
String teiXML = null;
try {
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser p = spf.newSAXParser();
teiXML = serialize(null, biblStructElement);
teiXML = serialize(doc, biblStructElement);
p.parse(new InputSource(new StringReader(teiXML)), handler);
} catch(Exception e) {
if (teiXML != null)
@@ -192,30 +192,27 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
}

// get the ref marker text
NodeList list2 = node.getChildNodes();
for (int j = 0; j < list2.getLength(); j++) {
Node subChildNode = list2.item(j);
if (subChildNode.getNodeType() == Node.TEXT_NODE) {
String chunk = normalize(getTextRecursively(node));

if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) {
Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE);
right.put(StringUtils.strip(chunk), refInfo);
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
} else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) {
org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE);
right.put(StringUtils.strip(chunk), urlInfo);
// we still add it like normal text
buf.append(chunk);
found = true;
} else {
// other refs are filtered out
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
}
indexPos += chunk.length();
String refFullText = normalize(getTextRecursively(node));
if (refFullText != null && !refFullText.isEmpty()) {
String chunk = refFullText;

if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) {
Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE);
right.put(StringUtils.strip(chunk), refInfo);
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
} else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) {
org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE);
right.put(StringUtils.strip(chunk), urlInfo);
// we still add it like normal text
buf.append(chunk);
found = true;
} else {
// other refs are filtered out
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
}
indexPos += chunk.length();
}
} else {
// get the text recursively
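
The rewrite above replaces a loop that only fired on direct TEXT_NODE children, which produced no text at all for a callout whose content is nested inside child elements. Gathering the text once per ref via getTextRecursively handles such nesting. An illustrative TEI fragment (assumed markup, not taken from this diff):

<!-- a nested biblio callout: the old direct-child loop saw no TEXT_NODE here -->
<ref type="bibr" target="#b1">[<hi rend="bold">1</hi>]</ref>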
@@ -271,7 +268,7 @@ public static String serialize(org.w3c.dom.Document doc, Node node) {
XPathExpression xpathExp = xpathFactory.newXPath().compile(
"//text()[normalize-space(.) = '']");
NodeList emptyTextNodes = (NodeList)
xpathExp.evaluate(doc, XPathConstants.NODESET);
xpathExp.evaluate(node, XPathConstants.NODESET);

// Remove each empty text node from document.
for (int i = 0; i < emptyTextNodes.getLength(); i++) {
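
Evaluating the XPath against node rather than doc matters because serialize was also called with a null document (see parseTEIBiblioItem before this PR), and a null evaluation context throws. For an attached node the selection is unchanged, since //text() is an absolute path resolved from the node's owning document. A standalone illustration:

import java.io.StringReader;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public class XPathContextDemo {
    public static void main(String[] args) throws Exception {
        Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                .parse(new InputSource(new StringReader("<a> <b>x</b> </a>")));
        XPathExpression exp = XPathFactory.newInstance().newXPath()
                .compile("//text()[normalize-space(.) = '']");
        // evaluating against the element works even with no Document reference
        // at hand; a null context would throw instead
        NodeList empty = (NodeList) exp.evaluate(doc.getDocumentElement(), XPathConstants.NODESET);
        System.out.println(empty.getLength()); // 2 whitespace-only text nodes
    }
}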
@@ -470,7 +467,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
(textualElements.contains(n.getNodeName())) ) {
// text content
//String text = n.getTextContent();
StringBuffer textBuffer = new StringBuffer();
StringBuilder textBuffer = new StringBuilder();
NodeList childNodes = n.getChildNodes();
for(int y=0; y<childNodes.getLength(); y++) {
textBuffer.append(serialize(doc, childNodes.item(y)));
@@ -487,6 +484,10 @@
LOGGER.warn("The sentence segmentation failed for: " + text);
}

if (theSentenceBoundaries == null || theSentenceBoundaries.isEmpty()) {
continue;
}

// we're making a first pass to ensure that there is no element broken by the segmentation
List<String> sentences = new ArrayList<String>();
List<String> toConcatenate = new ArrayList<String>();
@@ -274,7 +274,7 @@ public static Response processDatasetPDF(final InputStream inputStream,
if (!isResultOK(retValString)) {
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
}
} catch (Exception exp) {
LOGGER.error("An unexpected exception occurs. ", exp);
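
Declaring the payload as application/json (here and in the two handlers below) lets clients dispatch on the Content-Type header instead of sniffing the body. A hypothetical client-side check, with host, port, and path assumed rather than taken from this PR:

import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

public class ContentTypeCheck {
    public static void main(String[] args) {
        Client client = ClientBuilder.newClient();
        // endpoint path is a placeholder; substitute the real service route
        Response resp = client.target("http://localhost:8060")
                .path("service/processDatasetPDF")
                .request(MediaType.APPLICATION_JSON)
                .get();
        // with this change the declared media type matches the JSON body
        System.out.println(MediaType.APPLICATION_JSON_TYPE.isCompatible(resp.getMediaType()));
        client.close();
    }
}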
@@ -366,12 +366,7 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean
if (!isResultOK(retValString)) {
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
/*response = Response
.ok()
.type("application/json")
.entity(retValString)
.build();*/
response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
}
}

@@ -382,7 +377,8 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean
LOGGER.error("An unexpected exception occurs. ", exp);
response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
} finally {
IOUtilities.removeTempFile(originFile);
if (originFile != null)
IOUtilities.removeTempFile(originFile);
}
LOGGER.debug(methodLogOut());
return response;
@@ -467,7 +463,7 @@ public static Response processDatasetTEI(
if (!isResultOK(retValString)) {
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
}
}

@@ -478,7 +474,8 @@ public static Response processDatasetTEI(
LOGGER.error("An unexpected exception occurs. ", exp);
response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
} finally {
IOUtilities.removeTempFile(originFile);
if (originFile != null)
IOUtilities.removeTempFile(originFile);
}
LOGGER.debug(methodLogOut());
return response;