From f58c4930ff85c5bf413e3e26fdd93a1c8e420ab1 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 2 Jan 2025 19:48:38 +0100
Subject: [PATCH 1/3] fix extraction of urls that are not well formed
 (supplementary-material generated by pub2tei)

(cherry picked from commit 39c0e43ee5d6cfb5a2f4d7ef0439ec814cceb73c)
---
 .../core/engines/DatasetDisambiguator.java    | 77 +++++--------------
 .../grobid/core/engines/DatasetParser.java    |  6 +-
 .../grobid/core/utilities/XMLUtilities.java   |  4 +-
 3 files changed, 27 insertions(+), 60 deletions(-)

diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
index 4744b25..c1b78e7 100644
--- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
+++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
@@ -1,73 +1,36 @@
 package org.grobid.core.engines;
 
-import nu.xom.Attribute;
-import nu.xom.Element;
+import com.fasterxml.jackson.core.io.JsonStringEncoder;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpEntity;
 import org.apache.http.client.config.RequestConfig;
-import org.grobid.core.GrobidModels;
-import org.grobid.core.data.DatasetComponent;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.conn.HttpHostConnectException;
+import org.apache.http.entity.ContentType;
+import org.apache.http.entity.mime.HttpMultipartMode;
+import org.apache.http.entity.mime.MultipartEntityBuilder;
+import org.apache.http.entity.mime.content.StringBody;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
 import org.grobid.core.data.Dataset;
-import org.grobid.core.data.BiblioItem;
-import org.grobid.core.document.Document;
-import org.grobid.core.document.DocumentPiece;
-import org.grobid.core.document.DocumentSource;
-import org.grobid.core.document.xml.XmlBuilderUtils;
-import org.grobid.core.engines.config.GrobidAnalysisConfig;
-import org.grobid.core.engines.label.DatasetTaggingLabels;
-import org.grobid.core.engines.label.SegmentationLabels;
-import org.grobid.core.engines.label.TaggingLabel;
-import org.grobid.core.engines.label.TaggingLabels;
-import org.grobid.core.exceptions.GrobidException;
-import org.grobid.core.factory.GrobidFactory;
-import org.grobid.core.features.FeaturesVectorDataseer;
-import org.grobid.core.layout.BoundingBox;
+import org.grobid.core.data.DatasetComponent;
 import org.grobid.core.layout.LayoutToken;
-import org.grobid.core.layout.LayoutTokenization;
-import org.grobid.core.lexicon.DatastetLexicon;
 import org.grobid.core.utilities.DatastetConfiguration;
-import org.grobid.core.utilities.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.xml.sax.InputSource;
-
-import com.fasterxml.jackson.core.*;
-import com.fasterxml.jackson.databind.*;
-import com.fasterxml.jackson.databind.node.*;
-import com.fasterxml.jackson.annotation.*;
-import com.fasterxml.jackson.core.io.*;
-
-import java.io.*;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.*;
 
-import java.net.HttpURLConnection;
+import java.io.File;
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
-
-import org.apache.http.HttpResponse;
-import org.apache.http.NameValuePair;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.client.methods.HttpPost;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.message.BasicNameValuePair;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClients;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.HttpEntity;
-import org.apache.http.util.EntityUtils;
-import org.apache.http.entity.mime.content.StringBody;
-import org.apache.http.entity.ContentType;
-import org.apache.http.entity.mime.MultipartEntityBuilder;
-import org.apache.http.entity.mime.HttpMultipartMode;
-import org.apache.http.conn.HttpHostConnectException;
-import org.apache.commons.lang3.tuple.Pair;
-
-import static org.apache.commons.lang3.StringUtils.*;
-import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
+import java.util.*;
 
 /**
  * Dataset entity disambiguator. Once dataset mentions are recognized and grouped
diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java
index 60125b6..39cffe6 100644
--- a/src/main/java/org/grobid/core/engines/DatasetParser.java
+++ b/src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -550,7 +550,11 @@ private List<DatasetComponent> addUrlComponentsAsReferences(DatasetDocumentSeque
             String target = urlInfos.getMiddle();
 //            String type = urlInfos.getRight();
 
-            DatasetComponent urlComponent = new DatasetComponent(sequence.getText().substring(pos.start, pos.end));
+            String sequenceText = sequence.getText();
+            if (pos.start < 0 || pos.start >= sequenceText.length() || pos.end < pos.start || pos.end > sequenceText.length()) {
+                continue; // malformed offsets: skip instead of throwing; pos.end is exclusive, so pos.end == length is valid
+            }
+            DatasetComponent urlComponent = new DatasetComponent(sequenceText.substring(pos.start, pos.end));
             urlComponent.setOffsetStart(pos.start);
             urlComponent.setOffsetEnd(pos.end);
             if (target != null) {
diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
index d5d5a95..63aebab 100644
--- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java
+++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -223,7 +223,7 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
                     for (int j = 0; j < list2.getLength(); j++) {
                         Node node2 = list2.item(j);
                         if (node2.getNodeType() == Node.TEXT_NODE) {
-                            String chunk = node2.getNodeValue();
+                            String chunk = normalize(node2.getNodeValue());
                             buf.append(chunk);
                             found = true;
                             indexPos += chunk.length();
@@ -231,7 +231,7 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
                     }
                 }
             } else if (node.getNodeType() == Node.TEXT_NODE) {
-                String chunk = node.getNodeValue();
+                String chunk = normalize(node.getNodeValue());
                 buf.append(chunk);
                 found = true;
                 indexPos += chunk.length();

From d280410c35133a90ac1f194d3840a506fddcbcb6 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Mon, 14 Apr 2025 08:14:15 +0200
Subject: [PATCH 2/3] avoid reference parsing errors

---
 src/main/java/org/grobid/core/engines/DatasetParser.java  | 4 ++--
 src/main/java/org/grobid/core/utilities/XMLUtilities.java | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java
index a57bee6..46398ee 100644
--- a/src/main/java/org/grobid/core/engines/DatasetParser.java
+++ b/src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -2004,7 +2004,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
 
                 Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target));
                 if (referenceInformation != null) {
-                    BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight());
+                    BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem(doc, (org.w3c.dom.Element) referenceInformation.getRight());
                     String refTextClean = refText.replaceAll("[\\[\\], ]+", "");
 
                     biblioRefMap.put(refTextClean, biblioItem);
@@ -2074,7 +2074,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
 
 
         // TODO make sure that selectedSequences == allSentences above in the processPDF?
-        List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList();
+        List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).collect(Collectors.toList());
         List<DataseerResults> dataseerClassificationResults = classifyWithDataseerClassifier(allSentences);
 
         for (int i = 0; i < entities.size(); i++) {
diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
index 63aebab..0666b80 100644
--- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java
+++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -114,13 +114,13 @@ public static String getText(Element element) {
         return found ? buf.toString() : null;
     }
 
-    public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Element biblStructElement) {
+    public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.dom.Element biblStructElement) {
         BiblStructSaxHandler handler = new BiblStructSaxHandler();
         String teiXML = null;
         try {
             SAXParserFactory spf = SAXParserFactory.newInstance();
             SAXParser p = spf.newSAXParser();
-            teiXML = serialize(null, biblStructElement);
+            teiXML = serialize(doc, biblStructElement);
             p.parse(new InputSource(new StringReader(teiXML)), handler);
         } catch(Exception e) {
             if (teiXML != null)
@@ -271,7 +271,7 @@ public static String serialize(org.w3c.dom.Document doc, Node node) {
             XPathExpression xpathExp = xpathFactory.newXPath().compile(
                     "//text()[normalize-space(.) = '']");
             NodeList emptyTextNodes = (NodeList)
-                    xpathExp.evaluate(doc, XPathConstants.NODESET);
+                    xpathExp.evaluate(node, XPathConstants.NODESET);
 
             // Remove each empty text node from document.
             for (int i = 0; i < emptyTextNodes.getLength(); i++) {
@@ -470,7 +470,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
                  (textualElements.contains(n.getNodeName())) ) {
                 // text content
                 //String text = n.getTextContent();
-                StringBuffer textBuffer = new StringBuffer();
+                StringBuilder textBuffer = new StringBuilder();
                 NodeList childNodes = n.getChildNodes();
                 for(int y=0; y<childNodes.getLength(); y++) {
                     textBuffer.append(serialize(doc, childNodes.item(y)));

From 593150ed80de5b795ab18090eedc7dcd755c1356 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 12 Apr 2026 22:17:09 +0000
Subject: [PATCH 3/3] Fix multiple bugs in TEI processing pipeline

- Fix .toList() Java 16 incompatibility (use Collectors.toList() for Java 11)
- Fix wrong customisation file name (software -> dataset) in DatasetDisambiguator
- Add title to selectedSequences in processTEIDocument (was silently discarded)
- Fix NPE in XMLUtilities.segment() when sentence detection fails
- Fix biblioRefMap key mismatch (use consistent refKey integer keys)
- Add bounds check for classifier results to prevent IndexOutOfBoundsException
- Fix Content-Type mismatch: use APPLICATION_JSON instead of TEXT_PLAIN for JSON endpoints
- Fix off-by-one in getLastDirectChild (loop now checks index 0)
- Fix DatastetAnalyzer.getInstance() race condition (restore synchronized block)
- Fix getTextNoRefMarkersAndMarkerPositions duplicating content for multi-child refs
- Fix DatasetParser.getInstance() broken double-checked locking
- Add null checks for originFile in finally blocks

https://claude.ai/code/session_018EBZhK2RtGtsvN4E1rp2tF
---
 .../core/analyzers/DatastetAnalyzer.java      |  5 +-
 .../core/engines/DatasetDisambiguator.java    |  2 +-
 .../grobid/core/engines/DatasetParser.java    | 23 +++++----
 .../grobid/core/utilities/XMLUtilities.java   | 49 ++++++++++---------
 .../controller/DatastetProcessFile.java       | 17 +++----
 5 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java
index b089324..e0a9b10 100644
--- a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java
+++ b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java
@@ -19,11 +19,10 @@ public class DatastetAnalyzer implements org.grobid.core.analyzers.Analyzer {
 
     public static DatastetAnalyzer getInstance() {
         if (instance == null) {
-            //double check idiom
-            // synchronized (instanceController) {
+            synchronized (DatastetAnalyzer.class) {
                 if (instance == null)
                     getNewInstance();
-            // }
+            }
         }
         return instance;
     }
diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
index c1b78e7..4562aec 100644
--- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
+++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java
@@ -187,7 +187,7 @@ public void ensureCustomizationReady() {
                 LOGGER.debug("Calling: " + url.toString());
 //System.out.println("Calling: " + url.toString());
                 // load the dataset customisation
-                File cutomisationFile = new File("resources/config/customisation-software.json");
+                File cutomisationFile = new File("resources/config/customisation-dataset.json");
                 cutomisationFile = new File(cutomisationFile.getAbsolutePath());
 
                 String json = FileUtils.readFileToString(cutomisationFile, "UTF-8");
diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java
index 46398ee..f430514 100644
--- a/src/main/java/org/grobid/core/engines/DatasetParser.java
+++ b/src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -78,18 +78,15 @@ public class DatasetParser extends AbstractParser {
 
     public static DatasetParser getInstance(DatastetConfiguration configuration) {
         if (instance == null) {
-            getNewInstance(configuration);
+            synchronized (DatasetParser.class) {
+                if (instance == null) {
+                    instance = new DatasetParser(configuration);
+                }
+            }
         }
         return instance;
     }
 
-    /**
-     * Create a new instance.
-     */
-    private static synchronized void getNewInstance(DatastetConfiguration configuration) {
-        instance = new DatasetParser(configuration);
-    }
-
     protected DatasetParser(GrobidModel model) {
         super(model);
     }
@@ -1607,6 +1604,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
                 DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, titleId);
                 localSequence.setRelevantSectionsNamedDatasets(false);
                 localSequence.setRelevantSectionsImplicitDatasets(false);
+                selectedSequences.add(localSequence);
             }
 
         } catch (XPathExpressionException e) {
@@ -2005,11 +2003,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
                 Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target));
                 if (referenceInformation != null) {
                     BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem(doc, (org.w3c.dom.Element) referenceInformation.getRight());
-                    String refTextClean = refText.replaceAll("[\\[\\], ]+", "");
-
-                    biblioRefMap.put(refTextClean, biblioItem);
 
                     Integer refKey = biblioComponentWrapper.getRefKey(target);
+                    biblioRefMap.put(String.valueOf(refKey), biblioItem);
                     BiblioComponent biblioComponent = new BiblioComponent(
                             biblioItem, refKey
                     );
@@ -2082,6 +2078,9 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
             if (CollectionUtils.isEmpty(localDatasets)) {
                 continue;
             }
+            if (i >= dataseerClassificationResults.size()) {
+                break;
+            }
             for (Dataset localDataset : localDatasets) {
                 if (localDataset == null) {
                     continue;
@@ -2242,7 +2241,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
 
         // mark datasets present in Data Availability section(s)
         if (CollectionUtils.isNotEmpty(availabilitySequences)) {
-            List<LayoutToken> availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).toList();
+            List<LayoutToken> availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).collect(Collectors.toList());
             entities = markDAS(entities, availabilityTokens);
         }
 
diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
index 0666b80..d20a88c 100644
--- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java
+++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -92,7 +92,7 @@ public static Element getFirstDirectChild(Element parent, String name) {
 
     public static Element getLastDirectChild(Element parent, String name) {
         NodeList children = parent.getChildNodes();
-        for(int j=children.getLength()-1; j>0; j--) {
+        for(int j=children.getLength()-1; j>=0; j--) {
             Node child = children.item(j);
             if (child instanceof Element && name.equals(child.getNodeName()))
                 return (Element) child;
@@ -192,30 +192,27 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
                     }
 
                     // get the ref marker text
-                    NodeList list2 = node.getChildNodes();
-                    for (int j = 0; j < list2.getLength(); j++) {
-                        Node subChildNode = list2.item(j);
-                        if (subChildNode.getNodeType() == Node.TEXT_NODE) {
-                            String chunk = normalize(getTextRecursively(node));
-
-                            if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) {
-                                Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE);
-                                right.put(StringUtils.strip(chunk), refInfo);
-                                String holder = StringUtils.repeat(" ", chunk.length());
-                                buf.append(holder);
-                            } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) {
-                                org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE);
-                                right.put(StringUtils.strip(chunk), urlInfo);
-                                // we still add added like normal text
-                                buf.append(chunk);
-                                found = true;
-                            } else {
-                                // other ref are filtered out
-                                String holder = StringUtils.repeat(" ", chunk.length());
-                                buf.append(holder);
-                            }
-                            indexPos += chunk.length();
+                    String refFullText = normalize(getTextRecursively(node));
+                    if (refFullText != null && !refFullText.isEmpty()) {
+                        String chunk = refFullText;
+
+                        if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) {
+                            Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE);
+                            right.put(StringUtils.strip(chunk), refInfo);
+                            String holder = StringUtils.repeat(" ", chunk.length());
+                            buf.append(holder);
+                        } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) {
+                            org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE);
+                            right.put(StringUtils.strip(chunk), urlInfo);
+                            // we still add added like normal text
+                            buf.append(chunk);
+                            found = true;
+                        } else {
+                            // other ref are filtered out
+                            String holder = StringUtils.repeat(" ", chunk.length());
+                            buf.append(holder);
                         }
+                        indexPos += chunk.length();
                     }
                 } else {
                     // get the text recursively
@@ -487,6 +484,10 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
                     LOGGER.warn("The sentence segmentation failed for: " + text);
                 }
 
+                if (theSentenceBoundaries == null || theSentenceBoundaries.isEmpty()) {
+                    continue;
+                }
+
                 // we're making a first pass to ensure that there is no element broken by the segmentation
                 List<String> sentences = new ArrayList<String>();
                 List<String> toConcatenate = new ArrayList<String>();
diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java
index 95a65bd..ae9504e 100644
--- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java
+++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java
@@ -274,7 +274,7 @@ public static Response processDatasetPDF(final InputStream inputStream,
             if (!isResultOK(retValString)) {
                 response = Response.status(Status.NO_CONTENT).build();
             } else {
-                response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
+                response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
             }
         } catch (Exception exp) {
             LOGGER.error("An unexpected exception occurs. ", exp);
@@ -366,12 +366,7 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean
                 if (!isResultOK(retValString)) {
                     response = Response.status(Status.NO_CONTENT).build();
                 } else {
-                    response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
-                    /*response = Response
-                            .ok()
-                            .type("application/json")
-                            .entity(retValString)
-                            .build();*/
+                    response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
                 }
             }
 
@@ -382,7 +377,8 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean
             LOGGER.error("An unexpected exception occurs. ", exp);
             response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
         } finally {
-            IOUtilities.removeTempFile(originFile);
+            if (originFile != null)
+                IOUtilities.removeTempFile(originFile);
         }
         LOGGER.debug(methodLogOut());
         return response;
@@ -467,7 +463,7 @@ public static Response processDatasetTEI(
                 if (!isResultOK(retValString)) {
                     response = Response.status(Status.NO_CONTENT).build();
                 } else {
-                    response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
+                    response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
                 }
             }
 
@@ -478,7 +474,8 @@ public static Response processDatasetTEI(
             LOGGER.error("An unexpected exception occurs. ", exp);
             response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
         } finally {
-            IOUtilities.removeTempFile(originFile);
+            if (originFile != null)
+                IOUtilities.removeTempFile(originFile);
         }
         LOGGER.debug(methodLogOut());
         return response;