From f58c4930ff85c5bf413e3e26fdd93a1c8e420ab1 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Jan 2025 19:48:38 +0100 Subject: [PATCH 1/3] fix extraction of urls that are not well formed (supplementary-material generated by pub2tei) (cherry picked from commit 39c0e43ee5d6cfb5a2f4d7ef0439ec814cceb73c) --- .../core/engines/DatasetDisambiguator.java | 77 +++++-------------- .../grobid/core/engines/DatasetParser.java | 6 +- .../grobid/core/utilities/XMLUtilities.java | 4 +- 3 files changed, 27 insertions(+), 60 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java index 4744b25..c1b78e7 100644 --- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java +++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java @@ -1,73 +1,36 @@ package org.grobid.core.engines; -import nu.xom.Attribute; -import nu.xom.Element; +import com.fasterxml.jackson.core.io.JsonStringEncoder; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.http.HttpEntity; import org.apache.http.client.config.RequestConfig; -import org.grobid.core.GrobidModels; -import org.grobid.core.data.DatasetComponent; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.conn.HttpHostConnectException; +import org.apache.http.entity.ContentType; +import org.apache.http.entity.mime.HttpMultipartMode; +import org.apache.http.entity.mime.MultipartEntityBuilder; +import org.apache.http.entity.mime.content.StringBody; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; import org.grobid.core.data.Dataset; -import org.grobid.core.data.BiblioItem; -import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentPiece; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.document.xml.XmlBuilderUtils; -import org.grobid.core.engines.config.GrobidAnalysisConfig; -import org.grobid.core.engines.label.DatasetTaggingLabels; -import org.grobid.core.engines.label.SegmentationLabels; -import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.engines.label.TaggingLabels; -import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.features.FeaturesVectorDataseer; -import org.grobid.core.layout.BoundingBox; +import org.grobid.core.data.DatasetComponent; import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; -import org.grobid.core.lexicon.DatastetLexicon; import org.grobid.core.utilities.DatastetConfiguration; -import org.grobid.core.utilities.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.InputSource; - -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - -import java.io.*; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.*; -import java.net.HttpURLConnection; +import java.io.File; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; - -import org.apache.http.HttpResponse; -import org.apache.http.NameValuePair; -import org.apache.http.client.HttpClient; -import org.apache.http.client.entity.UrlEncodedFormEntity; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpPost; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.message.BasicNameValuePair; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.HttpEntity; -import org.apache.http.util.EntityUtils; -import org.apache.http.entity.mime.content.StringBody; -import org.apache.http.entity.ContentType; -import org.apache.http.entity.mime.MultipartEntityBuilder; -import org.apache.http.entity.mime.HttpMultipartMode; -import org.apache.http.conn.HttpHostConnectException; -import org.apache.commons.lang3.tuple.Pair; - -import static org.apache.commons.lang3.StringUtils.*; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; +import java.util.*; /** * Dataset entity disambiguator. Once dataset mentions are recognized and grouped diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 60125b6..39cffe6 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -550,7 +550,11 @@ private List addUrlComponentsAsReferences(DatasetDocumentSeque String target = urlInfos.getMiddle(); // String type = urlInfos.getRight(); - DatasetComponent urlComponent = new DatasetComponent(sequence.getText().substring(pos.start, pos.end)); + String sequenceText = sequence.getText(); + if (sequenceText.length() <= pos.start || sequenceText.length() <= pos.end) { + continue; + } + DatasetComponent urlComponent = new DatasetComponent(sequenceText.substring(pos.start, pos.end)); urlComponent.setOffsetStart(pos.start); urlComponent.setOffsetEnd(pos.end); if (target != null) { diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index d5d5a95..63aebab 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -223,7 +223,7 @@ public static Pair>> g for (int j = 0; j < list2.getLength(); j++) { Node node2 = list2.item(j); if (node2.getNodeType() == Node.TEXT_NODE) { - String chunk = node2.getNodeValue(); + String chunk = normalize(node2.getNodeValue()); buf.append(chunk); found = true; indexPos += chunk.length(); @@ -231,7 +231,7 @@ public static Pair>> g } } } else if (node.getNodeType() == Node.TEXT_NODE) { - String chunk = node.getNodeValue(); + String chunk = normalize(node.getNodeValue()); buf.append(chunk); found = true; indexPos += chunk.length(); From d280410c35133a90ac1f194d3840a506fddcbcb6 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 14 Apr 2025 08:14:15 +0200 Subject: [PATCH 2/3] avoid reference parsing errors --- src/main/java/org/grobid/core/engines/DatasetParser.java | 4 ++-- src/main/java/org/grobid/core/utilities/XMLUtilities.java | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index a57bee6..46398ee 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -2004,7 +2004,7 @@ public Pair>, List> processTEIDocument(org.w3c.do Pair referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target)); if (referenceInformation != null) { - BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight()); + BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem(doc, (org.w3c.dom.Element) referenceInformation.getRight()); String refTextClean = refText.replaceAll("[\\[\\], ]+", ""); biblioRefMap.put(refTextClean, biblioItem); @@ -2074,7 +2074,7 @@ public Pair>, List> processTEIDocument(org.w3c.do // TODO make sure that selectedSequences == allSentences above in the processPDF? - List allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList(); + List allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).collect(Collectors.toList()); List dataseerClassificationResults = classifyWithDataseerClassifier(allSentences); for (int i = 0; i < entities.size(); i++) { diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index 63aebab..0666b80 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -114,13 +114,13 @@ public static String getText(Element element) { return found ? buf.toString() : null; } - public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Element biblStructElement) { + public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.dom.Element biblStructElement) { BiblStructSaxHandler handler = new BiblStructSaxHandler(); String teiXML = null; try { SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser p = spf.newSAXParser(); - teiXML = serialize(null, biblStructElement); + teiXML = serialize(doc, biblStructElement); p.parse(new InputSource(new StringReader(teiXML)), handler); } catch(Exception e) { if (teiXML != null) @@ -271,7 +271,7 @@ public static String serialize(org.w3c.dom.Document doc, Node node) { XPathExpression xpathExp = xpathFactory.newXPath().compile( "//text()[normalize-space(.) = '']"); NodeList emptyTextNodes = (NodeList) - xpathExp.evaluate(doc, XPathConstants.NODESET); + xpathExp.evaluate(node, XPathConstants.NODESET); // Remove each empty text node from document. for (int i = 0; i < emptyTextNodes.getLength(); i++) { @@ -470,7 +470,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { (textualElements.contains(n.getNodeName())) ) { // text content //String text = n.getTextContent(); - StringBuffer textBuffer = new StringBuffer(); + StringBuilder textBuffer = new StringBuilder(); NodeList childNodes = n.getChildNodes(); for(int y=0; y Date: Sun, 12 Apr 2026 22:17:09 +0000 Subject: [PATCH 3/3] Fix multiple bugs in TEI processing pipeline - Fix .toList() Java 16 incompatibility (use Collectors.toList() for Java 11) - Fix wrong customisation file name (software -> dataset) in DatasetDisambiguator - Add title to selectedSequences in processTEIDocument (was silently discarded) - Fix NPE in XMLUtilities.segment() when sentence detection fails - Fix biblioRefMap key mismatch (use consistent refKey integer keys) - Add bounds check for classifier results to prevent IndexOutOfBoundsException - Fix Content-Type mismatch: use APPLICATION_JSON instead of TEXT_PLAIN for JSON endpoints - Fix off-by-one in getLastDirectChild (loop now checks index 0) - Fix DatastetAnalyzer.getInstance() race condition (restore synchronized block) - Fix getTextNoRefMarkersAndMarkerPositions duplicating content for multi-child refs - Fix DatasetParser.getInstance() broken double-checked locking - Add null checks for originFile in finally blocks https://claude.ai/code/session_018EBZhK2RtGtsvN4E1rp2tF --- .../core/analyzers/DatastetAnalyzer.java | 5 +- .../core/engines/DatasetDisambiguator.java | 2 +- .../grobid/core/engines/DatasetParser.java | 23 +++++---- .../grobid/core/utilities/XMLUtilities.java | 49 ++++++++++--------- .../controller/DatastetProcessFile.java | 17 +++---- 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java index b089324..e0a9b10 100644 --- a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java +++ b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java @@ -19,11 +19,10 @@ public class DatastetAnalyzer implements org.grobid.core.analyzers.Analyzer { public static DatastetAnalyzer getInstance() { if (instance == null) { - //double check idiom - // synchronized (instanceController) { + synchronized (DatastetAnalyzer.class) { if (instance == null) getNewInstance(); - // } + } } return instance; } diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java index c1b78e7..4562aec 100644 --- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java +++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java @@ -187,7 +187,7 @@ public void ensureCustomizationReady() { LOGGER.debug("Calling: " + url.toString()); //System.out.println("Calling: " + url.toString()); // load the dataset customisation - File cutomisationFile = new File("resources/config/customisation-software.json"); + File cutomisationFile = new File("resources/config/customisation-dataset.json"); cutomisationFile = new File(cutomisationFile.getAbsolutePath()); String json = FileUtils.readFileToString(cutomisationFile, "UTF-8"); diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index 46398ee..f430514 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -78,18 +78,15 @@ public class DatasetParser extends AbstractParser { public static DatasetParser getInstance(DatastetConfiguration configuration) { if (instance == null) { - getNewInstance(configuration); + synchronized (DatasetParser.class) { + if (instance == null) { + instance = new DatasetParser(configuration); + } + } } return instance; } - /** - * Create a new instance. - */ - private static synchronized void getNewInstance(DatastetConfiguration configuration) { - instance = new DatasetParser(configuration); - } - protected DatasetParser(GrobidModel model) { super(model); } @@ -1607,6 +1604,7 @@ public Pair>, List> processTEIDocument(org.w3c.do DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, titleId); localSequence.setRelevantSectionsNamedDatasets(false); localSequence.setRelevantSectionsImplicitDatasets(false); + selectedSequences.add(localSequence); } } catch (XPathExpressionException e) { @@ -2005,11 +2003,9 @@ public Pair>, List> processTEIDocument(org.w3c.do Pair referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target)); if (referenceInformation != null) { BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem(doc, (org.w3c.dom.Element) referenceInformation.getRight()); - String refTextClean = refText.replaceAll("[\\[\\], ]+", ""); - - biblioRefMap.put(refTextClean, biblioItem); Integer refKey = biblioComponentWrapper.getRefKey(target); + biblioRefMap.put(String.valueOf(refKey), biblioItem); BiblioComponent biblioComponent = new BiblioComponent( biblioItem, refKey ); @@ -2082,6 +2078,9 @@ public Pair>, List> processTEIDocument(org.w3c.do if (CollectionUtils.isEmpty(localDatasets)) { continue; } + if (i >= dataseerClassificationResults.size()) { + break; + } for (Dataset localDataset : localDatasets) { if (localDataset == null) { continue; @@ -2242,7 +2241,7 @@ public Pair>, List> processTEIDocument(org.w3c.do // mark datasets present in Data Availability section(s) if (CollectionUtils.isNotEmpty(availabilitySequences)) { - List availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).toList(); + List availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).collect(Collectors.toList()); entities = markDAS(entities, availabilityTokens); } diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index 0666b80..d20a88c 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -92,7 +92,7 @@ public static Element getFirstDirectChild(Element parent, String name) { public static Element getLastDirectChild(Element parent, String name) { NodeList children = parent.getChildNodes(); - for(int j=children.getLength()-1; j>0; j--) { + for(int j=children.getLength()-1; j>=0; j--) { Node child = children.item(j); if (child instanceof Element && name.equals(child.getNodeName())) return (Element) child; @@ -192,30 +192,27 @@ public static Pair>> g } // get the ref marker text - NodeList list2 = node.getChildNodes(); - for (int j = 0; j < list2.getLength(); j++) { - Node subChildNode = list2.item(j); - if (subChildNode.getNodeType() == Node.TEXT_NODE) { - String chunk = normalize(getTextRecursively(node)); - - if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) { - Triple refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE); - right.put(StringUtils.strip(chunk), refInfo); - String holder = StringUtils.repeat(" ", chunk.length()); - buf.append(holder); - } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) { - org.apache.commons.lang3.tuple.Triple urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE); - right.put(StringUtils.strip(chunk), urlInfo); - // we still add added like normal text - buf.append(chunk); - found = true; - } else { - // other ref are filtered out - String holder = StringUtils.repeat(" ", chunk.length()); - buf.append(holder); - } - indexPos += chunk.length(); + String refFullText = normalize(getTextRecursively(node)); + if (refFullText != null && !refFullText.isEmpty()) { + String chunk = refFullText; + + if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) { + Triple refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE); + right.put(StringUtils.strip(chunk), refInfo); + String holder = StringUtils.repeat(" ", chunk.length()); + buf.append(holder); + } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) { + org.apache.commons.lang3.tuple.Triple urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE); + right.put(StringUtils.strip(chunk), urlInfo); + // we still add added like normal text + buf.append(chunk); + found = true; + } else { + // other ref are filtered out + String holder = StringUtils.repeat(" ", chunk.length()); + buf.append(holder); } + indexPos += chunk.length(); } } else { // get the text recursively @@ -487,6 +484,10 @@ public static void segment(org.w3c.dom.Document doc, Node node) { LOGGER.warn("The sentence segmentation failed for: " + text); } + if (theSentenceBoundaries == null || theSentenceBoundaries.isEmpty()) { + continue; + } + // we're making a first pass to ensure that there is no element broken by the segmentation List sentences = new ArrayList(); List toConcatenate = new ArrayList(); diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index 95a65bd..ae9504e 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -274,7 +274,7 @@ public static Response processDatasetPDF(final InputStream inputStream, if (!isResultOK(retValString)) { response = Response.status(Status.NO_CONTENT).build(); } else { - response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build(); + response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build(); } } catch (Exception exp) { LOGGER.error("An unexpected exception occurs. ", exp); @@ -366,12 +366,7 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean if (!isResultOK(retValString)) { response = Response.status(Status.NO_CONTENT).build(); } else { - response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build(); - /*response = Response - .ok() - .type("application/json") - .entity(retValString) - .build();*/ + response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build(); } } @@ -382,7 +377,8 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean LOGGER.error("An unexpected exception occurs. ", exp); response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build(); } finally { - IOUtilities.removeTempFile(originFile); + if (originFile != null) + IOUtilities.removeTempFile(originFile); } LOGGER.debug(methodLogOut()); return response; @@ -467,7 +463,7 @@ public static Response processDatasetTEI( if (!isResultOK(retValString)) { response = Response.status(Status.NO_CONTENT).build(); } else { - response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build(); + response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build(); } } @@ -478,7 +474,8 @@ public static Response processDatasetTEI( LOGGER.error("An unexpected exception occurs. ", exp); response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build(); } finally { - IOUtilities.removeTempFile(originFile); + if (originFile != null) + IOUtilities.removeTempFile(originFile); } LOGGER.debug(methodLogOut()); return response;