diff --git a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java index b089324..e0a9b10 100644 --- a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java +++ b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java @@ -19,11 +19,10 @@ public class DatastetAnalyzer implements org.grobid.core.analyzers.Analyzer { public static DatastetAnalyzer getInstance() { if (instance == null) { - //double check idiom - // synchronized (instanceController) { + synchronized (DatastetAnalyzer.class) { if (instance == null) getNewInstance(); - // } + } } return instance; } diff --git a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java index c1b78e7..4562aec 100644 --- a/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java +++ b/src/main/java/org/grobid/core/engines/DatasetDisambiguator.java @@ -187,7 +187,7 @@ public void ensureCustomizationReady() { LOGGER.debug("Calling: " + url.toString()); //System.out.println("Calling: " + url.toString()); // load the dataset customisation - File cutomisationFile = new File("resources/config/customisation-software.json"); + File cutomisationFile = new File("resources/config/customisation-dataset.json"); cutomisationFile = new File(cutomisationFile.getAbsolutePath()); String json = FileUtils.readFileToString(cutomisationFile, "UTF-8"); diff --git a/src/main/java/org/grobid/core/engines/DatasetParser.java b/src/main/java/org/grobid/core/engines/DatasetParser.java index a57bee6..f430514 100644 --- a/src/main/java/org/grobid/core/engines/DatasetParser.java +++ b/src/main/java/org/grobid/core/engines/DatasetParser.java @@ -78,18 +78,15 @@ public class DatasetParser extends AbstractParser { public static DatasetParser getInstance(DatastetConfiguration configuration) { if (instance == null) { - getNewInstance(configuration); + synchronized (DatasetParser.class) { + if (instance == null) { + instance = new DatasetParser(configuration); + } + } } return instance; } - /** - * Create a new instance. - */ - private static synchronized void getNewInstance(DatastetConfiguration configuration) { - instance = new DatasetParser(configuration); - } - protected DatasetParser(GrobidModel model) { super(model); } @@ -1607,6 +1604,7 @@ public Pair>, List> processTEIDocument(org.w3c.do DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, titleId); localSequence.setRelevantSectionsNamedDatasets(false); localSequence.setRelevantSectionsImplicitDatasets(false); + selectedSequences.add(localSequence); } } catch (XPathExpressionException e) { @@ -2004,12 +2002,10 @@ public Pair>, List> processTEIDocument(org.w3c.do Pair referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target)); if (referenceInformation != null) { - BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight()); - String refTextClean = refText.replaceAll("[\\[\\], ]+", ""); - - biblioRefMap.put(refTextClean, biblioItem); + BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem(doc, (org.w3c.dom.Element) referenceInformation.getRight()); Integer refKey = biblioComponentWrapper.getRefKey(target); + biblioRefMap.put(String.valueOf(refKey), biblioItem); BiblioComponent biblioComponent = new BiblioComponent( biblioItem, refKey ); @@ -2074,7 +2070,7 @@ public Pair>, List> processTEIDocument(org.w3c.do // TODO make sure that selectedSequences == allSentences above in the processPDF? - List allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList(); + List allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).collect(Collectors.toList()); List dataseerClassificationResults = classifyWithDataseerClassifier(allSentences); for (int i = 0; i < entities.size(); i++) { @@ -2082,6 +2078,9 @@ public Pair>, List> processTEIDocument(org.w3c.do if (CollectionUtils.isEmpty(localDatasets)) { continue; } + if (i >= dataseerClassificationResults.size()) { + break; + } for (Dataset localDataset : localDatasets) { if (localDataset == null) { continue; @@ -2242,7 +2241,7 @@ public Pair>, List> processTEIDocument(org.w3c.do // mark datasets present in Data Availability section(s) if (CollectionUtils.isNotEmpty(availabilitySequences)) { - List availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).toList(); + List availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).collect(Collectors.toList()); entities = markDAS(entities, availabilityTokens); } diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index 63aebab..d20a88c 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -92,7 +92,7 @@ public static Element getFirstDirectChild(Element parent, String name) { public static Element getLastDirectChild(Element parent, String name) { NodeList children = parent.getChildNodes(); - for(int j=children.getLength()-1; j>0; j--) { + for(int j=children.getLength()-1; j>=0; j--) { Node child = children.item(j); if (child instanceof Element && name.equals(child.getNodeName())) return (Element) child; @@ -114,13 +114,13 @@ public static String getText(Element element) { return found ? buf.toString() : null; } - public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Element biblStructElement) { + public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.dom.Element biblStructElement) { BiblStructSaxHandler handler = new BiblStructSaxHandler(); String teiXML = null; try { SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser p = spf.newSAXParser(); - teiXML = serialize(null, biblStructElement); + teiXML = serialize(doc, biblStructElement); p.parse(new InputSource(new StringReader(teiXML)), handler); } catch(Exception e) { if (teiXML != null) @@ -192,30 +192,27 @@ public static Pair>> g } // get the ref marker text - NodeList list2 = node.getChildNodes(); - for (int j = 0; j < list2.getLength(); j++) { - Node subChildNode = list2.item(j); - if (subChildNode.getNodeType() == Node.TEXT_NODE) { - String chunk = normalize(getTextRecursively(node)); - - if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) { - Triple refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE); - right.put(StringUtils.strip(chunk), refInfo); - String holder = StringUtils.repeat(" ", chunk.length()); - buf.append(holder); - } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) { - org.apache.commons.lang3.tuple.Triple urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE); - right.put(StringUtils.strip(chunk), urlInfo); - // we still add added like normal text - buf.append(chunk); - found = true; - } else { - // other ref are filtered out - String holder = StringUtils.repeat(" ", chunk.length()); - buf.append(holder); - } - indexPos += chunk.length(); + String refFullText = normalize(getTextRecursively(node)); + if (refFullText != null && !refFullText.isEmpty()) { + String chunk = refFullText; + + if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) { + Triple refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE); + right.put(StringUtils.strip(chunk), refInfo); + String holder = StringUtils.repeat(" ", chunk.length()); + buf.append(holder); + } else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) { + org.apache.commons.lang3.tuple.Triple urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE); + right.put(StringUtils.strip(chunk), urlInfo); + // we still add added like normal text + buf.append(chunk); + found = true; + } else { + // other ref are filtered out + String holder = StringUtils.repeat(" ", chunk.length()); + buf.append(holder); } + indexPos += chunk.length(); } } else { // get the text recursively @@ -271,7 +268,7 @@ public static String serialize(org.w3c.dom.Document doc, Node node) { XPathExpression xpathExp = xpathFactory.newXPath().compile( "//text()[normalize-space(.) = '']"); NodeList emptyTextNodes = (NodeList) - xpathExp.evaluate(doc, XPathConstants.NODESET); + xpathExp.evaluate(node, XPathConstants.NODESET); // Remove each empty text node from document. for (int i = 0; i < emptyTextNodes.getLength(); i++) { @@ -470,7 +467,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) { (textualElements.contains(n.getNodeName())) ) { // text content //String text = n.getTextContent(); - StringBuffer textBuffer = new StringBuffer(); + StringBuilder textBuffer = new StringBuilder(); NodeList childNodes = n.getChildNodes(); for(int y=0; y sentences = new ArrayList(); List toConcatenate = new ArrayList(); diff --git a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java index 95a65bd..ae9504e 100644 --- a/src/main/java/org/grobid/service/controller/DatastetProcessFile.java +++ b/src/main/java/org/grobid/service/controller/DatastetProcessFile.java @@ -274,7 +274,7 @@ public static Response processDatasetPDF(final InputStream inputStream, if (!isResultOK(retValString)) { response = Response.status(Status.NO_CONTENT).build(); } else { - response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build(); + response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build(); } } catch (Exception exp) { LOGGER.error("An unexpected exception occurs. ", exp); @@ -366,12 +366,7 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean if (!isResultOK(retValString)) { response = Response.status(Status.NO_CONTENT).build(); } else { - response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build(); - /*response = Response - .ok() - .type("application/json") - .entity(retValString) - .build();*/ + response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build(); } } @@ -382,7 +377,8 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean LOGGER.error("An unexpected exception occurs. ", exp); response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build(); } finally { - IOUtilities.removeTempFile(originFile); + if (originFile != null) + IOUtilities.removeTempFile(originFile); } LOGGER.debug(methodLogOut()); return response; @@ -467,7 +463,7 @@ public static Response processDatasetTEI( if (!isResultOK(retValString)) { response = Response.status(Status.NO_CONTENT).build(); } else { - response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build(); + response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build(); } } @@ -478,7 +474,8 @@ public static Response processDatasetTEI( LOGGER.error("An unexpected exception occurs. ", exp); response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build(); } finally { - IOUtilities.removeTempFile(originFile); + if (originFile != null) + IOUtilities.removeTempFile(originFile); } LOGGER.debug(methodLogOut()); return response;