5 changes: 2 additions & 3 deletions src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java
@@ -19,11 +19,10 @@ public class DatastetAnalyzer implements org.grobid.core.analyzers.Analyzer {

public static DatastetAnalyzer getInstance() {
if (instance == null) {
//double check idiom
// synchronized (instanceController) {
synchronized (DatastetAnalyzer.class) {
if (instance == null)
getNewInstance();
// }
}
}
return instance;
}
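
Note on the double-checked locking above: it is only safe under the Java memory model if the instance field is declared volatile, which is not visible in this hunk. A minimal sketch of the full idiom, with the field declaration assumed and construction inlined rather than going through getNewInstance():

public class DatastetAnalyzer implements org.grobid.core.analyzers.Analyzer {
    // volatile is required so a partially constructed instance is never published
    private static volatile DatastetAnalyzer instance = null;

    public static DatastetAnalyzer getInstance() {
        if (instance == null) {                       // first check, no lock taken
            synchronized (DatastetAnalyzer.class) {
                if (instance == null) {               // second check, under the lock
                    instance = new DatastetAnalyzer();
                }
            }
        }
        return instance;
    }
}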
@@ -187,7 +187,7 @@ public void ensureCustomizationReady() {
LOGGER.debug("Calling: " + url.toString());
//System.out.println("Calling: " + url.toString());
// load the dataset customisation
File cutomisationFile = new File("resources/config/customisation-software.json");
File cutomisationFile = new File("resources/config/customisation-dataset.json");
cutomisationFile = new File(cutomisationFile.getAbsolutePath());

String json = FileUtils.readFileToString(cutomisationFile, "UTF-8");
27 changes: 13 additions & 14 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
@@ -78,18 +78,15 @@ public class DatasetParser extends AbstractParser {

public static DatasetParser getInstance(DatastetConfiguration configuration) {
if (instance == null) {
getNewInstance(configuration);
synchronized (DatasetParser.class) {
if (instance == null) {
instance = new DatasetParser(configuration);
}
}
}
return instance;
}

/**
* Create a new instance.
*/
private static synchronized void getNewInstance(DatastetConfiguration configuration) {
instance = new DatasetParser(configuration);
}

protected DatasetParser(GrobidModel model) {
super(model);
}
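
The same caveat applies to this inlined singleton: double-checked locking needs a volatile instance field, and since construction takes a DatastetConfiguration the holder-class idiom is not an option here. A sketch under the assumption that the field is declared in this class; note that the configuration passed by the first caller wins for all later callers:

private static volatile DatasetParser instance = null;

public static DatasetParser getInstance(DatastetConfiguration configuration) {
    if (instance == null) {
        synchronized (DatasetParser.class) {
            if (instance == null) {
                // only the first caller's configuration is ever used
                instance = new DatasetParser(configuration);
            }
        }
    }
    return instance;
}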
@@ -1607,6 +1604,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
DatasetDocumentSequence localSequence = new DatasetDocumentSequence(normalizedText, titleId);
localSequence.setRelevantSectionsNamedDatasets(false);
localSequence.setRelevantSectionsImplicitDatasets(false);
selectedSequences.add(localSequence);
}

} catch (XPathExpressionException e) {
@@ -2004,12 +2002,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do

Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target));
if (referenceInformation != null) {
BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight());
String refTextClean = refText.replaceAll("[\\[\\], ]+", "");

biblioRefMap.put(refTextClean, biblioItem);
BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem(doc, (org.w3c.dom.Element) referenceInformation.getRight());

Integer refKey = biblioComponentWrapper.getRefKey(target);
biblioRefMap.put(String.valueOf(refKey), biblioItem);
BiblioComponent biblioComponent = new BiblioComponent(
biblioItem, refKey
);
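
Keying biblioRefMap by the numeric ref key instead of the bracket-stripped callout text avoids collisions, because the old regex collapses grouped callouts into ambiguous strings. A small standalone demonstration of the failure mode (not taken from the codebase):

public class RefKeyDemo {
    public static void main(String[] args) {
        // the old text-derived key cannot distinguish a grouped callout
        // from a single one once brackets, commas and spaces are stripped
        System.out.println("[1, 2]".replaceAll("[\\[\\], ]+", "")); // prints 12
        System.out.println("[12]".replaceAll("[\\[\\], ]+", ""));   // prints 12
    }
}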
@@ -2074,14 +2070,17 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do


// TODO make sure that selectedSequences == allSentences above in the processPDF?
List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).toList();
List<String> allSentences = selectedSequences.stream().map(DatasetDocumentSequence::getText).collect(Collectors.toList());
List<DataseerResults> dataseerClassificationResults = classifyWithDataseerClassifier(allSentences);

for (int i = 0; i < entities.size(); i++) {
List<Dataset> localDatasets = entities.get(i);
if (CollectionUtils.isEmpty(localDatasets)) {
continue;
}
if (i >= dataseerClassificationResults.size()) {
break;
}
for (Dataset localDataset : localDatasets) {
if (localDataset == null) {
continue;
@@ -2242,7 +2241,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do

// mark datasets present in Data Availability section(s)
if (CollectionUtils.isNotEmpty(availabilitySequences)) {
List<LayoutToken> availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).toList();
List<LayoutToken> availabilityTokens = availabilitySequences.stream().flatMap(as -> as.getTokens().stream()).collect(Collectors.toList());
entities = markDAS(entities, availabilityTokens);
}

57 changes: 29 additions & 28 deletions src/main/java/org/grobid/core/utilities/XMLUtilities.java
@@ -92,7 +92,7 @@ public static Element getFirstDirectChild(Element parent, String name) {

public static Element getLastDirectChild(Element parent, String name) {
NodeList children = parent.getChildNodes();
for(int j=children.getLength()-1; j>0; j--) {
for(int j=children.getLength()-1; j>=0; j--) {
Node child = children.item(j);
if (child instanceof Element && name.equals(child.getNodeName()))
return (Element) child;
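
The previous bound j > 0 never examined index 0, so a matching element that happened to be the first child was missed when no later sibling matched. A quick standalone check (assumes XMLUtilities is on the classpath):

import javax.xml.parsers.DocumentBuilderFactory;
import org.grobid.core.utilities.XMLUtilities;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

public class LastChildCheck {
    public static void main(String[] args) throws Exception {
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().newDocument();
        Element parent = doc.createElement("div");
        parent.appendChild(doc.createElement("p")); // the only child sits at index 0
        // with j > 0 this returned null; with j >= 0 the element is found
        System.out.println(XMLUtilities.getLastDirectChild(parent, "p"));
    }
}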
@@ -114,13 +114,13 @@ public static String getText(Element element) {
return found ? buf.toString() : null;
}

public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Element biblStructElement) {
public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.dom.Element biblStructElement) {
BiblStructSaxHandler handler = new BiblStructSaxHandler();
String teiXML = null;
try {
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser p = spf.newSAXParser();
teiXML = serialize(null, biblStructElement);
teiXML = serialize(doc, biblStructElement);
p.parse(new InputSource(new StringReader(teiXML)), handler);
} catch(Exception e) {
if (teiXML != null)
@@ -192,30 +192,27 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
}

// get the ref marker text
NodeList list2 = node.getChildNodes();
for (int j = 0; j < list2.getLength(); j++) {
Node subChildNode = list2.item(j);
if (subChildNode.getNodeType() == Node.TEXT_NODE) {
String chunk = normalize(getTextRecursively(node));

if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) {
Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE);
right.put(StringUtils.strip(chunk), refInfo);
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
} else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) {
org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE);
right.put(StringUtils.strip(chunk), urlInfo);
// we still add it like normal text
buf.append(chunk);
found = true;
} else {
// other refs are filtered out
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
}
indexPos += chunk.length();
String refFullText = normalize(getTextRecursively(node));
if (refFullText != null && !refFullText.isEmpty()) {
String chunk = refFullText;

if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) {
Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE);
right.put(StringUtils.strip(chunk), refInfo);
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
} else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) {
org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE);
right.put(StringUtils.strip(chunk), urlInfo);
// we still add it like normal text
buf.append(chunk);
found = true;
} else {
// other refs are filtered out
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
}
indexPos += chunk.length();
}
} else {
// get the text recursively
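
The rewrite above replaces a loop that only fired on direct TEXT_NODE children, which produced no text at all for a callout whose content is nested inside child elements. Gathering the text once per ref via getTextRecursively handles such nesting. An illustrative TEI fragment (assumed markup, not taken from this diff):

<!-- a nested biblio callout: the old direct-child loop saw no TEXT_NODE here -->
<ref type="bibr" target="#b1">[<hi rend="bold">1</hi>]</ref>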
@@ -271,7 +268,7 @@ public static String serialize(org.w3c.dom.Document doc, Node node) {
XPathExpression xpathExp = xpathFactory.newXPath().compile(
"//text()[normalize-space(.) = '']");
NodeList emptyTextNodes = (NodeList)
xpathExp.evaluate(doc, XPathConstants.NODESET);
xpathExp.evaluate(node, XPathConstants.NODESET);

// Remove each empty text node from document.
for (int i = 0; i < emptyTextNodes.getLength(); i++) {
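
Evaluating the XPath against node rather than doc matters because serialize was also called with a null document (see parseTEIBiblioItem before this PR), and a null evaluation context throws. For an attached node the selection is unchanged, since //text() is an absolute path resolved from the node's owning document. A standalone illustration:

import java.io.StringReader;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public class XPathContextDemo {
    public static void main(String[] args) throws Exception {
        Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                .parse(new InputSource(new StringReader("<a> <b>x</b> </a>")));
        XPathExpression exp = XPathFactory.newInstance().newXPath()
                .compile("//text()[normalize-space(.) = '']");
        // evaluating against the element works even with no Document reference
        // at hand; a null context would throw instead
        NodeList empty = (NodeList) exp.evaluate(doc.getDocumentElement(), XPathConstants.NODESET);
        System.out.println(empty.getLength()); // 2 whitespace-only text nodes
    }
}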
@@ -470,7 +467,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
(textualElements.contains(n.getNodeName())) ) {
// text content
//String text = n.getTextContent();
StringBuffer textBuffer = new StringBuffer();
StringBuilder textBuffer = new StringBuilder();
NodeList childNodes = n.getChildNodes();
for(int y=0; y<childNodes.getLength(); y++) {
textBuffer.append(serialize(doc, childNodes.item(y)));
@@ -487,6 +484,10 @@
LOGGER.warn("The sentence segmentation failed for: " + text);
}

if (theSentenceBoundaries == null || theSentenceBoundaries.isEmpty()) {
continue;
}

// we're making a first pass to ensure that there is no element broken by the segmentation
List<String> sentences = new ArrayList<String>();
List<String> toConcatenate = new ArrayList<String>();
@@ -274,7 +274,7 @@ public static Response processDatasetPDF(final InputStream inputStream,
if (!isResultOK(retValString)) {
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
}
} catch (Exception exp) {
LOGGER.error("An unexpected exception occurs. ", exp);
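
Declaring the payload as application/json (here and in the two handlers below) lets clients dispatch on the Content-Type header instead of sniffing the body. A hypothetical client-side check, with host, port, and path assumed rather than taken from this PR:

import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

public class ContentTypeCheck {
    public static void main(String[] args) {
        Client client = ClientBuilder.newClient();
        // endpoint path is a placeholder; substitute the real service route
        Response resp = client.target("http://localhost:8060")
                .path("service/processDatasetPDF")
                .request(MediaType.APPLICATION_JSON)
                .get();
        // with this change the declared media type matches the JSON body
        System.out.println(MediaType.APPLICATION_JSON_TYPE.isCompatible(resp.getMediaType()));
        client.close();
    }
}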
@@ -366,12 +366,7 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean
if (!isResultOK(retValString)) {
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
/*response = Response
.ok()
.type("application/json")
.entity(retValString)
.build();*/
response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
}
}

@@ -382,7 +377,8 @@ public static Response processDatasetJATS(final InputStream inputStream, Boolean
LOGGER.error("An unexpected exception occurs. ", exp);
response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
} finally {
IOUtilities.removeTempFile(originFile);
if (originFile != null)
IOUtilities.removeTempFile(originFile);
}
LOGGER.debug(methodLogOut());
return response;
@@ -467,7 +463,7 @@ public static Response processDatasetTEI(
if (!isResultOK(retValString)) {
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK).entity(retValString).type(MediaType.TEXT_PLAIN).build();
response = Response.status(Status.OK).entity(retValString).type(MediaType.APPLICATION_JSON).build();
}
}

@@ -478,7 +474,8 @@ public static Response processDatasetTEI(
LOGGER.error("An unexpected exception occurs. ", exp);
response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
} finally {
IOUtilities.removeTempFile(originFile);
if (originFile != null)
IOUtilities.removeTempFile(originFile);
}
LOGGER.debug(methodLogOut());
return response;