diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/XWPFEventBasedWordExtractor.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/XWPFEventBasedWordExtractor.java new file mode 100644 index 00000000000..a7b48d2f40b --- /dev/null +++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/XWPFEventBasedWordExtractor.java @@ -0,0 +1,368 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.xwpf.extractor; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.logging.log4j.Logger; +import org.apache.poi.logging.PoiLogManager; +import org.apache.poi.ooxml.POIXMLDocument; +import org.apache.poi.ooxml.POIXMLProperties; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; +import org.apache.poi.xwpf.extractor.internal.OOXMLWordAndPowerPointTextHandler; +import org.apache.poi.xwpf.extractor.internal.ParagraphProperties; +import org.apache.poi.xwpf.extractor.internal.RunProperties; +import org.apache.poi.xwpf.extractor.internal.XMLReaderUtils; +import org.apache.poi.xwpf.extractor.internal.XWPFListManager; +import org.apache.poi.xwpf.usermodel.XWPFNumbering; +import org.apache.poi.xwpf.usermodel.XWPFRelation; +import org.apache.xmlbeans.XmlException; +import org.xml.sax.SAXException; + +/** + * Experimental class that is based on POI's XSSFEventBasedExcelExtractor + *
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @since POI 5.4.2
+ */
+public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
+
+    private static final Logger LOG = PoiLogManager.getLogger(XWPFEventBasedWordExtractor.class);
+
+    private final OPCPackage container;
+    private final POIXMLProperties properties;
+
+    /**
+     * Creates an extractor that reads from the given package and loads the
+     * package properties right away.
+     *
+     * @param container package to extract text from
+     * @throws XmlException       if loading the package properties fails
+     * @throws OpenXML4JException if loading the package properties fails
+     * @throws IOException        if loading the package properties fails
+     */
+    public XWPFEventBasedWordExtractor(OPCPackage container)
+            throws XmlException, OpenXML4JException, IOException {
+        this.container = container;
+        this.properties = new POIXMLProperties(container);
+    }
+
+    /**
+     * @return the package handed to the constructor
+     */
+    public OPCPackage getPackage() {
+        return this.container;
+    }
+
+    /**
+     * @return the core properties loaded from the package
+     */
+    public POIXMLProperties.CoreProperties getCoreProperties() {
+        return this.properties.getCoreProperties();
+    }
+
+    /**
+     * @return the extended properties loaded from the package
+     */
+    public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+        return this.properties.getExtendedProperties();
+    }
+
+    /**
+     * @return the custom properties loaded from the package
+     */
+    public POIXMLProperties.CustomProperties getCustomProperties() {
+        return this.properties.getCustomProperties();
+    }
+
+    /**
+     * @return always {@code null}; this extractor parses the parts with SAX
+     * and never builds a {@link POIXMLDocument}
+     */
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
+
+    /**
+     * Extracts the text of every main document part followed by the text of
+     * every glossary document part. A part that fails with an
+     * {@link IOException} or {@link SAXException} is logged and skipped, so
+     * the result may be partial.
+     *
+     * @return the extracted text, never {@code null}
+     */
+    @Override
+    public String getText() {
+        StringBuilder sb = new StringBuilder();
+        //handle main document
+        appendParts(XWPFRelation.DOCUMENT.getContentType(), "document", sb);
+        //handle glossary document
+        appendParts(XWPFRelation.GLOSSARY_DOCUMENT.getContentType(), "glossary document", sb);
+        return sb.toString();
+    }
+
+    /**
+     * Appends the text of every part with the given content type.
+     *
+     * @param contentType content type of the parts to extract
+     * @param description name of the kind of part, used in the log messages only
+     * @param sb          buffer that receives the text
+     */
+    private void appendParts(String contentType, String description, StringBuilder sb) {
+        List<PackagePart> pps = container.getPartsByContentType(contentType);
+        if (pps == null) {
+            return;
+        }
+        for (PackagePart pp : pps) {
+            //likely only one, but why not...
+            try {
+                handleDocumentPart(pp, sb);
+            } catch (IOException e) {
+                //best effort: log and carry on with the next part
+                LOG.warn("IOException handling " + description + " part", e);
+            } catch (SAXException e) {
+                //best effort: log and carry on with the next part
+                LOG.warn("SAXException handling " + description + " part", e);
+            }
+        }
+    }
+
+    /**
+     * This implementation ignores the flag.
+     */
+    @Override
+    public void setCloseFilesystem(boolean b) {
+        //no-op
+    }
+
+    /**
+     * @return always {@code false}
+     */
+    @Override
+    public boolean isCloseFilesystem() {
+        return false;
+    }
+
+    /**
+     * @return always {@code null}
+     */
+    @Override
+    public Closeable getFilesystem() {
+        return null;
+    }
+
+
+    /**
+     * Appends the text of one document part: its headers first, then the part
+     * itself, then its footnotes, comments, footers and endnotes.
+     *
+     * @param documentPart main or glossary document part
+     * @param sb           buffer that receives the text
+     * @throws IOException  if a part cannot be read
+     * @throws SAXException if a part cannot be parsed
+     */
+    private void handleDocumentPart(PackagePart documentPart, StringBuilder sb)
+            throws IOException, SAXException {
+        //load the numbering/list manager and styles from the main document part
+        XWPFNumbering numbering = loadNumbering(documentPart);
+        XWPFListManager xwpfListManager = new XWPFListManager(numbering);
+        //TODO: XWPFStyles styles = loadStyles(documentPart);
+
+        //headers
+        handleRelatedParts(documentPart, XWPFRelation.HEADER, xwpfListManager, sb);
+
+        //main document
+        handlePart(documentPart, xwpfListManager, sb);
+
+        //for now, just dump other components at end
+        for (XWPFRelation rel : new XWPFRelation[]{XWPFRelation.FOOTNOTE, XWPFRelation.COMMENT,
+                XWPFRelation.FOOTER, XWPFRelation.ENDNOTE}) {
+            handleRelatedParts(documentPart, rel, xwpfListManager, sb);
+        }
+    }
+
+    /**
+     * Appends the text of every part that the document part references through
+     * the given relation. An {@link InvalidFormatException} is logged and ends
+     * the processing of this relation only.
+     */
+    private void handleRelatedParts(PackagePart documentPart, XWPFRelation relation,
+                                    XWPFListManager xwpfListManager, StringBuilder sb)
+            throws IOException, SAXException {
+        try {
+            PackageRelationshipCollection prc =
+                    documentPart.getRelationshipsByType(relation.getRelation());
+            if (prc == null) {
+                return;
+            }
+            for (int i = 0; i < prc.size(); i++) {
+                PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+                handlePart(packagePart, xwpfListManager, sb);
+            }
+        } catch (InvalidFormatException e) {
+            LOG.warn("Invalid format", e);
+        }
+    }
+
+    /**
+     * Parses one part with SAX and appends its text to the buffer.
+     *
+     * @param packagePart     part to parse
+     * @param xwpfListManager list manager of the owning document; not used by this method yet
+     * @param buffer          buffer that receives the text
+     */
+    private void handlePart(PackagePart packagePart, XWPFListManager xwpfListManager,
+                            StringBuilder buffer) throws IOException, SAXException {
+
+        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+        try (InputStream stream = packagePart.getInputStream()) {
+            //the stream is closed by the try-with-resources, not by the parser
+            XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream),
+                    new OOXMLWordAndPowerPointTextHandler(new XWPFToTextContentHandler(buffer),
+                            hyperlinks));
+        }
+
+    }
+
+    /**
+     * Collects the hyperlink relationships of a part.
+     *
+     * @return map of relationship id to target URI; empty if there are none
+     * or if the relationships cannot be read
+     */
+    private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
+        Map<String, String> hyperlinks = new HashMap<>();
+        try {
+            PackageRelationshipCollection prc =
+                    bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+            if (prc == null) {
+                return hyperlinks;
+            }
+            for (int i = 0; i < prc.size(); i++) {
+                PackageRelationship pr = prc.getRelationship(i);
+                if (pr == null) {
+                    continue;
+                }
+                String id = pr.getId();
+                String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+                if (id != null && url != null) {
+                    hyperlinks.put(id, url);
+                }
+            }
+        } catch (InvalidFormatException e) {
+            LOG.warn("Invalid format", e);
+        }
+        return hyperlinks;
+    }
+
+    /**
+     * Loads the numbering definitions referenced by a part.
+     *
+     * @return the numbering of the first numbering relationship, or
+     * {@code null} if there is none or it cannot be loaded
+     */
+    private XWPFNumbering loadNumbering(PackagePart packagePart) {
+        try {
+            PackageRelationshipCollection numberingParts =
+                    packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
+            if (numberingParts == null || numberingParts.isEmpty()) {
+                return null;
+            }
+            PackageRelationship numberingRelationShip = numberingParts.getRelationship(0);
+            if (numberingRelationShip == null) {
+                return null;
+            }
+            PackagePart numberingPart = container.getPart(numberingRelationShip);
+            if (numberingPart == null) {
+                return null;
+            }
+            return new XWPFNumbering(numberingPart);
+        } catch (OpenXML4JException e) {
+            LOG.warn("Couldn't load numbering", e);
+        }
+        return null;
+    }
+
+    /**
+     * Body handler that writes plain text: run contents are appended as they
+     * come, paragraphs, table rows and SDTs end with a newline, table cells
+     * end with a tab. All other events are ignored.
+     */
+    private static class XWPFToTextContentHandler
+            implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+        private final StringBuilder buffer;
+
+        public XWPFToTextContentHandler(StringBuilder buffer) {
+            this.buffer = buffer;
+        }
+
+        @Override
+        public void run(RunProperties runProperties, String contents) {
+            buffer.append(contents);
+        }
+
+        @Override
+        public void hyperlinkStart(String link) {
+            //no-op
+        }
+
+        @Override
+        public void hyperlinkEnd() {
+            //no-op
+        }
+
+        @Override
+        public void startParagraph(ParagraphProperties paragraphProperties) {
+            //no-op
+        }
+
+        @Override
+        public void endParagraph() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startTable() {
+            //no-op
+        }
+
+        @Override
+        public void endTable() {
+            //no-op
+        }
+
+        @Override
+        public void startTableRow() {
+            //no-op
+        }
+
+        @Override
+        public void endTableRow() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startTableCell() {
+            //no-op
+        }
+
+        @Override
+        public void endTableCell() {
+            buffer.append("\t");
+        }
+
+        @Override
+        public void startSDT() {
+            //no-op
+        }
+
+        @Override
+        public void endSDT() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startEditedSection(String editor, Date date,
+                                       OOXMLWordAndPowerPointTextHandler.EditType editType) {
+            //no-op
+        }
+
+        @Override
+        public void endEditedSection() {
+            //no-op
+        }
+
+        @Override
+        public boolean isIncludeDeletedText() {
+            return true;
+        }
+
+        @Override
+        public void footnoteReference(String id) {
+            //no-op
+        }
+
+        @Override
+        public void endnoteReference(String id) {
+            //no-op
+        }
+
+        @Override
+        public boolean isIncludeMoveFromText() {
+            return false;
+        }
+
+        @Override
+        public void embeddedOLERef(String refId) {
+            //no-op
+        }
+
+        @Override
+        public void embeddedPicRef(String picFileName, String picDescription) {
+            //no-op
+        }
+
+        @Override
+        public void startBookmark(String id, String name) {
+            //no-op
+        }
+
+        @Override
+        public void endBookmark(String id) {
+            //no-op
+        }
+    }
+}
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/AbstractListManager.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/AbstractListManager.java
new file mode 100644
index 00000000000..7b629940af8
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/AbstractListManager.java
@@ -0,0 +1,281 @@
+/* 
==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.poi.util.NumberFormatter; + +/** + *
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @since POI 5.4.2
+ */
+abstract class AbstractListManager {
+    private final static String BULLET = "\u00b7";
+
+    //NOTE(review): the type arguments of these two maps are not visible in this
+    //file; presumably both are keyed by a numbering id -- confirm against the subclasses
+    protected Map listLevelMap =
+            new HashMap<>();
+    protected Map overrideTupleMap = new HashMap<>();
+
+    /**
+     * Keeps the running count of every level of one list and renders the
+     * label of each numbered paragraph. It is docx/doc format agnostic.
+     */
+    protected static class ParagraphLevelCounter {
+
+        //counts can == 0 if the format is decimal, make sure
+        //that flag values are < 0
+        private static final int NOT_SEEN_YET = -1;
+        private static final int FIRST_SKIPPED = -2;
+        //matches the %N placeholders of a level text; compiled once, not per counter
+        static final Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)");
+
+        private final LevelTuple[] levelTuples;
+        //current count per level, or NOT_SEEN_YET
+        private final List<Integer> counts = new ArrayList<>();
+        private int lastLevel = -1;
+
+        /**
+         * @param levelTuples definition of every level, indexed by level number
+         */
+        public ParagraphLevelCounter(LevelTuple[] levelTuples) {
+            this.levelTuples = levelTuples;
+        }
+
+        /**
+         * @return the number of levels this counter was created with
+         */
+        public int getNumberOfLevels() {
+            return levelTuples.length;
+        }
+
+        /**
+         * Apply this to every numbered paragraph in order.
+         *
+         * @param levelNumber         level number that is being incremented
+         * @param overrideLevelTuples level definitions that take precedence
+         *                            over the ones of this counter, or {@code null}
+         * @return the new formatted number string for this level
+         */
+        public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {
+
+            //levels that were skipped on the way down start at their start value
+            for (int i = lastLevel + 1; i < levelNumber; i++) {
+                if (i >= counts.size()) {
+                    int val = getStart(i, overrideLevelTuples);
+                    counts.add(i, val);
+                } else {
+                    int count = counts.get(i);
+                    if (count == NOT_SEEN_YET) {
+                        count = getStart(i, overrideLevelTuples);
+                        counts.set(i, count);
+                    }
+                }
+            }
+
+            if (levelNumber < counts.size()) {
+                resetAfter(levelNumber, overrideLevelTuples);
+                int count = counts.get(levelNumber);
+                if (count == NOT_SEEN_YET) {
+                    count = getStart(levelNumber, overrideLevelTuples);
+                } else {
+                    count++;
+                }
+                counts.set(levelNumber, count);
+                lastLevel = levelNumber;
+                return format(levelNumber, overrideLevelTuples);
+            }
+
+            //first paragraph ever seen on this level
+            counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
+            lastLevel = levelNumber;
+            return format(levelNumber, overrideLevelTuples);
+        }
+
+        /**
+         * @param level which level to format
+         * @return the string that represents the number and the surrounding text for this paragraph;
+         * empty if the level is out of range or has no level text
+         */
+        private String format(int level, LevelTuple[] overrideLevelTuples) {
+            if (level < 0 || level >= levelTuples.length) {
+                //log?
+                return "";
+            }
+            boolean isLegal = (overrideLevelTuples != null) ? overrideLevelTuples[level].isLegal :
+                    levelTuples[level].isLegal;
+            //short circuit bullet
+            String numFmt = getNumFormat(level, isLegal, overrideLevelTuples);
+            if ("bullet".equals(numFmt)) {
+                return BULLET + " ";
+            }
+
+            String lvlText =
+                    (overrideLevelTuples == null || overrideLevelTuples[level].lvlText == null) ?
+                            levelTuples[level].lvlText : overrideLevelTuples[level].lvlText;
+            if (lvlText == null) {
+                //nothing to interpolate; the matcher below would fail on null
+                return "";
+            }
+            StringBuilder sb = new StringBuilder();
+            Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText);
+            int last = 0;
+            while (m.find()) {
+                sb.append(lvlText, last, m.start());
+                int lvlNum = -1;
+                try {
+                    lvlNum = Integer.parseInt(m.group(1));
+                } catch (NumberFormatException ignored) {
+                    //digits that do not fit an int: keep -1, which formatNum
+                    //treats as an unknown level
+                }
+                //need to subtract 1 because, e.g. %1 is the format
+                //for the number at array offset 0
+                sb.append(formatNum(lvlNum - 1, isLegal, overrideLevelTuples));
+                last = m.end();
+            }
+            sb.append(lvlText, last, lvlText.length());
+            if (sb.length() > 0) {
+                //TODO: add in character after number
+                sb.append(" ");
+            }
+            return sb.toString();
+        }
+
+        //actual level number; can return empty string if numberformatter fails
+        private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+
+            int numFmtStyle = 0;
+            String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);
+
+            int count = getCount(lvlNum);
+            if (count < 0) {
+                //flag value, i.e. the level has no count yet
+                count = 1;
+            }
+            if ("lowerLetter".equals(numFmt)) {
+                numFmtStyle = 4;
+            } else if ("lowerRoman".equals(numFmt)) {
+                numFmtStyle = 2;
+            } else if ("decimal".equals(numFmt)) {
+                numFmtStyle = 0;
+            } else if ("upperLetter".equals(numFmt)) {
+                numFmtStyle = 3;
+            } else if ("upperRoman".equals(numFmt)) {
+                numFmtStyle = 1;
+            } else if ("bullet".equals(numFmt)) {
+                return "";
+                //not yet handled by NumberFormatter...TODO: add to NumberFormatter?
+            } else if ("ordinal".equals(numFmt)) {
+                return ordinalize(count);
+            } else if ("decimalZero".equals(numFmt)) {
+                //only single digit numbers get the leading zero: 01 ... 09, 10, 11
+                String decimal = NumberFormatter.getNumber(count, 0);
+                return (count < 10) ? "0" + decimal : decimal;
+            } else if ("none".equals(numFmt)) {
+                return "";
+            }
+            try {
+                return NumberFormatter.getNumber(count, numFmtStyle);
+            } catch (IllegalArgumentException e) {
+                return "";
+            }
+        }
+
+        /**
+         * @return the English ordinal of the count, e.g. 1st, 2nd, 3rd, 4th, 11th, 21st
+         */
+        private String ordinalize(int count) {
+            //this is only good for locale == English
+            String countString = Integer.toString(count);
+            int lastTwoDigits = Math.abs(count % 100);
+            if (lastTwoDigits >= 11 && lastTwoDigits <= 13) {
+                //11th, 12th, 13th, 111th, ... are irregular
+                return countString + "th";
+            }
+            switch (Math.abs(count % 10)) {
+                case 1:
+                    return countString + "st";
+                case 2:
+                    return countString + "nd";
+                case 3:
+                    return countString + "rd";
+                default:
+                    return countString + "th";
+            }
+        }
+
+        /**
+         * @return the number format of the level; "decimal" if the level is
+         * out of range or if isLegal is set
+         */
+        private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            if (lvlNum < 0 || lvlNum >= levelTuples.length) {
+                //log?
+                return "decimal";
+            }
+            if (isLegal) {
+                //return decimal no matter the level if isLegal is true
+                return "decimal";
+            }
+            return (overrideLevelTuples == null || overrideLevelTuples[lvlNum].numFmt == null) ?
+                    levelTuples[lvlNum].numFmt : overrideLevelTuples[lvlNum].numFmt;
+        }
+
+        /**
+         * @return the stored count of the level (which may be a negative flag
+         * value), or 1 if the level has no entry
+         */
+        private int getCount(int lvlNum) {
+            if (lvlNum < 0 || lvlNum >= counts.size()) {
+                //log?
+                return 1;
+            }
+            return counts.get(lvlNum);
+        }
+
+        /**
+         * Marks the levels below the given level as not seen yet, subject to
+         * the restart setting of each level.
+         */
+        private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
+            for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size();
+                    levelNumber++) {
+                int cnt = counts.get(levelNumber);
+                if (cnt == NOT_SEEN_YET) {
+                    //do nothing
+                } else if (cnt == FIRST_SKIPPED) {
+                    //do nothing
+                } else if (levelTuples.length > levelNumber) {
+                    //never reset if restarts == 0
+                    int restart = (overrideLevelTuples == null ||
+                            overrideLevelTuples[levelNumber].restart < 0) ?
+                            levelTuples[levelNumber].restart :
+                            overrideLevelTuples[levelNumber].restart;
+                    if (restart == 0) {
+                        //NOTE(review): this also leaves every deeper level
+                        //untouched -- confirm that "continue" was not intended
+                        return;
+                    } else if (restart == -1 || startlevelNumber <= restart - 1) {
+                        counts.set(levelNumber, NOT_SEEN_YET);
+                    } else {
+                        //do nothing/don't reset
+                    }
+                } else {
+                    //reset!
+                    counts.set(levelNumber, NOT_SEEN_YET);
+                }
+            }
+        }
+
+        /**
+         * @return the start value of the level: the override if it has a
+         * non-negative one, else the value of this counter; 1 if the level
+         * is unknown
+         */
+        private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
+            if (levelNumber >= levelTuples.length) {
+                return 1;
+            } else {
+                return (overrideLevelTuples == null || overrideLevelTuples[levelNumber].start < 0) ?
+                        levelTuples[levelNumber].start : overrideLevelTuples[levelNumber].start;
+            }
+        }
+    }
+
+    /**
+     * Immutable definition of one list level: start value, restart setting,
+     * level text, number format and the isLegal flag.
+     */
+    protected static class LevelTuple {
+        private final int start;
+        private final int restart;
+        private final String lvlText;
+        private final String numFmt;
+        private final boolean isLegal;
+
+        /**
+         * Creates a decimal level that starts at 1, with restart -1 and isLegal off.
+         */
+        public LevelTuple(String lvlText) {
+            this.lvlText = lvlText;
+            start = 1;
+            restart = -1;
+            numFmt = "decimal";
+            isLegal = false;
+        }
+
+        public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
+            this.start = start;
+            this.restart = restart;
+            this.lvlText = lvlText;
+            this.numFmt = numFmt;
+            this.isLegal = isLegal;
+        }
+    }
+}
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ContentHandlerDecorator.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ContentHandlerDecorator.java
new file mode 100644
index 00000000000..3a1c9eb72d1
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ContentHandlerDecorator.java
@@ -0,0 +1,224 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.ErrorHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Decorator base class for the {@link ContentHandler} interface. This class + * simply delegates all SAX events calls to an underlying decorated handler + * instance. Subclasses can provide extra decoration by overriding one or more + * of the SAX event methods. + *
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @since POI 5.4.2
+ */
+public class ContentHandlerDecorator extends DefaultHandler {
+
+    /**
+     * The handler that receives every forwarded SAX event.
+     */
+    private ContentHandler handler;
+
+    /**
+     * Wraps the given SAX event handler.
+     *
+     * @param handler the handler that all events are forwarded to
+     */
+    public ContentHandlerDecorator(ContentHandler handler) {
+        assert handler != null;
+        this.handler = handler;
+    }
+
+    /**
+     * Wraps a plain {@link DefaultHandler}, i.e. a handler that ignores every
+     * event. Subclasses are expected to install the real target later with
+     * {@link #setContentHandler(ContentHandler)}.
+     */
+    protected ContentHandlerDecorator() {
+        this(new DefaultHandler());
+    }
+
+    /**
+     * Replaces the wrapped handler. Events that arrive after this call go to
+     * the new handler.
+     *
+     * @param handler the new target of the forwarded events
+     */
+    protected void setContentHandler(ContentHandler handler) {
+        assert handler != null;
+        this.handler = handler;
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        try {
+            handler.startPrefixMapping(prefix, uri);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+        try {
+            handler.endPrefixMapping(prefix);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void processingInstruction(String target, String data) throws SAXException {
+        try {
+            handler.processingInstruction(target, data);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void setDocumentLocator(Locator locator) {
+        handler.setDocumentLocator(locator);
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        try {
+            handler.startDocument();
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        try {
+            handler.endDocument();
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String name, Attributes atts)
+            throws SAXException {
+        try {
+            handler.startElement(uri, localName, name, atts);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name) throws SAXException {
+        try {
+            handler.endElement(uri, localName, name);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        try {
+            handler.characters(ch, start, length);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        try {
+            handler.ignorableWhitespace(ch, start, length);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    @Override
+    public void skippedEntity(String name) throws SAXException {
+        try {
+            handler.skippedEntity(name);
+        } catch (SAXException failure) {
+            handleException(failure);
+        }
+    }
+
+    /**
+     * @return the string form of the wrapped handler
+     */
+    @Override
+    public String toString() {
+        return handler.toString();
+    }
+
+    /**
+     * Single hook for every {@link SAXException} thrown by the wrapped
+     * handler in the forwarding methods above. Subclasses may override it to
+     * handle the failure differently.
+     *
+     * If the wrapped handler is itself a ContentHandlerDecorator, the
+     * exception is passed on to its own handleException method; otherwise it
+     * is thrown again unchanged.
+     *
+     * @param exception the exception that was thrown
+     * @throws SAXException the exception (if any) thrown to the client
+     */
+    protected void handleException(SAXException exception) throws SAXException {
+        if (!(handler instanceof ContentHandlerDecorator)) {
+            throw exception;
+        }
+        ((ContentHandlerDecorator) handler).handleException(exception);
+    }
+
+    /**
+     * Passes the warning to the wrapped handler if it is an
+     * {@link ErrorHandler}, otherwise to the superclass.
+     */
+    @Override
+    public void warning(SAXParseException exception) throws SAXException {
+        if (!(handler instanceof ErrorHandler)) {
+            super.warning(exception);
+            return;
+        }
+        ((ErrorHandler) handler).warning(exception);
+    }
+
+    /**
+     * Passes the error to the wrapped handler if it is an
+     * {@link ErrorHandler}, otherwise to the superclass.
+     */
+    @Override
+    public void error(SAXParseException exception) throws SAXException {
+        if (!(handler instanceof ErrorHandler)) {
+            super.error(exception);
+            return;
+        }
+        ((ErrorHandler) handler).error(exception);
+    }
+
+    /**
+     * Passes the fatal error to the wrapped handler if it is an
+     * {@link ErrorHandler}, otherwise to the superclass.
+     */
+    @Override
+    public void fatalError(SAXParseException exception)
+            throws SAXException {
+        if (!(handler instanceof ErrorHandler)) {
+            super.fatalError(exception);
+            return;
+        }
+        ((ErrorHandler) handler).fatalError(exception);
+    }
+}
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OOXMLWordAndPowerPointTextHandler.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OOXMLWordAndPowerPointTextHandler.java
new file mode 100644
index 00000000000..3b3e21beaee
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OOXMLWordAndPowerPointTextHandler.java
@@ -0,0 +1,634 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +import java.text.DateFormat; +import java.text.DateFormatSymbols; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.TimeZone; + +import org.apache.poi.xwpf.usermodel.UnderlinePatterns; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * This class is intended to handle anything that might contain IBodyElements: + * main document, headers, footers, notes, slides, etc. + *
+ * <p>
+ * This class does not generally check for namespaces, and it can be applied
+ * to PPTX and DOCX for text extraction.
+ * <p>
+ * This can be used to scrape content from charts. It currently ignores
+ * formula ({@code <c:f/>}) elements.
+ * <p>
+ * This does not work with .xlsx or .vsdx.
+ * <p>
+ * This is copied from Apache Tika.
+ * + * @since POI 5.4.2 + */ + +public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { + + public final static String W_NS = + "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + private final static String R = "r"; + private final static String FLD = "fld"; + private final static String RPR = "rPr"; + private final static String P = "p"; + private final static String P_STYLE = "pStyle"; + private final static String PPR = "pPr"; + private final static String T = "t"; + private final static String TAB = "tab"; + private final static String B = "b"; + private final static String ILVL = "ilvl"; + private final static String NUM_ID = "numId"; + private final static String TC = "tc"; + private final static String TR = "tr"; + private final static String I = "i"; + private final static String U = "u"; + private final static String STRIKE = "strike"; + private final static String NUM_PR = "numPr"; + private final static String BR = "br"; + private final static String HYPERLINK = "hyperlink"; + private final static String HLINK_CLICK = "hlinkClick"; //pptx hlink + private final static String TBL = "tbl"; + private final static String PIC = "pic"; + private final static String PICT = "pict"; + private final static String IMAGEDATA = "imagedata"; + private final static String BLIP = "blip"; + private final static String CHOICE = "Choice"; + private final static String FALLBACK = "Fallback"; + private final static String OLE_OBJECT = "OLEObject"; + private final static String CR = "cr"; + private final static String V = "v"; + private final static String RUBY = "ruby"; //phonetic section + private final static String RT = "rt"; //phonetic run + private static final String VAL = "val"; + private final static String MC_NS = + "http://schemas.openxmlformats.org/markup-compatibility/2006"; + private final static String O_NS = "urn:schemas-microsoft-com:office:office"; + private final static String PIC_NS = 
"http://schemas.openxmlformats.org/drawingml/2006/picture"; + private final static String DRAWING_MAIN_NS = + "http://schemas.openxmlformats.org/drawingml/2006/main"; + private final static String V_NS = "urn:schemas-microsoft-com:vml"; + private final static String C_NS = "http://schemas.openxmlformats.org/drawingml/2006/chart"; + private final static String OFFICE_DOC_RELATIONSHIP_NS = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships"; + private final static char[] TAB_CHAR = new char[]{'\t'}; + private final static char NEWLINE = '\n'; + private final static String BOOKMARK_START = "bookmarkStart"; + private final static String BOOKMARK_END = "bookmarkEnd"; + private final static String FOOTNOTE_REFERENCE = "footnoteReference"; + private final static String INS = "ins"; + private final static String DEL = "del"; + private final static String DEL_TEXT = "delText"; + private final static String MOVE_FROM = "moveFrom"; + private final static String MOVE_TO = "moveTo"; + private final static String ENDNOTE_REFERENCE = "endnoteReference"; + private static final String TEXTBOX = "textbox"; + private static final TimeZone UTC = TimeZone.getTimeZone("UTC"); + private static final TimeZone MIDDAY = TimeZone.getTimeZone("GMT-12:00"); + + private final XWPFBodyContentsHandler bodyContentsHandler; + private final Map linkedRelationships; + private final RunProperties currRunProperties = new RunProperties(); + private final ParagraphProperties currPProperties = new ParagraphProperties(); + private final boolean includeTextBox; + private final boolean concatenatePhoneticRuns; + private final StringBuilder runBuffer = new StringBuilder(); + private final StringBuilder rubyBuffer = new StringBuilder(); + private boolean inR = false; + //in run or in field. 
TODO: convert this to an integer because you can have a run within a run + private boolean inT = false; + private boolean inRPr = false; + private boolean inNumPr = false; + private boolean inRt = false; + private boolean inPic = false; + private boolean inPict = false; + private String picDescription = null; + private String picRId = null; + private String picFilename = null; + //mechanism used to determine when to + //signal the start of the p, and still + //handle p with pPr and those without + private boolean lastStartElementWasP = false; + //have we signaled the start of a p? + //pPr can happen multiple times within a p + //

text

+ private boolean pStarted = false; + //alternate content can be embedded in itself. + //need to track depth. + //if in alternate, choose fallback, maybe make this configurable? + private int inACChoiceDepth = 0; + private int inACFallbackDepth = 0; + private boolean inDelText = false; + //buffers rt in ruby sections (see 17.3.3.25) + private boolean inHlinkClick = false; + private boolean inTextBox = false; + private boolean inV = false; //in c:v in chart file + private OOXMLWordAndPowerPointTextHandler.EditType editType = + OOXMLWordAndPowerPointTextHandler.EditType.NONE; + private final List dateFormats; + + public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, + Map hyperlinks) { + this(bodyContentsHandler, hyperlinks, true, true); + } + + public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, + Map hyperlinks, boolean includeTextBox, + boolean concatenatePhoneticRuns) { + this.bodyContentsHandler = bodyContentsHandler; + this.linkedRelationships = hyperlinks; + this.includeTextBox = includeTextBox; + this.concatenatePhoneticRuns = concatenatePhoneticRuns; + this.dateFormats = loadDateFormats(); + } + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void endDocument() throws SAXException { + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd + + if (lastStartElementWasP && !PPR.equals(localName)) { + bodyContentsHandler.startParagraph(currPProperties); + } + + lastStartElementWasP = false; + + if (uri != null && uri.equals(MC_NS)) { + if (CHOICE.equals(localName)) { + inACChoiceDepth++; + } else if 
(FALLBACK.equals(localName)) { + inACFallbackDepth++; + } + } + + if (inACChoiceDepth > 0) { + return; + } + + if (!includeTextBox && localName.equals(TEXTBOX)) { + inTextBox = true; + return; + } + //these are sorted descending by frequency within docx files + //in our regression corpus. + //yes, I know, likely premature optimization... + if (RPR.equals(localName)) { + inRPr = true; + } else if (R.equals(localName)) { + inR = true; + } else if (T.equals(localName)) { + inT = true; + } else if (TAB.equals(localName)) { + runBuffer.append(TAB_CHAR); + } else if (P.equals(localName)) { + lastStartElementWasP = true; + } else if (B.equals(localName)) { //TODO: add bCs + if (inR && inRPr) { + currRunProperties.setBold(true); + } + } else if (TC.equals(localName)) { + bodyContentsHandler.startTableCell(); + } else if (P_STYLE.equals(localName)) { + String styleId = atts.getValue(W_NS, "val"); + currPProperties.setStyleID(styleId); + } else if (I.equals(localName)) { //TODO: add iCs + //rprs don't have to be inR; ignore those that aren't + if (inR && inRPr) { + currRunProperties.setItalics(true); + } + } else if (STRIKE.equals(localName)) { + if (inR && inRPr) { + currRunProperties.setStrike(true); + } + } else if (U.equals(localName)) { + if (inR && inRPr) { + currRunProperties.setUnderline(getStringVal(atts)); + } + } else if (TR.equals(localName)) { + bodyContentsHandler.startTableRow(); + } else if (NUM_PR.equals(localName)) { + inNumPr = true; + } else if (ILVL.equals(localName)) { + if (inNumPr) { + currPProperties.setIlvl(getIntVal(atts)); + } + } else if (NUM_ID.equals(localName)) { + if (inNumPr) { + currPProperties.setNumId(getIntVal(atts)); + } + } else if (BR.equals(localName)) { + runBuffer.append(NEWLINE); + } else if (BOOKMARK_START.equals(localName)) { + String name = atts.getValue(W_NS, "name"); + String id = atts.getValue(W_NS, "id"); + bodyContentsHandler.startBookmark(id, name); + } else if (BOOKMARK_END.equals(localName)) { + String id = 
atts.getValue(W_NS, "id"); + bodyContentsHandler.endBookmark(id); + } else if (HYPERLINK.equals(localName)) { //docx hyperlink + String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id"); + String hyperlink = null; + if (hyperlinkId != null) { + hyperlink = linkedRelationships.get(hyperlinkId); + bodyContentsHandler.hyperlinkStart(hyperlink); + } else { + String anchor = atts.getValue(W_NS, "anchor"); + if (anchor != null) { + anchor = "#" + anchor; + } + bodyContentsHandler.hyperlinkStart(anchor); + } + } else if (HLINK_CLICK.equals(localName)) { //pptx hyperlink + String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id"); + String hyperlink = null; + if (hyperlinkId != null) { + hyperlink = linkedRelationships.get(hyperlinkId); + bodyContentsHandler.hyperlinkStart(hyperlink); + inHlinkClick = true; + } + } else if (TBL.equals(localName)) { + bodyContentsHandler.startTable(); + } else if (BLIP.equals(localName)) { //check for DRAWING_NS + picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed"); + } else if ("cNvPr".equals(localName)) { //check for PIC_NS? + picDescription = atts.getValue("", "descr"); + } else if (PIC.equals(localName)) { + inPic = true; //check for PIC_NS? 
+ } //TODO: add sdt, sdtPr, sdtContent goes here statistically + else if (FOOTNOTE_REFERENCE.equals(localName)) { + String id = atts.getValue(W_NS, "id"); + bodyContentsHandler.footnoteReference(id); + } else if (IMAGEDATA.equals(localName)) { + picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id"); + picDescription = atts.getValue(O_NS, "title"); + } else if (INS.equals(localName)) { + startEditedSection(editType.INSERT, atts); + } else if (DEL_TEXT.equals(localName)) { + inDelText = true; + } else if (DEL.equals(localName)) { + startEditedSection(editType.DELETE, atts); + } else if (MOVE_TO.equals(localName)) { + startEditedSection(EditType.MOVE_TO, atts); + } else if (MOVE_FROM.equals(localName)) { + startEditedSection(editType.MOVE_FROM, atts); + } else if (OLE_OBJECT.equals(localName)) { //check for O_NS? + String type = null; + String refId = null; + //TODO: clean this up and ...want to get ProgID? + for (int i = 0; i < atts.getLength(); i++) { + String attLocalName = atts.getLocalName(i); + String attValue = atts.getValue(i); + if (attLocalName.equals("Type")) { + type = attValue; + } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && + attLocalName.equals("id")) { + refId = attValue; + } + } + if ("Embed".equals(type)) { + bodyContentsHandler.embeddedOLERef(refId); + } + } else if (CR.equals(localName)) { + runBuffer.append(NEWLINE); + } else if (ENDNOTE_REFERENCE.equals(localName)) { + String id = atts.getValue(W_NS, "id"); + bodyContentsHandler.endnoteReference(id); + } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart + inV = true; + } else if (RT.equals(localName)) { + inRt = true; + } + + } + + private void startEditedSection(EditType editType, Attributes atts) throws SAXException { + String editAuthor = atts.getValue(W_NS, "author"); + String editDateString = atts.getValue(W_NS, "date"); + Date editDate = null; + if (editDateString != null) { + editDate = tryToParseDate(editDateString); + } + 
bodyContentsHandler.startEditedSection(editAuthor, editDate, editType); + this.editType = editType; + } + + private String getStringVal(Attributes atts) { + String valString = atts.getValue(W_NS, VAL); + if (valString != null) { + return valString; + } + return ""; + } + + private int getIntVal(Attributes atts) { + String valString = atts.getValue(W_NS, VAL); + if (valString != null) { + try { + return Integer.parseInt(valString); + } catch (NumberFormatException e) { + //swallow + } + } + return -1; + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + + if (CHOICE.equals(localName)) { + inACChoiceDepth--; + } else if (FALLBACK.equals(localName)) { + inACFallbackDepth--; + } + if (inACChoiceDepth > 0) { + return; + } + + if (!includeTextBox && localName.equals(TEXTBOX)) { + inTextBox = false; + return; + } + if (PIC.equals(localName)) { //PIC_NS + handlePict(); + inPic = false; + return; + } else if (RPR.equals(localName)) { + inRPr = false; + } else if (R.equals(localName)) { + handleEndOfRun(); + } else if (T.equals(localName)) { + inT = false; + } else if (PPR.equals(localName)) { + if (!pStarted) { + bodyContentsHandler.startParagraph(currPProperties); + pStarted = true; + } + currPProperties.reset(); + } else if (P.equals(localName)) { + if (runBuffer.length() > 0) { + //

...this will treat that as if it were + //a run...TODO: should we swallow whitespace that doesn't occur in a run? + bodyContentsHandler.run(currRunProperties, runBuffer.toString()); + runBuffer.setLength(0); + } + pStarted = false; + bodyContentsHandler.endParagraph(); + } else if (TC.equals(localName)) { + bodyContentsHandler.endTableCell(); + } else if (TR.equals(localName)) { + bodyContentsHandler.endTableRow(); + } else if (TBL.equals(localName)) { + bodyContentsHandler.endTable(); + } else if (FLD.equals(localName)) { + handleEndOfRun(); + } else if (DEL_TEXT.equals(localName)) { + inDelText = false; + } else if (INS.equals(localName) || DEL.equals(localName) || MOVE_TO.equals(localName) || + MOVE_FROM.equals(localName)) { + editType = EditType.NONE; + } else if (HYPERLINK.equals(localName)) { + bodyContentsHandler.hyperlinkEnd(); + } else if (PICT.equals(localName)) { + handlePict(); + } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart + inV = false; + handleEndOfRun(); + } else if (RT.equals(localName)) { + inRt = false; + } else if (RUBY.equals(localName)) { + handleEndOfRuby(); + } + } + + private void handleEndOfRuby() throws SAXException { + if (rubyBuffer.length() > 0) { + if (concatenatePhoneticRuns) { + bodyContentsHandler.run(currRunProperties, " (" + rubyBuffer.toString() + ")"); + } + rubyBuffer.setLength(0); + } + } + + private void handleEndOfRun() throws SAXException { + bodyContentsHandler.run(currRunProperties, runBuffer.toString()); + if (inHlinkClick) { + bodyContentsHandler.hyperlinkEnd(); + inHlinkClick = false; + } + inR = false; + runBuffer.setLength(0); + currRunProperties.setBold(false); + currRunProperties.setItalics(false); + currRunProperties.setStrike(false); + currRunProperties.setUnderline(UnderlinePatterns.NONE.name()); + } + + private void handlePict() throws SAXException { + String picFileName = null; + if (picRId != null) { + picFileName = linkedRelationships.get(picRId); + } + 
bodyContentsHandler.embeddedPicRef(picFileName, picDescription); + picDescription = null; + picRId = null; + inPic = false; + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + + if (inACChoiceDepth > 0) { + return; + } else if (!includeTextBox && inTextBox) { + return; + } + + if (editType.equals(EditType.MOVE_FROM) && inT) { + if (bodyContentsHandler.isIncludeMoveFromText()) { + appendToBuffer(ch, start, length); + } + } else if (inT) { + appendToBuffer(ch, start, length); + } else if (bodyContentsHandler.isIncludeDeletedText() && editType.equals(EditType.DELETE)) { + appendToBuffer(ch, start, length); + } else if (inV) { + appendToBuffer(ch, start, length); + appendToBuffer(TAB_CHAR, 0, 1); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (inACChoiceDepth > 0) { + return; + } else if (!includeTextBox && inTextBox) { + return; + } + + if (inT) { + appendToBuffer(ch, start, length); + } else if (bodyContentsHandler.isIncludeDeletedText() && inDelText) { + appendToBuffer(ch, start, length); + } + } + + private void appendToBuffer(char[] ch, int start, int length) throws SAXException { + if (inRt) { + rubyBuffer.append(ch, start, length); + } else { + runBuffer.append(ch, start, length); + } + } + + /** + * Tries to parse the date string; returns null if no parse was possible. + *

+ * This is not thread safe! + */ + private Date tryToParseDate(String dateString) { + // Java doesn't like timezones in the form ss+hh:mm + // It only likes the hhmm form, without the colon + int n = dateString.length(); + if (dateString.charAt(n - 3) == ':' && + (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) { + dateString = dateString.substring(0, n - 3) + dateString.substring(n - 2); + } + + for (DateFormat df : dateFormats) { + try { + return df.parse(dateString); + } catch (java.text.ParseException e) { + //swallow + } + } + return null; + } + + private static List loadDateFormats() { + List dateFormats = new ArrayList<>(); + // yyyy-mm-ddThh... + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone + // yyyy-mm-dd hh... + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone + // Date without time, set to Midday UTC + dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format + dateFormats.add(createDateFormat("yyyy:MM:dd", + MIDDAY)); // Image (IPTC/EXIF) format + + return dateFormats; + } + + private static DateFormat createDateFormat(String format, TimeZone timezone) { + final SimpleDateFormat sdf = new SimpleDateFormat(format, new DateFormatSymbols(Locale.US)); + if (timezone != null) { + sdf.setTimeZone(timezone); + } + return sdf; + } + + public enum EditType { + NONE, INSERT, DELETE, MOVE_TO, MOVE_FROM + } + + public interface XWPFBodyContentsHandler { + + void run(RunProperties runProperties, String contents) throws SAXException; + + /** + * @param link the link; can be null + */ + void hyperlinkStart(String link) 
throws SAXException; + + void hyperlinkEnd() throws SAXException; + + void startParagraph(ParagraphProperties paragraphProperties) throws SAXException; + + void endParagraph() throws SAXException; + + void startTable() throws SAXException; + + void endTable() throws SAXException; + + void startTableRow() throws SAXException; + + void endTableRow() throws SAXException; + + void startTableCell() throws SAXException; + + void endTableCell() throws SAXException; + + void startSDT() throws SAXException; + + void endSDT() throws SAXException; + + void startEditedSection(String editor, Date date, EditType editType) throws SAXException; + + void endEditedSection() throws SAXException; + + boolean isIncludeDeletedText() throws SAXException; + + void footnoteReference(String id) throws SAXException; + + void endnoteReference(String id) throws SAXException; + + boolean isIncludeMoveFromText() throws SAXException; + + void embeddedOLERef(String refId) throws SAXException; + + void embeddedPicRef(String picFileName, String picDescription) throws SAXException; + + void startBookmark(String id, String name) throws SAXException; + + void endBookmark(String id) throws SAXException; + } +} diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OfflineContentHandler.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OfflineContentHandler.java new file mode 100644 index 00000000000..47edc2455ec --- /dev/null +++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OfflineContentHandler.java @@ -0,0 +1,50 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +import org.apache.commons.io.input.ClosedInputStream; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; + +/** + * Content handler decorator that always returns an empty stream from the + * {@link #resolveEntity(String, String)} method to prevent potential + * network or other external resources from being accessed by an XML parser. + *

+ * This is copied from Apache Tika. + *

+ * + * @see TIKA-185 + * @since POI 5.4.2 + */ +final class OfflineContentHandler extends ContentHandlerDecorator { + + public OfflineContentHandler(ContentHandler handler) { + super(handler); + } + + /** + * Returns an empty stream. This will make an XML parser silently + * ignore any external entities. + */ + @Override + public InputSource resolveEntity(String publicId, String systemId) { + return new InputSource(new ClosedInputStream()); + } + +} + diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ParagraphProperties.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ParagraphProperties.java new file mode 100644 index 00000000000..d77e7d1f79e --- /dev/null +++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ParagraphProperties.java @@ -0,0 +1,61 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +/** + *

+ * This is copied from Apache Tika. + *

+ * + * @since POI 5.4.2 + */ +public class ParagraphProperties { + + private String styleId; + private int ilvl = -1; + private int numId = -1; + + public String getStyleID() { + return styleId; + } + + public void setStyleID(String styleId) { + this.styleId = styleId; + } + + public void reset() { + styleId = null; + ilvl = -1; + numId = -1; + } + + public int getIlvl() { + return ilvl; + } + + public void setIlvl(int ilvl) { + this.ilvl = ilvl; + } + + public int getNumId() { + return numId; + } + + public void setNumId(int numId) { + this.numId = numId; + } +} diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/RunProperties.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/RunProperties.java new file mode 100644 index 00000000000..2feb7646cec --- /dev/null +++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/RunProperties.java @@ -0,0 +1,76 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +import org.apache.poi.xwpf.usermodel.UnderlinePatterns; + +/** + * WARNING: This class is mutable. 
Make a copy of it + * if you want persistence! + *

+ * This is copied from Apache Tika. + *

+ * + * @since POI 5.4.2 + */ +public final class RunProperties { + boolean italics = false; + boolean bold = false; + boolean strikeThrough = false; + + UnderlinePatterns underline = UnderlinePatterns.NONE; + + public boolean isItalics() { + return italics; + } + + public void setItalics(boolean italics) { + this.italics = italics; + } + + public boolean isBold() { + return bold; + } + + public void setBold(boolean bold) { + this.bold = bold; + } + + public boolean isStrikeThrough() { + return strikeThrough; + } + + public void setStrike(boolean strikeThrough) { + this.strikeThrough = strikeThrough; + } + + public UnderlinePatterns getUnderline() { + return underline; + } + + public void setUnderline(String underlineString) { + if (underlineString == null || underlineString.equals("")) { + underline = UnderlinePatterns.SINGLE; + } else if (UnderlinePatterns.NONE.name().equals(underlineString)) { + underline = UnderlinePatterns.NONE; + } else { + //TODO -- fill out rest + underline = UnderlinePatterns.SINGLE; + } + } +} + diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XMLReaderUtils.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XMLReaderUtils.java new file mode 100644 index 00000000000..d5abe5f9b83 --- /dev/null +++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XMLReaderUtils.java @@ -0,0 +1,50 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; + +import org.apache.poi.util.XMLHelper; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + *

+ * This is copied from Apache Tika. + *

+ * + * @since POI 5.4.2 + */ +public final class XMLReaderUtils implements Serializable { + + /** + * This checks context for a user specified {@link SAXParser}. + * If one is not found, this reuses a SAXParser from the pool. + */ + public static void parseSAX(InputStream is, ContentHandler contentHandler) + throws IOException, SAXException { + try { + XMLHelper.getSaxParserFactory().newSAXParser().parse(is, new OfflineContentHandler(contentHandler)); + } catch (ParserConfigurationException e) { + throw new SAXException(e); + } + } +} diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XWPFListManager.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XWPFListManager.java new file mode 100644 index 00000000000..6e458d8f70b --- /dev/null +++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XWPFListManager.java @@ -0,0 +1,193 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.xwpf.extractor.internal; + +import java.math.BigInteger; + +import org.apache.poi.xwpf.usermodel.XWPFAbstractNum; +import org.apache.poi.xwpf.usermodel.XWPFNum; +import org.apache.poi.xwpf.usermodel.XWPFNumbering; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl; + +/** + *

+ * This is copied from Apache Tika. + *

+ * + * @since POI 5.4.2 + */ +public class XWPFListManager extends AbstractListManager { + + /** + * Empty singleton to be used when there is no list manager. + * Always returns empty string. + */ + public final static XWPFListManager EMPTY_LIST = new EmptyListManager(); + private final static String SKIP_FORMAT = Character.toString((char) 61623); +//if this shows up as the lvlText, don't show a number + + private final XWPFNumbering numbering; + + //map of numId (which paragraph series is this a member of?), levelcounts + public XWPFListManager(XWPFNumbering numbering) { + this.numbering = numbering; + } + + /** + * @param paragraph paragraph + * @return the formatted number or an empty string if something went wrong + */ + public String getFormattedNumber(final XWPFParagraph paragraph) { + return getFormattedNumber(paragraph.getNumID(), + paragraph.getNumIlvl() == null ? -1 : paragraph.getNumIlvl().intValue()); + } + + public String getFormattedNumber(BigInteger numId, int iLvl) { + if (numbering == null || iLvl < 0 || numId == null) { + return ""; + } + + int currNumId = numId.intValue(); + + XWPFNum xwpfNum = numbering.getNum(numId); + + if (xwpfNum == null) { + return ""; + } + CTNum ctNum = xwpfNum.getCTNum(); + CTDecimalNumber abNum = ctNum.getAbstractNumId(); + int currAbNumId = abNum.getVal().intValue(); + + ParagraphLevelCounter lc = listLevelMap.get(currAbNumId); + LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId); + if (lc == null) { + lc = loadLevelTuples(abNum); + } + if (overrideTuples == null) { + overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels()); + } + + String formattedString = lc.incrementLevel(iLvl, overrideTuples); + + listLevelMap.put(currAbNumId, lc); + overrideTupleMap.put(currNumId, overrideTuples); + + return formattedString; + + } + + private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) { + LevelTuple[] levelTuples = new LevelTuple[length]; + int overrideLength = ctNum.sizeOfLvlOverrideArray(); + 
if (overrideLength == 0) { + return null; + } + for (int i = 0; i < length; i++) { + LevelTuple tuple; + if (i >= overrideLength) { + tuple = new LevelTuple("%" + i + "."); + } else { + CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i); + if (ctNumLvl != null) { + tuple = buildTuple(i, ctNumLvl.getLvl()); + } else { + tuple = new LevelTuple("%" + i + "."); + } + } + levelTuples[i] = tuple; + } + return levelTuples; + } + + + private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) { + //Unfortunately, we need to go this far into the underlying structure + //to get the abstract num information for the edge case where + //someone skips a level and the format is not context-free, e.g. "1.B.i". + XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal()); + CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum(); + + LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()]; + for (int i = 0; i < levels.length; i++) { + levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i)); + } + return new ParagraphLevelCounter(levels); + } + + private LevelTuple buildTuple(int level, CTLvl ctLvl) { + boolean isLegal = false; + int start = 1; + int restart = -1; + String lvlText = "%" + level + "."; + String numFmt = "decimal"; + + + if (ctLvl != null && ctLvl.getIsLgl() != null) { + isLegal = true; + } + + if (ctLvl != null && ctLvl.getNumFmt() != null && ctLvl.getNumFmt().getVal() != null) { + numFmt = ctLvl.getNumFmt().getVal().toString(); + } + if (ctLvl != null && ctLvl.getLvlRestart() != null && + ctLvl.getLvlRestart().getVal() != null) { + restart = ctLvl.getLvlRestart().getVal().intValue(); + } + if (ctLvl != null && ctLvl.getStart() != null && ctLvl.getStart().getVal() != null) { + start = ctLvl.getStart().getVal().intValue(); + } else { + + //this is a hack. Currently, this gets the lowest possible + //start for a given numFmt. We should probably try to grab the + //restartNumberingAfterBreak value in + //e.g. ??? 
+ if ("decimal".equals(numFmt) || "ordinal".equals(numFmt) || + "decimalZero".equals(numFmt)) { + start = 0; + } else { + start = 1; + } + } + if (ctLvl != null && ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) { + lvlText = ctLvl.getLvlText().getVal(); + } + return new LevelTuple(start, restart, lvlText, numFmt, isLegal); + } + + + private static class EmptyListManager extends XWPFListManager { + EmptyListManager() { + super(null); + } + + @Override + public String getFormattedNumber(XWPFParagraph paragraph) { + return ""; + } + + @Override + public String getFormattedNumber(BigInteger numId, int iLvl) { + return ""; + } + + } +} diff --git a/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java b/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java index 510765d318e..3701336bb18 100644 --- a/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java +++ b/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java @@ -21,6 +21,8 @@ Licensed to the Apache Software Foundation (ASF) under one or more import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.poi.POIDataSamples; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.util.IOUtils; import org.apache.poi.xwpf.usermodel.XWPFDocument; @@ -31,6 +33,11 @@ public static XWPFDocument openSampleDocument(String sampleName) throws IOExcept return new XWPFDocument(is); } + public static OPCPackage openSampleOPCPackage(String sampleName) throws IOException, InvalidFormatException { + InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleName); + return OPCPackage.open(is); + } + public static XWPFDocument writeOutAndReadBack(XWPFDocument doc) throws IOException { UnsynchronizedByteArrayOutputStream baos = UnsynchronizedByteArrayOutputStream.builder().setBufferSize(4096).get(); doc.write(baos); diff --git 
a/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 90f4c817a51..c44e4df4d4c 100644 --- a/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -33,6 +33,7 @@ Licensed to the Apache Software Foundation (ASF) under one or more import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.util.StringUtil; import org.apache.poi.xssf.usermodel.XSSFRelation; import org.apache.poi.xwpf.XWPFTestDataSamples; @@ -40,6 +41,7 @@ Licensed to the Apache Software Foundation (ASF) under one or more import org.apache.poi.xwpf.usermodel.XWPFSDT; import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlObject; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRow; @@ -57,11 +59,12 @@ class TestXWPFWordExtractor { */ @Test void testGetSimpleText() throws IOException { - try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("sample.docx"); - XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) { - + try ( + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("sample.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc) + ) { String text = extractor.getText(); - assertTrue(text.length() > 0); + assertFalse(text.isEmpty()); // Check contents assertStartsWith(text, @@ -77,6 +80,27 @@ void testGetSimpleText() throws IOException { } } + @Test + void testGetSimpleTextEventBased() throws Exception { + try ( + OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("sample.docx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg) + ) { + String text = 
extractor.getText(); + assertFalse(text.isEmpty()); + + // result is a bit different from the one in testGetSimpleText (extra whitespace) + + // Check contents + assertContains(text, + "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio." + ); + assertContains(text, + "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n" + ); + } + } + /** * Tests getting the text out of a complex file */ @@ -86,7 +110,7 @@ void testGetComplexText() throws IOException { XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) { String text = extractor.getText(); - assertTrue(text.length() > 0); + assertFalse(text.isEmpty()); char euro = '\u20ac'; @@ -107,6 +131,31 @@ void testGetComplexText() throws IOException { } } + @Test + void testGetComplexTextEventBased() throws Exception { + try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("IllustrativeCases.docx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) { + + String text = extractor.getText(); + assertFalse(text.isEmpty()); + + char euro = '\u20ac'; + + // Check contents + assertStartsWith(text, + " \n(V) ILLUSTRATIVE CASES\n\n" + ); + assertContains(text, + "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n"// \n\n\n" + ); + + // TODO find out why this fails + //assertEndsWith(text, + // "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n" + //); + } + } + @Test void testGetWithHyperlinks() throws IOException { try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("TestDocument.docx"); @@ -234,6 +283,16 @@ void testInsertedDeletedText() throws IOException { } } + @Test + void testInsertedDeletedTextEventBased() throws Exception { + try (OPCPackage pkg = 
XWPFTestDataSamples.openSampleOPCPackage("delins.docx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) { + + assertContains(extractor.getText(), "pendant worn"); + assertContains(extractor.getText(), "extremely well"); + } + } + @Test void testParagraphHeader() throws IOException { try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Headers.docx"); @@ -245,6 +304,17 @@ void testParagraphHeader() throws IOException { } } + @Test + void testParagraphHeaderEventBased() throws Exception { + try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("Headers.docx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) { + + assertContains(extractor.getText(), "Section 1"); + assertContains(extractor.getText(), "Section 2"); + assertContains(extractor.getText(), "Section 3"); + } + } + /** * Test that we can open and process .docm * (macro enabled) docx files (bug #45690) @@ -260,6 +330,18 @@ void testDOCMFiles() throws IOException { } } + @Disabled + @Test + void testDOCMFilesEventBased() throws Exception { + try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("45690.docm"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) { + + assertContains(extractor.getText(), "2004"); + assertContains(extractor.getText(), "2008"); + assertContains(extractor.getText(), "(120 "); + } + } + /** * Test that we handle things like tabs and * carriage returns properly in the text that @@ -289,7 +371,18 @@ void testNoFieldCodes() throws IOException { try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("FieldCodes.docx"); XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) { String text = extractor.getText(); - assertTrue(text.length() > 0); + assertFalse(text.isEmpty()); + assertFalse(text.contains("AUTHOR")); + assertFalse(text.contains("CREATEDATE")); + } + } + + @Test + void testNoFieldCodesEventBased() throws Exception { + try (OPCPackage pkg = 
XWPFTestDataSamples.openSampleOPCPackage("FieldCodes.docx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) { + String text = extractor.getText(); + assertFalse(text.isEmpty()); assertFalse(text.contains("AUTHOR")); assertFalse(text.contains("CREATEDATE")); } @@ -304,7 +397,7 @@ void testFldSimpleContent() throws IOException { try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("FldSimple.docx"); XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) { String text = extractor.getText(); - assertTrue(text.length() > 0); + assertFalse(text.isEmpty()); assertContains(text, "FldSimple.docx"); } } @@ -318,7 +411,7 @@ void testDrawings() throws IOException { try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("drawing.docx"); XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) { String text = extractor.getText(); - assertTrue(text.length() > 0); + assertFalse(text.isEmpty()); } } @@ -465,8 +558,23 @@ void testGlossary() throws IOException { @Test void testPartsInTemplate() throws IOException { - try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("60316b.dotx")) { - XWPFWordExtractor extractor = new XWPFWordExtractor(doc); + try ( + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("60316b.dotx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc) + ) { + String txt = extractor.getText(); + assertContains(txt, "header 2"); + assertContains(txt, "footer 1"); + } + } + + @Disabled // parts in template not supported in event based + @Test + void testPartsInTemplateEventBased() throws Exception { + try ( + OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("60316b.dotx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg) + ) { String txt = extractor.getText(); assertContains(txt, "header 2"); assertContains(txt, "footer 1"); @@ -475,17 +583,33 @@ void testPartsInTemplate() throws IOException { @Test void bug55966() throws IOException { - try 
(XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx")) { + try ( + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx"); + XWPFWordExtractor extractedDoc = new XWPFWordExtractor(doc) + ) { String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n" + "line\n" + "\n" + "Content control that is the entire paragraph\n"; - XWPFWordExtractor extractedDoc = new XWPFWordExtractor(doc); - String actual = extractedDoc.getText(); + assertEquals(expected, actual); + } + } - extractedDoc.close(); + @Disabled // extra test found in the event based extractor + @Test + void bug55966EventBased() throws Exception { + try ( + OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("55966.docx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg) + ) { + String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n" + + "line\n" + + "\n" + + "Content control that is the entire paragraph\n"; + + String actual = extractor.getText(); assertEquals(expected, actual); } } @@ -499,6 +623,16 @@ void testCapitalizedFlag() throws IOException { } } + @Disabled // capitalized flag not supported in event based + @Test + void testCapitalizedFlagEventBased() throws Exception { + try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("capitalized.docx"); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) { + String txt = extractor.getText(); + assertEquals( "The following word is: CAPITALIZED.", txt.trim()); + } + } + @Test void testTika2163() throws IOException { try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("ChronologicalResume.dotx"); @@ -508,6 +642,18 @@ void testTika2163() throws IOException { } } + @Test + void testTika2163EventBased() throws Exception { + final String filename = "ChronologicalResume.dotx"; + try ( + OPCPackage pkg = 
XWPFTestDataSamples.openSampleOPCPackage(filename); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg) + ) { + String txt = extractor.getText(); + assertContains(txt, "but a great-looking résumé doesn’t have to be!"); + } + } + @Test void testTika3816() throws IOException { try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("tika-3816.docx"); @@ -519,6 +665,19 @@ void testTika3816() throws IOException { } } + @Disabled // whitespace issue in text + @Test + void testTika3816EventBased() throws Exception { + final String filename = "tika-3816.docx"; + try ( + OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage(filename); + XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg) + ) { + String txt = extractor.getText(); + assertContains(txt, "Note\tDetails"); + } + } + private static List extractSDTsFromBody(XWPFDocument document) { XWPFSDT sdt; XmlCursor xmlcursor = document.getDocument().getBody().newCursor(); diff --git a/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java b/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java index 483c227c23b..a84d851cb6d 100644 --- a/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java +++ b/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java @@ -19,9 +19,6 @@ package org.apache.poi.hwpf.converter; -import java.util.Arrays; -import java.util.Locale; - import org.apache.poi.util.Beta; /** @@ -29,75 +26,9 @@ */ @Beta public final class NumberFormatter { - // use char[] instead of String to speed up StringBuilder.append(), especially in JDK 11+ - // where StringBuilder internally switched from char[] to byte[] - private static final char[][] ROMAN_LETTERS = Arrays.stream( - new String[] { "m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i" }). - map(String::toCharArray). 
- toArray(char[][]::new); - - private static final int[] ROMAN_VALUES = { 1000, 900, 500, 400, 100, 90, - 50, 40, 10, 9, 5, 4, 1 }; - - private static final int T_ARABIC = 0; - private static final int T_LOWER_LETTER = 4; - private static final int T_LOWER_ROMAN = 2; - private static final int T_ORDINAL = 5; - private static final int T_UPPER_LETTER = 3; - private static final int T_UPPER_ROMAN = 1; + // code was moved to org.apache.poi.util.NumberFormatter public static String getNumber( int num, int style ) { - switch ( style ) { - case T_UPPER_ROMAN: - return toRoman( num ).toUpperCase(Locale.ROOT); - case T_LOWER_ROMAN: - return toRoman( num ); - case T_UPPER_LETTER: - return toLetters( num ).toUpperCase(Locale.ROOT); - case T_LOWER_LETTER: - return toLetters( num ); - case T_ARABIC: - case T_ORDINAL: - default: - return String.valueOf( num ); - } - } - - private static String toLetters(int number) { - if ( number <= 0 ) { - throw new IllegalArgumentException( "Unsupported number: " + number ); - } - - int num = number; - final int radix = 26; - - char[] buf = new char[33]; - int charPos = buf.length; - - while (num > 0) { - num--; // 1 => a, not 0 => a - int remainder = num % radix; - buf[--charPos] = (char)('a'+remainder); - num = (num - remainder) / radix; - } - - return new String(buf, charPos, (buf.length - charPos)); - } - - private static String toRoman( int number ) { - if ( number <= 0 ) - throw new IllegalArgumentException( "Unsupported number: " + number ); - - StringBuilder result = new StringBuilder(); - - for ( int i = 0; i < ROMAN_LETTERS.length; i++ ) { - char[] letter = ROMAN_LETTERS[i]; - int value = ROMAN_VALUES[i]; - while ( number >= value ) { - number -= value; - result.append( letter ); - } - } - return result.toString(); + return org.apache.poi.util.NumberFormatter.getNumber(num, style); } } diff --git a/poi/src/main/java/org/apache/poi/util/NumberFormatter.java b/poi/src/main/java/org/apache/poi/util/NumberFormatter.java new file mode 
100644 index 00000000000..f88103d5412 --- /dev/null +++ b/poi/src/main/java/org/apache/poi/util/NumberFormatter.java @@ -0,0 +1,101 @@ +/* + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + */ + +package org.apache.poi.util; + +import java.util.Arrays; +import java.util.Locale; + +/** + * Utility class to translate numbers in letters, usually for lists. + */ +@Beta +public class NumberFormatter { + // use char[] instead of String to speed up StringBuilder.append(), especially in JDK 11+ + // where StringBuilder internally switched from char[] to byte[] + private static final char[][] ROMAN_LETTERS = Arrays.stream( + new String[] { "m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i" }). + map(String::toCharArray). 
+ toArray(char[][]::new); + + private static final int[] ROMAN_VALUES = { 1000, 900, 500, 400, 100, 90, + 50, 40, 10, 9, 5, 4, 1 }; + + private static final int T_ARABIC = 0; + private static final int T_LOWER_LETTER = 4; + private static final int T_LOWER_ROMAN = 2; + private static final int T_ORDINAL = 5; + private static final int T_UPPER_LETTER = 3; + private static final int T_UPPER_ROMAN = 1; + + public static String getNumber( int num, int style ) { + switch ( style ) { + case T_UPPER_ROMAN: + return toRoman( num ).toUpperCase(Locale.ROOT); + case T_LOWER_ROMAN: + return toRoman( num ); + case T_UPPER_LETTER: + return toLetters( num ).toUpperCase(Locale.ROOT); + case T_LOWER_LETTER: + return toLetters( num ); + case T_ARABIC: + case T_ORDINAL: + default: + return String.valueOf( num ); + } + } + + private static String toLetters(int number) { + if ( number <= 0 ) { + throw new IllegalArgumentException( "Unsupported number: " + number ); + } + + int num = number; + final int radix = 26; + + char[] buf = new char[33]; + int charPos = buf.length; + + while (num > 0) { + num--; // 1 => a, not 0 => a + int remainder = num % radix; + buf[--charPos] = (char)('a'+remainder); + num = (num - remainder) / radix; + } + + return new String(buf, charPos, (buf.length - charPos)); + } + + private static String toRoman( int number ) { + if ( number <= 0 ) + throw new IllegalArgumentException( "Unsupported number: " + number ); + + StringBuilder result = new StringBuilder(); + + for ( int i = 0; i < ROMAN_LETTERS.length; i++ ) { + char[] letter = ROMAN_LETTERS[i]; + int value = ROMAN_VALUES[i]; + while ( number >= value ) { + number -= value; + result.append( letter ); + } + } + return result.toString(); + } +}