hyperlinks, boolean includeTextBox,
+ boolean concatenatePhoneticRuns) {
+ this.bodyContentsHandler = bodyContentsHandler;
+ this.linkedRelationships = hyperlinks;
+ this.includeTextBox = includeTextBox;
+ this.concatenatePhoneticRuns = concatenatePhoneticRuns;
+ this.dateFormats = loadDateFormats();
+ }
+
+ @Override
+ public void startDocument() throws SAXException {
+ // intentionally empty: nothing is done when the document starts
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ // intentionally empty: nothing is done when the document ends
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+ // intentionally empty: prefix mappings are not tracked by this handler
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+ // intentionally empty: prefix mappings are not tracked by this handler
+ }
+
+ /**
+  * Reacts to an opening tag by updating the parser state and forwarding
+  * structural events (paragraphs, tables, bookmarks, hyperlinks, notes,
+  * tracked changes, embedded objects) to the body contents handler.
+  * <p>
+  * A paragraph is only announced once the element following the paragraph
+  * start is known not to be the paragraph-properties element. Everything
+  * nested inside a markup-compatibility "Choice" element is ignored.
+  *
+  * @param uri       namespace URI of the element; may be null
+  * @param localName local name of the element
+  * @param qName     qualified name of the element (unused)
+  * @param atts      attributes of the element
+  * @throws SAXException if the body contents handler fails
+  */
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts)
+         throws SAXException {
+     //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
+
+     // the previous start tag was a paragraph and this is not its properties
+     // element, so the paragraph can be started with whatever is in currPProperties
+     if (lastStartElementWasP && !PPR.equals(localName)) {
+         bodyContentsHandler.startParagraph(currPProperties);
+     }
+
+     lastStartElementWasP = false;
+
+     // depth counters are only touched for elements in the markup-compatibility namespace
+     if (MC_NS.equals(uri)) {
+         if (CHOICE.equals(localName)) {
+             inACChoiceDepth++;
+         } else if (FALLBACK.equals(localName)) {
+             inACFallbackDepth++;
+         }
+     }
+
+     if (inACChoiceDepth > 0) {
+         return;
+     }
+
+     if (!includeTextBox && TEXTBOX.equals(localName)) {
+         inTextBox = true;
+         return;
+     }
+     //these are sorted descending by frequency within docx files
+     //in our regression corpus.
+     //yes, I know, likely premature optimization...
+     if (RPR.equals(localName)) {
+         inRPr = true;
+     } else if (R.equals(localName)) {
+         inR = true;
+     } else if (T.equals(localName)) {
+         inT = true;
+     } else if (TAB.equals(localName)) {
+         runBuffer.append(TAB_CHAR);
+     } else if (P.equals(localName)) {
+         lastStartElementWasP = true;
+     } else if (B.equals(localName)) { //TODO: add bCs
+         if (inR && inRPr) {
+             currRunProperties.setBold(true);
+         }
+     } else if (TC.equals(localName)) {
+         bodyContentsHandler.startTableCell();
+     } else if (P_STYLE.equals(localName)) {
+         String styleId = atts.getValue(W_NS, "val");
+         currPProperties.setStyleID(styleId);
+     } else if (I.equals(localName)) { //TODO: add iCs
+         //rprs don't have to be inR; ignore those that aren't
+         if (inR && inRPr) {
+             currRunProperties.setItalics(true);
+         }
+     } else if (STRIKE.equals(localName)) {
+         if (inR && inRPr) {
+             currRunProperties.setStrike(true);
+         }
+     } else if (U.equals(localName)) {
+         if (inR && inRPr) {
+             currRunProperties.setUnderline(getStringVal(atts));
+         }
+     } else if (TR.equals(localName)) {
+         bodyContentsHandler.startTableRow();
+     } else if (NUM_PR.equals(localName)) {
+         inNumPr = true;
+     } else if (ILVL.equals(localName)) {
+         if (inNumPr) {
+             currPProperties.setIlvl(getIntVal(atts));
+         }
+     } else if (NUM_ID.equals(localName)) {
+         if (inNumPr) {
+             currPProperties.setNumId(getIntVal(atts));
+         }
+     } else if (BR.equals(localName)) {
+         runBuffer.append(NEWLINE);
+     } else if (BOOKMARK_START.equals(localName)) {
+         String name = atts.getValue(W_NS, "name");
+         String id = atts.getValue(W_NS, "id");
+         bodyContentsHandler.startBookmark(id, name);
+     } else if (BOOKMARK_END.equals(localName)) {
+         String id = atts.getValue(W_NS, "id");
+         bodyContentsHandler.endBookmark(id);
+     } else if (HYPERLINK.equals(localName)) { //docx hyperlink
+         String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+         if (hyperlinkId != null) {
+             // external link: resolve the relationship id (the target may be null)
+             String hyperlink = linkedRelationships.get(hyperlinkId);
+             bodyContentsHandler.hyperlinkStart(hyperlink);
+         } else {
+             // no relationship id: fall back to the anchor attribute, reported as "#anchor"
+             String anchor = atts.getValue(W_NS, "anchor");
+             if (anchor != null) {
+                 anchor = "#" + anchor;
+             }
+             bodyContentsHandler.hyperlinkStart(anchor);
+         }
+     } else if (HLINK_CLICK.equals(localName)) { //pptx hyperlink
+         String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+         if (hyperlinkId != null) {
+             String hyperlink = linkedRelationships.get(hyperlinkId);
+             bodyContentsHandler.hyperlinkStart(hyperlink);
+             // remembered so that the end of the run can close the hyperlink
+             inHlinkClick = true;
+         }
+     } else if (TBL.equals(localName)) {
+         bodyContentsHandler.startTable();
+     } else if (BLIP.equals(localName)) { //check for DRAWING_NS
+         picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+     } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
+         picDescription = atts.getValue("", "descr");
+     } else if (PIC.equals(localName)) {
+         inPic = true; //check for PIC_NS?
+     } //TODO: add sdt, sdtPr, sdtContent goes here statistically
+     else if (FOOTNOTE_REFERENCE.equals(localName)) {
+         String id = atts.getValue(W_NS, "id");
+         bodyContentsHandler.footnoteReference(id);
+     } else if (IMAGEDATA.equals(localName)) {
+         picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+         picDescription = atts.getValue(O_NS, "title");
+     } else if (INS.equals(localName)) {
+         startEditedSection(EditType.INSERT, atts);
+     } else if (DEL_TEXT.equals(localName)) {
+         inDelText = true;
+     } else if (DEL.equals(localName)) {
+         startEditedSection(EditType.DELETE, atts);
+     } else if (MOVE_TO.equals(localName)) {
+         startEditedSection(EditType.MOVE_TO, atts);
+     } else if (MOVE_FROM.equals(localName)) {
+         startEditedSection(EditType.MOVE_FROM, atts);
+     } else if (OLE_OBJECT.equals(localName)) { //check for O_NS?
+         String type = null;
+         String refId = null;
+         //TODO: clean this up and ...want to get ProgID?
+         for (int i = 0; i < atts.getLength(); i++) {
+             String attLocalName = atts.getLocalName(i);
+             String attValue = atts.getValue(i);
+             if ("Type".equals(attLocalName)) {
+                 type = attValue;
+             } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) &&
+                     "id".equals(attLocalName)) {
+                 refId = attValue;
+             }
+         }
+         // only objects whose Type attribute is "Embed" are reported
+         if ("Embed".equals(type)) {
+             bodyContentsHandler.embeddedOLERef(refId);
+         }
+     } else if (CR.equals(localName)) {
+         runBuffer.append(NEWLINE);
+     } else if (ENDNOTE_REFERENCE.equals(localName)) {
+         String id = atts.getValue(W_NS, "id");
+         bodyContentsHandler.endnoteReference(id);
+     } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
+         inV = true;
+     } else if (RT.equals(localName)) {
+         inRt = true;
+     }
+
+ }
+
+ private void startEditedSection(EditType editType, Attributes atts) throws SAXException {
+ String editAuthor = atts.getValue(W_NS, "author");
+ String editDateString = atts.getValue(W_NS, "date");
+ Date editDate = null;
+ if (editDateString != null) {
+ editDate = tryToParseDate(editDateString);
+ }
+ bodyContentsHandler.startEditedSection(editAuthor, editDate, editType);
+ this.editType = editType;
+ }
+
+ private String getStringVal(Attributes atts) {
+ String valString = atts.getValue(W_NS, VAL);
+ if (valString != null) {
+ return valString;
+ }
+ return "";
+ }
+
+ private int getIntVal(Attributes atts) {
+ String valString = atts.getValue(W_NS, VAL);
+ if (valString != null) {
+ try {
+ return Integer.parseInt(valString);
+ } catch (NumberFormatException e) {
+ //swallow
+ }
+ }
+ return -1;
+ }
+
+ /**
+  * Reacts to a closing tag: unwinds the state set up by the matching start
+  * tag, flushes buffered text at the end of runs and paragraphs and forwards
+  * the closing structural events to the body contents handler.
+  *
+  * @param uri       namespace URI of the element; may be null
+  * @param localName local name of the element
+  * @param qName     qualified name of the element (unused)
+  * @throws SAXException if the body contents handler fails
+  */
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+
+     // mirror startElement: the depth counters are only incremented for elements in
+     // the markup-compatibility namespace, so only those may decrement them again
+     if (MC_NS.equals(uri)) {
+         if (CHOICE.equals(localName)) {
+             inACChoiceDepth--;
+         } else if (FALLBACK.equals(localName)) {
+             inACFallbackDepth--;
+         }
+     }
+     if (inACChoiceDepth > 0) {
+         return;
+     }
+
+     if (!includeTextBox && TEXTBOX.equals(localName)) {
+         inTextBox = false;
+         return;
+     }
+     if (PIC.equals(localName)) { //PIC_NS
+         handlePict();
+         inPic = false;
+         return;
+     } else if (RPR.equals(localName)) {
+         inRPr = false;
+     } else if (R.equals(localName)) {
+         handleEndOfRun();
+     } else if (T.equals(localName)) {
+         inT = false;
+     } else if (PPR.equals(localName)) {
+         if (!pStarted) {
+             bodyContentsHandler.startParagraph(currPProperties);
+             pStarted = true;
+         }
+         currPProperties.reset();
+     } else if (P.equals(localName)) {
+         if (runBuffer.length() > 0) {
+             // text is still buffered although no run is open (a tab element, for
+             // example, is buffered whether or not it sits inside a run)...this will
+             // treat that as if it were a run
+             //...TODO: should we swallow whitespace that doesn't occur in a run?
+             bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+             runBuffer.setLength(0);
+         }
+         pStarted = false;
+         bodyContentsHandler.endParagraph();
+     } else if (TC.equals(localName)) {
+         bodyContentsHandler.endTableCell();
+     } else if (TR.equals(localName)) {
+         bodyContentsHandler.endTableRow();
+     } else if (TBL.equals(localName)) {
+         bodyContentsHandler.endTable();
+     } else if (FLD.equals(localName)) {
+         handleEndOfRun();
+     } else if (DEL_TEXT.equals(localName)) {
+         inDelText = false;
+     } else if (INS.equals(localName) || DEL.equals(localName) || MOVE_TO.equals(localName) ||
+             MOVE_FROM.equals(localName)) {
+         editType = EditType.NONE;
+     } else if (HYPERLINK.equals(localName)) {
+         bodyContentsHandler.hyperlinkEnd();
+     } else if (PICT.equals(localName)) {
+         handlePict();
+     } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
+         inV = false;
+         handleEndOfRun();
+     } else if (RT.equals(localName)) {
+         inRt = false;
+     } else if (RUBY.equals(localName)) {
+         handleEndOfRuby();
+     }
+ }
+
+ private void handleEndOfRuby() throws SAXException {
+ if (rubyBuffer.length() > 0) {
+ if (concatenatePhoneticRuns) {
+ bodyContentsHandler.run(currRunProperties, " (" + rubyBuffer.toString() + ")");
+ }
+ rubyBuffer.setLength(0);
+ }
+ }
+
+ private void handleEndOfRun() throws SAXException {
+ bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+ if (inHlinkClick) {
+ bodyContentsHandler.hyperlinkEnd();
+ inHlinkClick = false;
+ }
+ inR = false;
+ runBuffer.setLength(0);
+ currRunProperties.setBold(false);
+ currRunProperties.setItalics(false);
+ currRunProperties.setStrike(false);
+ currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
+ }
+
+ private void handlePict() throws SAXException {
+ String picFileName = null;
+ if (picRId != null) {
+ picFileName = linkedRelationships.get(picRId);
+ }
+ bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+ picDescription = null;
+ picRId = null;
+ inPic = false;
+ }
+
+ /**
+  * Buffers character data that belongs to the extracted text. Data inside a
+  * markup-compatibility "Choice" or inside an excluded text box is dropped.
+  * Moved-from and deleted text is only kept when the body contents handler
+  * asks for it; chart values are followed by a tab.
+  *
+  * @param ch     character array containing the data
+  * @param start  offset of the first character
+  * @param length number of characters
+  * @throws SAXException if the body contents handler fails
+  */
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+
+     final boolean suppressed = inACChoiceDepth > 0 || (!includeTextBox && inTextBox);
+     if (suppressed) {
+         return;
+     }
+
+     final boolean movedFromText = editType.equals(EditType.MOVE_FROM) && inT;
+     if (movedFromText) {
+         if (bodyContentsHandler.isIncludeMoveFromText()) {
+             appendToBuffer(ch, start, length);
+         }
+         return;
+     }
+     if (inT) {
+         appendToBuffer(ch, start, length);
+         return;
+     }
+     if (bodyContentsHandler.isIncludeDeletedText() && editType.equals(EditType.DELETE)) {
+         appendToBuffer(ch, start, length);
+         return;
+     }
+     if (inV) {
+         appendToBuffer(ch, start, length);
+         appendToBuffer(TAB_CHAR, 0, 1);
+     }
+ }
+
+ /**
+  * Buffers ignorable whitespace when it occurs inside a text element, or
+  * inside deleted text that the body contents handler wants to keep.
+  * Whitespace inside a markup-compatibility "Choice" or inside an excluded
+  * text box is dropped.
+  *
+  * @param ch     character array containing the whitespace
+  * @param start  offset of the first character
+  * @param length number of characters
+  * @throws SAXException if the body contents handler fails
+  */
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+     final boolean suppressed = inACChoiceDepth > 0 || (!includeTextBox && inTextBox);
+     if (suppressed) {
+         return;
+     }
+
+     // the handler is only consulted when we are not already inside a text element
+     final boolean keep = inT || (bodyContentsHandler.isIncludeDeletedText() && inDelText);
+     if (keep) {
+         appendToBuffer(ch, start, length);
+     }
+ }
+
+ /**
+  * Appends characters to the run buffer, or to the ruby buffer while inside
+  * a ruby-text element.
+  *
+  * @param ch     character array containing the data
+  * @param start  offset of the first character
+  * @param length number of characters
+  */
+ private void appendToBuffer(char[] ch, int start, int length) throws SAXException {
+     if (!inRt) {
+         runBuffer.append(ch, start, length);
+         return;
+     }
+     rubyBuffer.append(ch, start, length);
+ }
+
+ /**
+  * Tries to parse the date string with each of the known date formats in
+  * turn; returns null if no parse was possible.
+  * <p>
+  * A trailing time zone of the form {@code +hh:mm} or {@code -hh:mm} is
+  * rewritten to {@code +hhmm} / {@code -hhmm} before parsing.
+  * <p>
+  * This is not thread safe!
+  *
+  * @param dateString the text to parse; null or too-short input yields null
+  * @return the parsed date, or null if none of the formats matched
+  */
+ private Date tryToParseDate(String dateString) {
+     if (dateString == null) {
+         return null;
+     }
+     // Java doesn't like timezones in the form ss+hh:mm
+     // It only likes the hhmm form, without the colon
+     int n = dateString.length();
+     // a "+hh:mm" suffix needs at least six characters; without this guard a short
+     // string would make charAt throw StringIndexOutOfBoundsException
+     if (n >= 6 && dateString.charAt(n - 3) == ':' &&
+             (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) {
+         dateString = dateString.substring(0, n - 3) + dateString.substring(n - 2);
+     }
+
+     for (DateFormat df : dateFormats) {
+         try {
+             return df.parse(dateString);
+         } catch (java.text.ParseException e) {
+             //swallow: the next format gets its chance
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Builds the list of date formats that are tried, in order, when parsing
+  * the date of a tracked change.
+  *
+  * @return a new mutable list of date formats
+  */
+ private static List<DateFormat> loadDateFormats() {
+     List<DateFormat> dateFormats = new ArrayList<>();
+     // yyyy-mm-ddThh...
+     dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu
+     dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone
+     dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone
+     // yyyy-mm-dd hh...
+     dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu
+     dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone
+     dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone
+     // Date without time, set to Midday UTC
+     dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format
+     dateFormats.add(createDateFormat("yyyy:MM:dd",
+             MIDDAY)); // Image (IPTC/EXIF) format
+
+     return dateFormats;
+ }
+
+ /**
+  * Creates a date format for the given pattern using US date symbols.
+  *
+  * @param format   pattern in {@link SimpleDateFormat} syntax
+  * @param timezone time zone to apply, or null to keep the format's default
+  * @return the configured date format
+  */
+ private static DateFormat createDateFormat(String format, TimeZone timezone) {
+     final DateFormatSymbols usSymbols = new DateFormatSymbols(Locale.US);
+     final SimpleDateFormat formatter = new SimpleDateFormat(format, usSymbols);
+     if (timezone == null) {
+         return formatter;
+     }
+     formatter.setTimeZone(timezone);
+     return formatter;
+ }
+
+ /**
+  * Kind of tracked change the parser is currently inside.
+  */
+ public enum EditType {
+     /** Not inside a tracked change. */
+     NONE,
+     /** Inside an insertion. */
+     INSERT,
+     /** Inside a deletion. */
+     DELETE,
+     /** Inside the destination of a move. */
+     MOVE_TO,
+     /** Inside the origin of a move. */
+     MOVE_FROM
+ }
+
+ /**
+ * Callback interface that receives the body content events (runs, paragraphs,
+ * tables, hyperlinks, notes, tracked changes, embedded objects, bookmarks)
+ * produced while the document XML is parsed.
+ */
+ public interface XWPFBodyContentsHandler {
+
+ /**
+ * Receives a piece of text together with the formatting that applies to it.
+ *
+ * @param runProperties formatting of the run; the instance is mutable and reused
+ * @param contents the text of the run
+ */
+ void run(RunProperties runProperties, String contents) throws SAXException;
+
+ /**
+ * Signals the start of a hyperlink.
+ *
+ * @param link the link; can be null
+ */
+ void hyperlinkStart(String link) throws SAXException;
+
+ /**
+ * Signals the end of the hyperlink opened by {@link #hyperlinkStart(String)}.
+ */
+ void hyperlinkEnd() throws SAXException;
+
+ /**
+ * Signals the start of a paragraph.
+ *
+ * @param paragraphProperties style and numbering information collected for the paragraph
+ */
+ void startParagraph(ParagraphProperties paragraphProperties) throws SAXException;
+
+ /**
+ * Signals the end of the current paragraph.
+ */
+ void endParagraph() throws SAXException;
+
+ /**
+ * Signals the start of a table.
+ */
+ void startTable() throws SAXException;
+
+ /**
+ * Signals the end of a table.
+ */
+ void endTable() throws SAXException;
+
+ /**
+ * Signals the start of a table row.
+ */
+ void startTableRow() throws SAXException;
+
+ /**
+ * Signals the end of a table row.
+ */
+ void endTableRow() throws SAXException;
+
+ /**
+ * Signals the start of a table cell.
+ */
+ void startTableCell() throws SAXException;
+
+ /**
+ * Signals the end of a table cell.
+ */
+ void endTableCell() throws SAXException;
+
+ /**
+ * NOTE(review): presumably the start of a structured document tag (sdt);
+ * no caller is visible in this part of the file -- confirm.
+ */
+ void startSDT() throws SAXException;
+
+ /**
+ * NOTE(review): presumably the end of a structured document tag (sdt);
+ * no caller is visible in this part of the file -- confirm.
+ */
+ void endSDT() throws SAXException;
+
+ /**
+ * Signals the start of a tracked change.
+ *
+ * @param editor author of the change; can be null
+ * @param date date of the change; null when missing or not parseable
+ * @param editType kind of change
+ */
+ void startEditedSection(String editor, Date date, EditType editType) throws SAXException;
+
+ /**
+ * NOTE(review): presumably the end of a tracked change; no caller is
+ * visible in this part of the file -- confirm.
+ */
+ void endEditedSection() throws SAXException;
+
+ /**
+ * @return whether the text of deletions should be included in the extracted text
+ */
+ boolean isIncludeDeletedText() throws SAXException;
+
+ /**
+ * Reports a reference to a footnote.
+ *
+ * @param id id attribute of the reference; can be null
+ */
+ void footnoteReference(String id) throws SAXException;
+
+ /**
+ * Reports a reference to an endnote.
+ *
+ * @param id id attribute of the reference; can be null
+ */
+ void endnoteReference(String id) throws SAXException;
+
+ /**
+ * @return whether the text at the origin of a move should be included in the extracted text
+ */
+ boolean isIncludeMoveFromText() throws SAXException;
+
+ /**
+ * Reports an embedded OLE object.
+ *
+ * @param refId relationship id of the object; can be null
+ */
+ void embeddedOLERef(String refId) throws SAXException;
+
+ /**
+ * Reports an embedded picture.
+ *
+ * @param picFileName value the picture's relationship id resolves to; can be null
+ * @param picDescription description of the picture; can be null
+ */
+ void embeddedPicRef(String picFileName, String picDescription) throws SAXException;
+
+ /**
+ * Signals the start of a bookmark.
+ *
+ * @param id id attribute of the bookmark; can be null
+ * @param name name attribute of the bookmark; can be null
+ */
+ void startBookmark(String id, String name) throws SAXException;
+
+ /**
+ * Signals the end of a bookmark.
+ *
+ * @param id id attribute of the bookmark; can be null
+ */
+ void endBookmark(String id) throws SAXException;
+ }
+}
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OfflineContentHandler.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OfflineContentHandler.java
new file mode 100644
index 00000000000..47edc2455ec
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/OfflineContentHandler.java
@@ -0,0 +1,50 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.extractor.internal;
+
+import org.apache.commons.io.input.ClosedInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+
+/**
+ * Content handler decorator whose {@link #resolveEntity(String, String)}
+ * always answers with an empty stream, so that an XML parser cannot reach
+ * network or other external resources through entity resolution.
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-185">TIKA-185</a>
+ * @since POI 5.4.2
+ */
+final class OfflineContentHandler extends ContentHandlerDecorator {
+
+    /**
+     * @param handler the content handler to decorate
+     */
+    public OfflineContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Resolves every external entity to an already-closed stream, which
+     * makes an XML parser silently ignore the entity.
+     *
+     * @param publicId public identifier of the entity (ignored)
+     * @param systemId system identifier of the entity (ignored)
+     * @return an input source backed by a closed, empty stream
+     */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId) {
+        final ClosedInputStream nothingToRead = new ClosedInputStream();
+        return new InputSource(nothingToRead);
+    }
+
+}
+
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ParagraphProperties.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ParagraphProperties.java
new file mode 100644
index 00000000000..d77e7d1f79e
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/ParagraphProperties.java
@@ -0,0 +1,61 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.extractor.internal;
+
+/**
+ * Mutable holder for the paragraph-level information gathered while parsing:
+ * the style id and the list numbering id and level. Numeric values that have
+ * not been set are -1; an unset style id is null.
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @since POI 5.4.2
+ */
+public class ParagraphProperties {
+
+    /** Marker for a numbering id or level that has not been set. */
+    private static final int UNSET = -1;
+
+    private String styleId;
+    private int ilvl = UNSET;
+    private int numId = UNSET;
+
+    /**
+     * Puts all values back to their unset state.
+     */
+    public void reset() {
+        this.styleId = null;
+        this.ilvl = UNSET;
+        this.numId = UNSET;
+    }
+
+    /**
+     * @return the style id, or null if none was set
+     */
+    public String getStyleID() {
+        return this.styleId;
+    }
+
+    /**
+     * @param styleId the style id; can be null
+     */
+    public void setStyleID(String styleId) {
+        this.styleId = styleId;
+    }
+
+    /**
+     * @return the list level, or -1 if none was set
+     */
+    public int getIlvl() {
+        return this.ilvl;
+    }
+
+    /**
+     * @param ilvl the list level
+     */
+    public void setIlvl(int ilvl) {
+        this.ilvl = ilvl;
+    }
+
+    /**
+     * @return the numbering id, or -1 if none was set
+     */
+    public int getNumId() {
+        return this.numId;
+    }
+
+    /**
+     * @param numId the numbering id
+     */
+    public void setNumId(int numId) {
+        this.numId = numId;
+    }
+}
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/RunProperties.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/RunProperties.java
new file mode 100644
index 00000000000..2feb7646cec
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/RunProperties.java
@@ -0,0 +1,76 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.extractor.internal;
+
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
+
+/**
+ * Formatting of a run: italics, bold, strike-through and underline.
+ * <p>
+ * WARNING: This class is mutable. Make a copy of it
+ * if you want persistence!
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @since POI 5.4.2
+ */
+public final class RunProperties {
+    boolean italics = false;
+    boolean bold = false;
+    boolean strikeThrough = false;
+
+    UnderlinePatterns underline = UnderlinePatterns.NONE;
+
+    /**
+     * @return whether the run is in italics
+     */
+    public boolean isItalics() {
+        return italics;
+    }
+
+    /**
+     * @param italics whether the run is in italics
+     */
+    public void setItalics(boolean italics) {
+        this.italics = italics;
+    }
+
+    /**
+     * @return whether the run is bold
+     */
+    public boolean isBold() {
+        return bold;
+    }
+
+    /**
+     * @param bold whether the run is bold
+     */
+    public void setBold(boolean bold) {
+        this.bold = bold;
+    }
+
+    /**
+     * @return whether the run is struck through
+     */
+    public boolean isStrikeThrough() {
+        return strikeThrough;
+    }
+
+    /**
+     * @param strikeThrough whether the run is struck through
+     */
+    public void setStrike(boolean strikeThrough) {
+        this.strikeThrough = strikeThrough;
+    }
+
+    /**
+     * @return the underline pattern; only NONE and SINGLE are currently produced
+     */
+    public UnderlinePatterns getUnderline() {
+        return underline;
+    }
+
+    /**
+     * Sets the underline pattern from its textual form. A null or empty
+     * string means "underlined" (SINGLE); the name of
+     * {@link UnderlinePatterns#NONE} in any letter case turns underlining
+     * off; every other value is currently mapped to SINGLE.
+     *
+     * @param underlineString textual underline value; can be null
+     */
+    public void setUnderline(String underlineString) {
+        if (underlineString == null || underlineString.isEmpty()) {
+            underline = UnderlinePatterns.SINGLE;
+        } else if (UnderlinePatterns.NONE.name().equalsIgnoreCase(underlineString)) {
+            // compare case-insensitively: the enum name is "NONE", while an attribute
+            // value spelled "none" must not be mistaken for an underlined run
+            underline = UnderlinePatterns.NONE;
+        } else {
+            //TODO -- fill out rest
+            underline = UnderlinePatterns.SINGLE;
+        }
+    }
+}
+
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XMLReaderUtils.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XMLReaderUtils.java
new file mode 100644
index 00000000000..d5abe5f9b83
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XMLReaderUtils.java
@@ -0,0 +1,50 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.extractor.internal;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+
+import org.apache.poi.util.XMLHelper;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Helper for SAX-parsing a stream with entity resolution disabled.
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @since POI 5.4.2
+ */
+public final class XMLReaderUtils implements Serializable {
+
+    /**
+     * Parses the stream with a {@link SAXParser} newly created from
+     * {@link XMLHelper#getSaxParserFactory()}. The content handler is wrapped
+     * in an {@link OfflineContentHandler} before it is handed to the parser.
+     * No explicit close of the stream is performed here.
+     *
+     * @param is             the XML to parse
+     * @param contentHandler receiver of the SAX events
+     * @throws IOException  if reading the stream fails
+     * @throws SAXException if parsing fails or the parser cannot be configured
+     */
+    public static void parseSAX(InputStream is, ContentHandler contentHandler)
+            throws IOException, SAXException {
+        try {
+            final SAXParser parser = XMLHelper.getSaxParserFactory().newSAXParser();
+            final OfflineContentHandler offlineHandler = new OfflineContentHandler(contentHandler);
+            parser.parse(is, offlineHandler);
+        } catch (ParserConfigurationException e) {
+            throw new SAXException(e);
+        }
+    }
+}
diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XWPFListManager.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XWPFListManager.java
new file mode 100644
index 00000000000..6e458d8f70b
--- /dev/null
+++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/extractor/internal/XWPFListManager.java
@@ -0,0 +1,193 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.extractor.internal;
+
+import java.math.BigInteger;
+
+import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
+import org.apache.poi.xwpf.usermodel.XWPFNum;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl;
+
+/**
+ * Produces the formatted list number (e.g. "1.", "a)") of numbered
+ * paragraphs from the numbering definitions of a document, keeping count of
+ * the levels that were already handed out.
+ * <p>
+ * This is copied from Apache Tika.
+ * </p>
+ *
+ * @since POI 5.4.2
+ */
+public class XWPFListManager extends AbstractListManager {
+
+    /**
+     * Empty singleton to be used when there is no list manager.
+     * Always returns empty string.
+     */
+    public final static XWPFListManager EMPTY_LIST = new EmptyListManager();
+    //if this shows up as the lvlText, don't show a number
+    private final static String SKIP_FORMAT = Character.toString((char) 61623);
+
+    private final XWPFNumbering numbering;
+
+    /**
+     * @param numbering the numbering definitions of the document; can be null,
+     *                  in which case every formatted number is the empty string
+     */
+    public XWPFListManager(XWPFNumbering numbering) {
+        this.numbering = numbering;
+    }
+
+    /**
+     * @param paragraph paragraph
+     * @return the formatted number or an empty string if something went wrong
+     */
+    public String getFormattedNumber(final XWPFParagraph paragraph) {
+        return getFormattedNumber(paragraph.getNumID(),
+                paragraph.getNumIlvl() == null ? -1 : paragraph.getNumIlvl().intValue());
+    }
+
+    /**
+     * Increments the counter of the given level of the given numbering and
+     * returns the resulting formatted number.
+     *
+     * @param numId numbering id of the paragraph; can be null
+     * @param iLvl  list level of the paragraph; negative means "no level"
+     * @return the formatted number or an empty string if something went wrong,
+     *         including numbering definitions that cannot be resolved
+     */
+    public String getFormattedNumber(BigInteger numId, int iLvl) {
+        if (numbering == null || iLvl < 0 || numId == null) {
+            return "";
+        }
+
+        XWPFNum xwpfNum = numbering.getNum(numId);
+        if (xwpfNum == null) {
+            return "";
+        }
+        CTNum ctNum = xwpfNum.getCTNum();
+        CTDecimalNumber abNum = ctNum == null ? null : ctNum.getAbstractNumId();
+        // a numbering without a usable reference to its abstract numbering
+        // cannot be formatted; honour the "empty string" contract instead of failing
+        if (abNum == null || abNum.getVal() == null) {
+            return "";
+        }
+
+        int currNumId = numId.intValue();
+        int currAbNumId = abNum.getVal().intValue();
+
+        ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+        if (lc == null) {
+            lc = loadLevelTuples(abNum);
+            if (lc == null) {
+                return "";
+            }
+        }
+        LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+        if (overrideTuples == null) {
+            overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
+        }
+
+        String formattedString = lc.incrementLevel(iLvl, overrideTuples);
+
+        listLevelMap.put(currAbNumId, lc);
+        overrideTupleMap.put(currNumId, overrideTuples);
+
+        return formattedString;
+    }
+
+    /**
+     * Builds the per-level overrides of a numbering.
+     *
+     * @param ctNum  the numbering
+     * @param length number of levels of the abstract numbering
+     * @return one tuple per level, or null if the numbering has no overrides
+     */
+    private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
+        int overrideLength = ctNum.sizeOfLvlOverrideArray();
+        if (overrideLength == 0) {
+            return null;
+        }
+        LevelTuple[] levelTuples = new LevelTuple[length];
+        for (int i = 0; i < length; i++) {
+            LevelTuple tuple;
+            if (i >= overrideLength) {
+                tuple = new LevelTuple("%" + i + ".");
+            } else {
+                CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i);
+                if (ctNumLvl != null) {
+                    tuple = buildTuple(i, ctNumLvl.getLvl());
+                } else {
+                    tuple = new LevelTuple("%" + i + ".");
+                }
+            }
+            levelTuples[i] = tuple;
+        }
+        return levelTuples;
+    }
+
+    /**
+     * Builds the level counter of an abstract numbering.
+     *
+     * @param abNum reference to the abstract numbering
+     * @return the counter, or null if the abstract numbering cannot be found
+     */
+    private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) {
+        //Unfortunately, we need to go this far into the underlying structure
+        //to get the abstract num information for the edge case where
+        //someone skips a level and the format is not context-free, e.g. "1.B.i".
+        XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal());
+        if (abstractNum == null) {
+            return null;
+        }
+        CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum();
+        if (ctAbstractNum == null) {
+            return null;
+        }
+
+        LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()];
+        for (int i = 0; i < levels.length; i++) {
+            levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i));
+        }
+        return new ParagraphLevelCounter(levels);
+    }
+
+    /**
+     * Builds the formatting information of one level, falling back to
+     * decimal numbering with the text "%level." for everything the level
+     * definition does not specify.
+     *
+     * @param level index of the level
+     * @param ctLvl definition of the level; can be null
+     * @return the tuple describing the level
+     */
+    private LevelTuple buildTuple(int level, CTLvl ctLvl) {
+        boolean isLegal = false;
+        int start = 1;
+        int restart = -1;
+        String lvlText = "%" + level + ".";
+        String numFmt = "decimal";
+
+        if (ctLvl != null && ctLvl.getIsLgl() != null) {
+            isLegal = true;
+        }
+
+        if (ctLvl != null && ctLvl.getNumFmt() != null && ctLvl.getNumFmt().getVal() != null) {
+            numFmt = ctLvl.getNumFmt().getVal().toString();
+        }
+        if (ctLvl != null && ctLvl.getLvlRestart() != null &&
+                ctLvl.getLvlRestart().getVal() != null) {
+            restart = ctLvl.getLvlRestart().getVal().intValue();
+        }
+        if (ctLvl != null && ctLvl.getStart() != null && ctLvl.getStart().getVal() != null) {
+            start = ctLvl.getStart().getVal().intValue();
+        } else {
+            //this is a hack. Currently, this gets the lowest possible
+            //start for a given numFmt. We should probably try to grab the
+            //restartNumberingAfterBreak value of the abstract numbering
+            //instead (TODO: confirm where that value lives).
+            if ("decimal".equals(numFmt) || "ordinal".equals(numFmt) ||
+                    "decimalZero".equals(numFmt)) {
+                start = 0;
+            } else {
+                start = 1;
+            }
+        }
+        if (ctLvl != null && ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) {
+            lvlText = ctLvl.getLvlText().getVal();
+        }
+        return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+    }
+
+
+    /**
+     * List manager without numbering definitions; every formatted number is
+     * the empty string.
+     */
+    private static class EmptyListManager extends XWPFListManager {
+        EmptyListManager() {
+            super(null);
+        }
+
+        @Override
+        public String getFormattedNumber(XWPFParagraph paragraph) {
+            return "";
+        }
+
+        @Override
+        public String getFormattedNumber(BigInteger numId, int iLvl) {
+            return "";
+        }
+
+    }
+}
diff --git a/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java b/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java
index 510765d318e..3701336bb18 100644
--- a/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java
+++ b/poi-ooxml/src/test/java/org/apache/poi/xwpf/XWPFTestDataSamples.java
@@ -21,6 +21,8 @@ Licensed to the Apache Software Foundation (ASF) under one or more
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.POIDataSamples;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
@@ -31,6 +33,11 @@ public static XWPFDocument openSampleDocument(String sampleName) throws IOExcept
return new XWPFDocument(is);
}
+ /**
+  * Opens a sample document from the document test data as an OPC package.
+  *
+  * @param sampleName file name of the sample
+  * @return the opened package; the caller is expected to close it
+  */
+ public static OPCPackage openSampleOPCPackage(String sampleName) throws IOException, InvalidFormatException {
+     final POIDataSamples documentSamples = POIDataSamples.getDocumentInstance();
+     return OPCPackage.open(documentSamples.openResourceAsStream(sampleName));
+ }
+
public static XWPFDocument writeOutAndReadBack(XWPFDocument doc) throws IOException {
UnsynchronizedByteArrayOutputStream baos = UnsynchronizedByteArrayOutputStream.builder().setBufferSize(4096).get();
doc.write(baos);
diff --git a/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
index 90f4c817a51..c44e4df4d4c 100644
--- a/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -33,6 +33,7 @@ Licensed to the Apache Software Foundation (ASF) under one or more
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.util.StringUtil;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.XWPFTestDataSamples;
@@ -40,6 +41,7 @@ Licensed to the Apache Software Foundation (ASF) under one or more
import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRow;
@@ -57,11 +59,12 @@ class TestXWPFWordExtractor {
*/
@Test
void testGetSimpleText() throws IOException {
- try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("sample.docx");
- XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
-
+ try (
+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("sample.docx");
+ XWPFWordExtractor extractor = new XWPFWordExtractor(doc)
+ ) {
String text = extractor.getText();
- assertTrue(text.length() > 0);
+ assertFalse(text.isEmpty());
// Check contents
assertStartsWith(text,
@@ -77,6 +80,27 @@ void testGetSimpleText() throws IOException {
}
}
+ @Test
+ void testGetSimpleTextEventBased() throws Exception {
+ try (
+ OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("sample.docx");
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)
+ ) {
+ String text = extractor.getText();
+ assertFalse(text.isEmpty());
+
+ // The event-based result differs a bit from the one in testGetSimpleText
+ // (extra whitespace); the checks below only look for contained substrings.
+ // Check contents
+ assertContains(text,
+ "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
+ );
+ assertContains(text,
+ "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
+ );
+ }
+ }
+
/**
* Tests getting the text out of a complex file
*/
@@ -86,7 +110,7 @@ void testGetComplexText() throws IOException {
XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
String text = extractor.getText();
- assertTrue(text.length() > 0);
+ assertFalse(text.isEmpty());
char euro = '\u20ac';
@@ -107,6 +131,31 @@ void testGetComplexText() throws IOException {
}
}
+ @Test
+ void testGetComplexTextEventBased() throws Exception {
+ try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("IllustrativeCases.docx");
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) {
+
+ String text = extractor.getText();
+ assertFalse(text.isEmpty());
+
+ char euro = '\u20ac';
+
+ // Check contents
+ assertStartsWith(text,
+ " \n(V) ILLUSTRATIVE CASES\n\n"
+ );
+ assertContains(text,
+ "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n"
+ );
+
+ // TODO find out why the end of the event-based output does not match this expectation
+ //assertEndsWith(text,
+ // "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n"
+ //);
+ }
+ }
+
@Test
void testGetWithHyperlinks() throws IOException {
try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("TestDocument.docx");
@@ -234,6 +283,16 @@ void testInsertedDeletedText() throws IOException {
}
}
+    @Test
+    void testInsertedDeletedTextEventBased() throws Exception {
+        try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("delins.docx");
+             XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) {
+            String text = extractor.getText(); // extract once, reuse for every assertion
+            assertContains(text, "pendant worn");
+            assertContains(text, "extremely well");
+        }
+    }
+
@Test
void testParagraphHeader() throws IOException {
try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Headers.docx");
@@ -245,6 +304,17 @@ void testParagraphHeader() throws IOException {
}
}
+    @Test
+    void testParagraphHeaderEventBased() throws Exception {
+        try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("Headers.docx");
+             XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) {
+            String text = extractor.getText(); // extract once, reuse for every assertion
+            assertContains(text, "Section 1");
+            assertContains(text, "Section 2");
+            assertContains(text, "Section 3");
+        }
+    }
+
/**
* Test that we can open and process .docm
* (macro enabled) docx files (bug #45690)
@@ -260,6 +330,18 @@ void testDOCMFiles() throws IOException {
}
}
+    @Disabled // NOTE(review): no reason recorded for disabling this test - confirm and document
+    @Test
+    void testDOCMFilesEventBased() throws Exception {
+        try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("45690.docm");
+             XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) {
+            String text = extractor.getText(); // extract once, reuse for every assertion
+            assertContains(text, "2004");
+            assertContains(text, "2008");
+            assertContains(text, "(120 ");
+        }
+    }
+
/**
* Test that we handle things like tabs and
* carriage returns properly in the text that
@@ -289,7 +371,18 @@ void testNoFieldCodes() throws IOException {
try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("FieldCodes.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
String text = extractor.getText();
- assertTrue(text.length() > 0);
+ assertFalse(text.isEmpty());
+ assertFalse(text.contains("AUTHOR"));
+ assertFalse(text.contains("CREATEDATE"));
+ }
+ }
+
+ @Test
+ void testNoFieldCodesEventBased() throws Exception {
+ try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("FieldCodes.docx");
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) {
+ String text = extractor.getText();
+ assertFalse(text.isEmpty());
assertFalse(text.contains("AUTHOR"));
assertFalse(text.contains("CREATEDATE"));
}
@@ -304,7 +397,7 @@ void testFldSimpleContent() throws IOException {
try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("FldSimple.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
String text = extractor.getText();
- assertTrue(text.length() > 0);
+ assertFalse(text.isEmpty());
assertContains(text, "FldSimple.docx");
}
}
@@ -318,7 +411,7 @@ void testDrawings() throws IOException {
try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("drawing.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
String text = extractor.getText();
- assertTrue(text.length() > 0);
+ assertFalse(text.isEmpty());
}
}
@@ -465,8 +558,23 @@ void testGlossary() throws IOException {
@Test
void testPartsInTemplate() throws IOException {
- try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("60316b.dotx")) {
- XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+ try (
+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("60316b.dotx");
+ XWPFWordExtractor extractor = new XWPFWordExtractor(doc)
+ ) {
+ String txt = extractor.getText();
+ assertContains(txt, "header 2");
+ assertContains(txt, "footer 1");
+ }
+ }
+
+ @Disabled // parts in template not supported in event based
+ @Test
+ void testPartsInTemplateEventBased() throws Exception {
+ try (
+ OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("60316b.dotx");
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)
+ ) {
String txt = extractor.getText();
assertContains(txt, "header 2");
assertContains(txt, "footer 1");
@@ -475,17 +583,33 @@ void testPartsInTemplate() throws IOException {
@Test
void bug55966() throws IOException {
- try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx")) {
+ try (
+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx");
+ XWPFWordExtractor extractedDoc = new XWPFWordExtractor(doc)
+ ) {
String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n" +
"line\n" +
"\n" +
"Content control that is the entire paragraph\n";
- XWPFWordExtractor extractedDoc = new XWPFWordExtractor(doc);
-
String actual = extractedDoc.getText();
+ assertEquals(expected, actual);
+ }
+ }
- extractedDoc.close();
+ @Disabled // extra text found in the event based extractor
+ @Test
+ void bug55966EventBased() throws Exception {
+ try (
+ OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("55966.docx");
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)
+ ) {
+ String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n" +
+ "line\n" +
+ "\n" +
+ "Content control that is the entire paragraph\n";
+
+ String actual = extractor.getText();
assertEquals(expected, actual);
}
}
@@ -499,6 +623,16 @@ void testCapitalizedFlag() throws IOException {
}
}
+ @Disabled // capitalized flag not supported in event based
+ @Test
+ void testCapitalizedFlagEventBased() throws Exception {
+ try (OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage("capitalized.docx");
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)) {
+ String txt = extractor.getText();
+ assertEquals( "The following word is: CAPITALIZED.", txt.trim());
+ }
+ }
+
@Test
void testTika2163() throws IOException {
try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("ChronologicalResume.dotx");
@@ -508,6 +642,18 @@ void testTika2163() throws IOException {
}
}
+ @Test
+ void testTika2163EventBased() throws Exception {
+ final String filename = "ChronologicalResume.dotx";
+ try (
+ OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage(filename);
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)
+ ) {
+ String txt = extractor.getText();
+ assertContains(txt, "but a great-looking résumé doesn’t have to be!");
+ }
+ }
+
@Test
void testTika3816() throws IOException {
try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("tika-3816.docx");
@@ -519,6 +665,19 @@ void testTika3816() throws IOException {
}
}
+ @Disabled // whitespace issue in text
+ @Test
+ void testTika3816EventBased() throws Exception {
+ final String filename = "tika-3816.docx";
+ try (
+ OPCPackage pkg = XWPFTestDataSamples.openSampleOPCPackage(filename);
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(pkg)
+ ) {
+ String txt = extractor.getText();
+ assertContains(txt, "Note\tDetails");
+ }
+ }
+
private static List extractSDTsFromBody(XWPFDocument document) {
XWPFSDT sdt;
XmlCursor xmlcursor = document.getDocument().getBody().newCursor();
diff --git a/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java b/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java
index 483c227c23b..a84d851cb6d 100644
--- a/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java
+++ b/poi-scratchpad/src/main/java/org/apache/poi/hwpf/converter/NumberFormatter.java
@@ -19,9 +19,6 @@
package org.apache.poi.hwpf.converter;
-import java.util.Arrays;
-import java.util.Locale;
-
import org.apache.poi.util.Beta;
/**
@@ -29,75 +26,9 @@
*/
@Beta
public final class NumberFormatter {
- // use char[] instead of String to speed up StringBuilder.append(), especially in JDK 11+
- // where StringBuilder internally switched from char[] to byte[]
- private static final char[][] ROMAN_LETTERS = Arrays.stream(
- new String[] { "m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i" }).
- map(String::toCharArray).
- toArray(char[][]::new);
-
- private static final int[] ROMAN_VALUES = { 1000, 900, 500, 400, 100, 90,
- 50, 40, 10, 9, 5, 4, 1 };
-
- private static final int T_ARABIC = 0;
- private static final int T_LOWER_LETTER = 4;
- private static final int T_LOWER_ROMAN = 2;
- private static final int T_ORDINAL = 5;
- private static final int T_UPPER_LETTER = 3;
- private static final int T_UPPER_ROMAN = 1;
+ // code was moved to org.apache.poi.util.NumberFormatter
public static String getNumber( int num, int style ) {
- switch ( style ) {
- case T_UPPER_ROMAN:
- return toRoman( num ).toUpperCase(Locale.ROOT);
- case T_LOWER_ROMAN:
- return toRoman( num );
- case T_UPPER_LETTER:
- return toLetters( num ).toUpperCase(Locale.ROOT);
- case T_LOWER_LETTER:
- return toLetters( num );
- case T_ARABIC:
- case T_ORDINAL:
- default:
- return String.valueOf( num );
- }
- }
-
- private static String toLetters(int number) {
- if ( number <= 0 ) {
- throw new IllegalArgumentException( "Unsupported number: " + number );
- }
-
- int num = number;
- final int radix = 26;
-
- char[] buf = new char[33];
- int charPos = buf.length;
-
- while (num > 0) {
- num--; // 1 => a, not 0 => a
- int remainder = num % radix;
- buf[--charPos] = (char)('a'+remainder);
- num = (num - remainder) / radix;
- }
-
- return new String(buf, charPos, (buf.length - charPos));
- }
-
- private static String toRoman( int number ) {
- if ( number <= 0 )
- throw new IllegalArgumentException( "Unsupported number: " + number );
-
- StringBuilder result = new StringBuilder();
-
- for ( int i = 0; i < ROMAN_LETTERS.length; i++ ) {
- char[] letter = ROMAN_LETTERS[i];
- int value = ROMAN_VALUES[i];
- while ( number >= value ) {
- number -= value;
- result.append( letter );
- }
- }
- return result.toString();
+ return org.apache.poi.util.NumberFormatter.getNumber(num, style);
}
}
diff --git a/poi/src/main/java/org/apache/poi/util/NumberFormatter.java b/poi/src/main/java/org/apache/poi/util/NumberFormatter.java
new file mode 100644
index 00000000000..f88103d5412
--- /dev/null
+++ b/poi/src/main/java/org/apache/poi/util/NumberFormatter.java
@@ -0,0 +1,131 @@
+/*
+ * ====================================================================
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ */
+
+package org.apache.poi.util;
+
+import java.util.Arrays;
+import java.util.Locale;
+
+/**
+ * Utility class to translate numbers into Arabic digits, Roman numerals or
+ * letters, usually for lists.
+ */
+@Beta
+public final class NumberFormatter {
+    // use char[] instead of String to speed up StringBuilder.append(), especially in JDK 11+
+    // where StringBuilder internally switched from char[] to byte[]
+    private static final char[][] ROMAN_LETTERS = Arrays.stream(
+            new String[] { "m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i" }).
+            map(String::toCharArray).
+            toArray(char[][]::new);
+
+    // value of the ROMAN_LETTERS entry at the same index, largest first
+    private static final int[] ROMAN_VALUES = { 1000, 900, 500, 400, 100, 90,
+            50, 40, 10, 9, 5, 4, 1 };
+
+    // style codes understood by getNumber(int, int)
+    private static final int T_ARABIC = 0;
+    private static final int T_LOWER_LETTER = 4;
+    private static final int T_LOWER_ROMAN = 2;
+    private static final int T_ORDINAL = 5;
+    private static final int T_UPPER_LETTER = 3;
+    private static final int T_UPPER_ROMAN = 1;
+
+    private NumberFormatter() {
+        // static utility class, no instances
+    }
+
+    /**
+     * Formats a number according to a style code.
+     * <p>
+     * Style codes: 0 = Arabic digits, 1 = upper-case Roman, 2 = lower-case Roman,
+     * 3 = upper-case letters, 4 = lower-case letters, 5 = ordinal. Ordinal and
+     * unrecognised codes are rendered as plain Arabic digits.
+     *
+     * @param num the number to format
+     * @param style the style code
+     * @return the formatted number
+     * @throws IllegalArgumentException if a Roman or letter style is requested
+     *         for a number that is zero or negative
+     */
+    public static String getNumber( int num, int style ) {
+        switch ( style ) {
+            case T_UPPER_ROMAN:
+                return toRoman( num ).toUpperCase(Locale.ROOT);
+            case T_LOWER_ROMAN:
+                return toRoman( num );
+            case T_UPPER_LETTER:
+                return toLetters( num ).toUpperCase(Locale.ROOT);
+            case T_LOWER_LETTER:
+                return toLetters( num );
+            case T_ARABIC:
+            case T_ORDINAL:
+            default:
+                return String.valueOf( num );
+        }
+    }
+
+    /**
+     * Converts a positive number to a lower-case letter label:
+     * 1 =&gt; a, 26 =&gt; z, 27 =&gt; aa, and so on.
+     */
+    private static String toLetters(int number) {
+        if ( number <= 0 ) {
+            throw new IllegalArgumentException( "Unsupported number: " + number );
+        }
+
+        int num = number;
+        final int radix = 26;
+
+        // filled from the end; 33 chars is more than enough for any int in base 26
+        char[] buf = new char[33];
+        int charPos = buf.length;
+
+        while (num > 0) {
+            num--; // 1 => a, not 0 => a
+            int remainder = num % radix;
+            buf[--charPos] = (char)('a'+remainder);
+            num = (num - remainder) / radix;
+        }
+
+        return new String(buf, charPos, (buf.length - charPos));
+    }
+
+    /**
+     * Converts a positive number to lower-case Roman numerals by repeatedly
+     * subtracting the largest value that still fits.
+     */
+    private static String toRoman( int number ) {
+        if ( number <= 0 ) {
+            throw new IllegalArgumentException( "Unsupported number: " + number );
+        }
+
+        StringBuilder result = new StringBuilder();
+
+        for ( int i = 0; i < ROMAN_LETTERS.length; i++ ) {
+            char[] letter = ROMAN_LETTERS[i];
+            int value = ROMAN_VALUES[i];
+            while ( number >= value ) {
+                number -= value;
+                result.append( letter );
+            }
+        }
+        return result.toString();
+    }
+}