Index: src/ooxml/java/org/apache/poi/util/StAXHelper.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- src/ooxml/java/org/apache/poi/util/StAXHelper.java	(revision )
+++ src/ooxml/java/org/apache/poi/util/StAXHelper.java	(revision )
@@ -0,0 +1,115 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.InputStream;
+import java.lang.reflect.Method;
+
+/**
+ * Provides handy methods for working with StAX readers.
+ *
+ * Every reader is produced by one shared {@link XMLInputFactory}, configured
+ * once when this class is initialised: namespace aware, non-validating, with
+ * DTD support and external entities switched off, and with a Xerces security
+ * manager installed where the parser accepts one.
+ */
+public final class StAXHelper {
+    private static final POILogger logger = POILogFactory.getLogger(StAXHelper.class);
+
+    /** Property through which Xerces-derived parsers accept a security manager. */
+    private static final String SECURITY_MANAGER_PROPERTY =
+            "http://apache.org/xml/properties/security-manager";
+
+    /** Entity expansion limit handed to the security manager. */
+    private static final int ENTITY_EXPANSION_LIMIT = 4096;
+
+    private static final XMLInputFactory staxFactory;
+    static {
+        staxFactory = XMLInputFactory.newInstance();
+        trySetStAXProperty(staxFactory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
+        trySetStAXProperty(staxFactory, XMLInputFactory.IS_VALIDATING, false);
+        // Standard XXE hardening: refuse DTDs as well as external entities
+        trySetStAXProperty(staxFactory, XMLInputFactory.SUPPORT_DTD, false);
+        trySetStAXProperty(staxFactory, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+        //TODO: do we need an ignoring xmlresolver?
+        trySetXercesSecurityManager(staxFactory);
+    }
+
+    private StAXHelper() {}
+
+    /**
+     * Creates a new StAX XMLReader, with sensible defaults.
+     *
+     * Closing the returned reader does not close {@code is}; the caller
+     * remains responsible for closing the stream.
+     *
+     * @param is the stream holding the XML to read
+     * @return a new reader over the stream
+     * @throws XMLStreamException if the factory cannot create a reader for the stream
+     */
+    public static synchronized XMLStreamReader newXMLStreamReader(InputStream is) throws XMLStreamException {
+        return staxFactory.createXMLStreamReader(is);
+    }
+
+    /**
+     * Sets a boolean property on the factory, logging rather than failing
+     * when the underlying parser rejects it.
+     */
+    private static void trySetStAXProperty(XMLInputFactory xmlInputFactory, String property, boolean enabled) {
+        try {
+            xmlInputFactory.setProperty(property, enabled);
+        } catch (Exception e) {
+            logger.log(POILogger.WARN, "StAX Property unsupported: ", property, e);
+        } catch (AbstractMethodError ame) {
+            logger.log(POILogger.WARN, "Cannot set StAX feature because outdated XML parser in classpath: ",
+                    property, ame);
+        }
+    }
+
+    /**
+     * Installs a Xerces security manager capping entity expansion, trying the
+     * JVM built-in class first and the standalone Xerces one second. Failures
+     * are logged, never thrown, since this runs from the static initializer.
+     */
+    private static void trySetXercesSecurityManager(XMLInputFactory xmlInputFactory) {
+        // Try built-in JVM one first, standalone if not
+        for (String securityManagerClassName : new String[] {
+                "com.sun.org.apache.xerces.internal.util.SecurityManager",
+                "org.apache.xerces.util.SecurityManager"
+        }) {
+            try {
+                Object mgr = Class.forName(securityManagerClassName).newInstance();
+                Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
+                setLimit.invoke(mgr, ENTITY_EXPANSION_LIMIT);
+                xmlInputFactory.setProperty(SECURITY_MANAGER_PROPERTY, mgr);
+                // Stop once one can be setup without error
+                return;
+            } catch (Exception e) {
+                logger.log(POILogger.WARN, "StAX Security Manager could not be setup: ",
+                        securityManagerClassName, e);
+            } catch (LinkageError le) {
+                // NoClassDefFoundError / AbstractMethodError must not abort class initialisation
+                logger.log(POILogger.WARN, "StAX Security Manager could not be setup: ",
+                        securityManagerClassName, le);
+            }
+        }
+    }
+}
Index: src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java	(date 1456341576000)
+++ src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java	(revision )
@@ -18,6 +18,9 @@
 
 import static org.apache.poi.POIXMLTypeLoader.DEFAULT_XML_OPTIONS;
 
+import javax.xml.namespace.QName;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -32,8 +35,6 @@
 import java.util.List;
 import java.util.Map;
 
-import javax.xml.namespace.QName;
-
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLDocumentPart;
 import org.apache.poi.POIXMLException;
@@ -55,6 +56,7 @@
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 import org.apache.poi.util.PackageHelper;
+import org.apache.poi.util.StAXHelper;
 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
 import org.apache.xmlbeans.XmlCursor;
 import org.apache.xmlbeans.XmlException;
@@ -161,7 +163,19 @@
     @Override
     protected void onDocumentRead() throws IOException {
         try {
-            DocumentDocument doc = DocumentDocument.Factory.parse(getPackagePart().getInputStream(), DEFAULT_XML_OPTIONS);
+            // XMLStreamReader.close() leaves the underlying stream open, so both are closed here
+            InputStream documentStream = getPackagePart().getInputStream();
+            DocumentDocument doc;
+            try {
+                XMLStreamReader reader = StAXHelper.newXMLStreamReader(documentStream);
+                try {
+                    doc = DocumentDocument.Factory.parse(reader, DEFAULT_XML_OPTIONS);
+                } finally {
+                    reader.close();
+                }
+            } finally {
+                documentStream.close();
+            }
             ctDocument = doc.getDocument();
 
             initFootnotes();
@@ -238,6 +252,8 @@
             }
             initHyperlinks();
         } catch (XmlException e) {
+            throw new POIXMLException(e);
+        } catch (XMLStreamException e) {
             throw new POIXMLException(e);
         }
     }
Index: src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java	(date 1456341576000)
+++ src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java	(revision )
@@ -409,7 +409,6 @@
 
     public void testCheckboxes() throws IOException {
         XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("checkboxes.docx");
-        System.out.println(doc);
         XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
 
         assertEquals("This is a small test for checkboxes \nunchecked: |_| \n" +
@@ -417,6 +416,15 @@
                 "Test a checkbox within a textbox: |_| -> |X|\n\n\n" +
                 "In Table:\n|_|\t|X|\n\n\n" +
                 "In Sequence:\n|X||_||X|\n", extractor.getText());
+        extractor.close();
+    }
+
+    public void testBug57031() throws Exception {
+        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("57031.docx");
+        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+
+        // Check it gives text without error
+        extractor.getText();
         extractor.close();
     }
 }