--- src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java (revision 955183)
+++ src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java (working copy)
@@ -133,7 +133,13 @@
                 while (c.toNextSelection()) {
                     XmlObject o = c.getObject();
                     if (o instanceof CTText) {
-                        text.append(((CTText) o).getStringValue());
+                        String tagName = o.getDomNode().getNodeName();
+                        // field codes (w:instrText, defined in spec sec. 17.16.23)
+                        // come up as instances of CTText, but they only
+                        // pollute the output
+                        if (!"w:instrText".equals(tagName)) {
+                            text.append(((CTText) o).getStringValue());
+                        }
                     }
                     if (o instanceof CTPTab) {
                         text.append("\t");
--- src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java (revision 955183)
+++ src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java (working copy)
@@ -237,4 +237,17 @@
         // Now check the first paragraph in total
         assertTrue(extractor.getText().contains("a\tb\n"));
     }
+
+    /**
+     * The output should not contain field codes, e.g. those specified in the
+     * w:instrText tag (spec sec. 17.16.23)
+     */
+    public void testNoFieldCodes() {
+        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("FieldCodes.docx");
+        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+        String text = extractor.getText();
+        assertTrue(text.length() > 0);
+        assertFalse(text.contains("AUTHOR"));
+        assertFalse(text.contains("CREATEDATE"));
+    }
 }