commit 4cfd83f2a5ef12eb1d6acfebea3dda52aa926943 Author: maxcom Date: Fri Jul 24 15:15:29 2009 +0400 docx: text extraction from deleted/inserted blocks diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java index 81a4add..4906fcf 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java @@ -18,6 +18,7 @@ package org.apache.poi.xwpf.usermodel; import java.math.BigInteger; import java.util.ArrayList; +import java.util.Arrays; import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlObject; @@ -56,23 +57,24 @@ public class XWPFParagraph { // TODO - replace this with some sort of XPath expression // to directly find all the CTRs, in the right order ArrayList rs = new ArrayList(); - CTR[] tmp; // Get the main text runs - tmp = paragraph.getRArray(); - for (int i = 0; i < tmp.length; i++) { - rs.add(tmp[i]); - } + rs.addAll(Arrays.asList(paragraph.getRArray())); // Not sure quite what these are, but they hold // more text runs CTSdtRun[] sdts = paragraph.getSdtArray(); for (int i = 0; i < sdts.length; i++) { CTSdtContentRun run = sdts[i].getSdtContent(); - tmp = run.getRArray(); - for (int j = 0; j < tmp.length; j++) { - rs.add(tmp[j]); - } + rs.addAll(Arrays.asList(run.getRArray())); + } + + for (CTRunTrackChange c : paragraph.getDelArray()) { + rs.addAll(Arrays.asList(c.getRArray())); + } + + for (CTRunTrackChange c : paragraph.getInsArray()) { + rs.addAll(Arrays.asList(c.getRArray())); } // Get text of the paragraph diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 9a0e648..f35dfb0 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -191,6 +191,13 @@ public class TestXWPFWordExtractor extends TestCase { assertTrue(extractor.getText().contains("XXX")); } + public void testInsertedDeletedText() throws Exception { + XWPFDocument doc = open("delins.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc); + + assertTrue(extractor.getText().contains("pendant worn")); + assertTrue(extractor.getText().contains("extremely well")); + } //TODO use the same logic for opening test files as in HSSFTestDataSamples private XWPFDocument open(String sampleFileName) throws IOException { diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/delins.docx b/src/scratchpad/testcases/org/apache/poi/hwpf/data/delins.docx new file mode 100644 index 0000000..b530691 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/delins.docx differ