Link Here
|
21 |
|
21 |
|
22 |
import org.apache.poi.POIXMLDocument; |
22 |
import org.apache.poi.POIXMLDocument; |
23 |
import org.apache.poi.POIXMLTextExtractor; |
23 |
import org.apache.poi.POIXMLTextExtractor; |
|
|
24 |
import org.apache.poi.POIXMLException; |
24 |
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; |
25 |
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; |
25 |
import org.apache.poi.openxml4j.opc.OPCPackage; |
26 |
import org.apache.poi.openxml4j.opc.OPCPackage; |
26 |
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; |
27 |
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; |
Link Here
|
31 |
import org.apache.poi.xwpf.usermodel.XWPFParagraph; |
32 |
import org.apache.poi.xwpf.usermodel.XWPFParagraph; |
32 |
import org.apache.poi.xwpf.usermodel.XWPFTable; |
33 |
import org.apache.poi.xwpf.usermodel.XWPFTable; |
33 |
import org.apache.xmlbeans.XmlException; |
34 |
import org.apache.xmlbeans.XmlException; |
|
|
35 |
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; |
34 |
|
36 |
|
35 |
/** |
37 |
/** |
36 |
* Helper class to extract text from an OOXML Word file |
38 |
* Helper class to extract text from an OOXML Word file |
Link Here
|
72 |
public String getText() { |
74 |
public String getText() { |
73 |
StringBuffer text = new StringBuffer(); |
75 |
StringBuffer text = new StringBuffer(); |
74 |
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); |
76 |
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); |
75 |
|
77 |
|
76 |
// Start out with all headers |
78 |
// Start out with all headers |
77 |
// TODO - put them in where they're needed |
79 |
extractHeaders(text, hfPolicy); |
78 |
if(hfPolicy.getFirstPageHeader() != null) { |
|
|
79 |
text.append( hfPolicy.getFirstPageHeader().getText() ); |
80 |
} |
81 |
if(hfPolicy.getEvenPageHeader() != null) { |
82 |
text.append( hfPolicy.getEvenPageHeader().getText() ); |
83 |
} |
84 |
if(hfPolicy.getDefaultHeader() != null) { |
85 |
text.append( hfPolicy.getDefaultHeader().getText() ); |
86 |
} |
87 |
|
80 |
|
88 |
// First up, all our paragraph based text |
81 |
// First up, all our paragraph based text |
89 |
Iterator<XWPFParagraph> i = document.getParagraphsIterator(); |
82 |
Iterator<XWPFParagraph> i = document.getParagraphsIterator(); |
90 |
while(i.hasNext()) { |
83 |
while(i.hasNext()) { |
91 |
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( |
84 |
XWPFParagraph paragraph = i.next(); |
92 |
new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks)); |
|
|
93 |
text.append(decorator.getText()+"\n"); |
94 |
} |
95 |
|
85 |
|
|
|
86 |
|
87 |
try { |
88 |
CTSectPr ctSectPr = null; |
89 |
if (paragraph.getCTP().getPPr()!=null) { |
90 |
ctSectPr = paragraph.getCTP().getPPr().getSectPr(); |
91 |
} |
92 |
|
93 |
XWPFHeaderFooterPolicy headerFooterPolicy = null; |
94 |
|
95 |
if (ctSectPr!=null) { |
96 |
headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); |
97 |
|
98 |
extractHeaders(text, headerFooterPolicy); |
99 |
} |
100 |
|
101 |
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( |
102 |
new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks)); |
103 |
text.append(decorator.getText()).append('\n'); |
104 |
|
105 |
if (ctSectPr!=null) { |
106 |
extractFooters(text, headerFooterPolicy); |
107 |
} |
108 |
} catch (IOException e) { |
109 |
throw new POIXMLException(e); |
110 |
} catch (XmlException e) { |
111 |
throw new POIXMLException(e); |
112 |
} |
113 |
} |
114 |
|
96 |
// Then our table based text |
115 |
// Then our table based text |
97 |
Iterator<XWPFTable> j = document.getTablesIterator(); |
116 |
Iterator<XWPFTable> j = document.getTablesIterator(); |
98 |
while(j.hasNext()) { |
117 |
while(j.hasNext()) { |
99 |
text.append(j.next().getText()+"\n"); |
118 |
text.append(j.next().getText()).append('\n'); |
100 |
} |
119 |
} |
101 |
|
120 |
|
102 |
// Finish up with all the footers |
121 |
// Finish up with all the footers |
103 |
// TODO - put them in where they're needed |
122 |
extractFooters(text, hfPolicy); |
104 |
if(hfPolicy.getFirstPageFooter() != null) { |
|
|
105 |
text.append( hfPolicy.getFirstPageFooter().getText() ); |
106 |
} |
107 |
if(hfPolicy.getEvenPageFooter() != null) { |
108 |
text.append( hfPolicy.getEvenPageFooter().getText() ); |
109 |
} |
110 |
if(hfPolicy.getDefaultFooter() != null) { |
111 |
text.append( hfPolicy.getDefaultFooter().getText() ); |
112 |
} |
113 |
|
123 |
|
114 |
return text.toString(); |
124 |
return text.toString(); |
115 |
} |
125 |
} |
|
|
126 |
|
127 |
private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { |
128 |
if(hfPolicy.getFirstPageFooter() != null) { |
129 |
text.append( hfPolicy.getFirstPageFooter().getText() ); |
130 |
} |
131 |
if(hfPolicy.getEvenPageFooter() != null) { |
132 |
text.append( hfPolicy.getEvenPageFooter().getText() ); |
133 |
} |
134 |
if(hfPolicy.getDefaultFooter() != null) { |
135 |
text.append( hfPolicy.getDefaultFooter().getText() ); |
136 |
} |
137 |
} |
138 |
|
139 |
private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { |
140 |
if(hfPolicy.getFirstPageHeader() != null) { |
141 |
text.append( hfPolicy.getFirstPageHeader().getText() ); |
142 |
} |
143 |
if(hfPolicy.getEvenPageHeader() != null) { |
144 |
text.append( hfPolicy.getEvenPageHeader().getText() ); |
145 |
} |
146 |
if(hfPolicy.getDefaultHeader() != null) { |
147 |
text.append( hfPolicy.getDefaultHeader().getText() ); |
148 |
} |
149 |
} |
116 |
} |
150 |
} |