ASF Bugzilla – Attachment 24159 Details for
Bug 47727
[PATCH] XWPFWordExtractor does not extract some headers/footers
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
patch
headfoot.patch (text/plain), 7.42 KB, created by
Maxim Valyanskiy
on 2009-08-24 06:57:56 UTC
(hide)
Description:
patch
Filename:
MIME Type:
Creator:
Maxim Valyanskiy
Created:
2009-08-24 06:57:56 UTC
Size:
7.42 KB
patch
obsolete
>Index: src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java >=================================================================== >--- src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java (revision 807220) >+++ src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java (working copy) >@@ -83,19 +83,26 @@ > private XWPFHeader defaultHeader; > private XWPFFooter defaultFooter; > >- >+ /** >+ * Figures out the policy for the given document, >+ * and creates any header and footer objects >+ * as required. >+ */ >+ public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException { >+ this(doc, doc.getDocument().getBody().getSectPr()); >+ } >+ > /** > * Figures out the policy for the given document, > * and creates any header and footer objects > * as required. > */ >- public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException { >+ public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException { > // Grab what headers and footers have been defined > // For now, we don't care about different ranges, as it > // doesn't seem that .docx properly supports that > // feature of the file format yet > this.doc = doc; >- CTSectPr sectPr = doc.getDocument().getBody().getSectPr(); > for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) { > // Get the header > CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i); >Index: src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java >=================================================================== >--- src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java (revision 807220) >+++ src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java (working copy) >@@ -21,6 +21,7 @@ > > import org.apache.poi.POIXMLDocument; > import org.apache.poi.POIXMLTextExtractor; >+import org.apache.poi.POIXMLException; > import org.apache.poi.openxml4j.exceptions.OpenXML4JException; > import 
org.apache.poi.openxml4j.opc.OPCPackage; > import org.apache.poi.xwpf.model.XWPFCommentsDecorator; >@@ -31,6 +32,7 @@ > import org.apache.poi.xwpf.usermodel.XWPFParagraph; > import org.apache.poi.xwpf.usermodel.XWPFTable; > import org.apache.xmlbeans.XmlException; >+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; > > /** > * Helper class to extract text from an OOXML Word file >@@ -72,45 +74,77 @@ > public String getText() { > StringBuffer text = new StringBuffer(); > XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); >- >+ > // Start out with all headers >- // TODO - put them in where they're needed >- if(hfPolicy.getFirstPageHeader() != null) { >- text.append( hfPolicy.getFirstPageHeader().getText() ); >- } >- if(hfPolicy.getEvenPageHeader() != null) { >- text.append( hfPolicy.getEvenPageHeader().getText() ); >- } >- if(hfPolicy.getDefaultHeader() != null) { >- text.append( hfPolicy.getDefaultHeader().getText() ); >- } >+ extractHeaders(text, hfPolicy); > > // First up, all our paragraph based text > Iterator<XWPFParagraph> i = document.getParagraphsIterator(); > while(i.hasNext()) { >- XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( >- new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks)); >- text.append(decorator.getText()+"\n"); >- } >+ XWPFParagraph paragraph = i.next(); > >+ >+ try { >+ CTSectPr ctSectPr = null; >+ if (paragraph.getCTP().getPPr()!=null) { >+ ctSectPr = paragraph.getCTP().getPPr().getSectPr(); >+ } >+ >+ XWPFHeaderFooterPolicy headerFooterPolicy = null; >+ >+ if (ctSectPr!=null) { >+ headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); >+ >+ extractHeaders(text, headerFooterPolicy); >+ } >+ >+ XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( >+ new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks)); >+ text.append(decorator.getText()).append('\n'); >+ >+ if (ctSectPr!=null) { >+ extractFooters(text, headerFooterPolicy); >+ } >+ } catch 
(IOException e) { >+ throw new POIXMLException(e); >+ } catch (XmlException e) { >+ throw new POIXMLException(e); >+ } >+ } >+ > // Then our table based text > Iterator<XWPFTable> j = document.getTablesIterator(); > while(j.hasNext()) { >- text.append(j.next().getText()+"\n"); >+ text.append(j.next().getText()).append('\n'); > } > > // Finish up with all the footers >- // TODO - put them in where they're needed >- if(hfPolicy.getFirstPageFooter() != null) { >- text.append( hfPolicy.getFirstPageFooter().getText() ); >- } >- if(hfPolicy.getEvenPageFooter() != null) { >- text.append( hfPolicy.getEvenPageFooter().getText() ); >- } >- if(hfPolicy.getDefaultFooter() != null) { >- text.append( hfPolicy.getDefaultFooter().getText() ); >- } >+ extractFooters(text, hfPolicy); > > return text.toString(); > } >+ >+ private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { >+ if(hfPolicy.getFirstPageFooter() != null) { >+ text.append( hfPolicy.getFirstPageFooter().getText() ); >+ } >+ if(hfPolicy.getEvenPageFooter() != null) { >+ text.append( hfPolicy.getEvenPageFooter().getText() ); >+ } >+ if(hfPolicy.getDefaultFooter() != null) { >+ text.append( hfPolicy.getDefaultFooter().getText() ); >+ } >+ } >+ >+ private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { >+ if(hfPolicy.getFirstPageHeader() != null) { >+ text.append( hfPolicy.getFirstPageHeader().getText() ); >+ } >+ if(hfPolicy.getEvenPageHeader() != null) { >+ text.append( hfPolicy.getEvenPageHeader().getText() ); >+ } >+ if(hfPolicy.getDefaultHeader() != null) { >+ text.append( hfPolicy.getDefaultHeader().getText() ); >+ } >+ } > } >Index: src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java >=================================================================== >--- src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java (revision 807220) >+++ src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java (working 
copy) >@@ -198,4 +198,13 @@ > assertTrue(extractor.getText().contains("extremely well")); > } > >+ public void testParagraphHeader() { >+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Headers.docx"); >+ XWPFWordExtractor extractor = new XWPFWordExtractor(doc); >+ >+ assertTrue(extractor.getText().contains("Section 1")); >+ assertTrue(extractor.getText().contains("Section 2")); >+ assertTrue(extractor.getText().contains("Section 3")); >+ } >+ > }
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 47727
: 24159 |
24160