Attachment #24159 for bug #47727

View | Details | Raw Unified | Return to bug 47727
Collapse All | Expand All




	private XWPFHeader defaultHeader;
	private XWPFFooter defaultFooter;
	
        /**
         * Figures out the header/footer policy for the given document
         *  as a whole, and creates any header and footer objects
         *  as required.
         *
         * This is a convenience constructor: it reads the section
         *  properties from the document body and hands them, together
         *  with the document, to the two-argument constructor.
         *
         * @param doc the document whose body-level section properties are used
         * @throws IOException  as declared by the delegated constructor
         * @throws XmlException as declared by the delegated constructor
         */
        public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {
                this(doc, doc.getDocument().getBody().getSectPr());
        }

	/**
	 * Figures out the policy for the given document,
	 *  and creates any header and footer objects
	 *  as required.
	 */
	public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException {
		// Grab what headers and footers have been defined
		// For now, we don't care about different ranges, as it
		//  doesn't seem that .docx properly supports that
		//  feature of the file format yet
		this.doc = doc;
		CTSectPr sectPr = doc.getDocument().getBody().getSectPr();
		for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) {
			// Get the header
			CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i);





import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.POIXMLException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;

import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;

/**
 * Helper class to extract text from an OOXML Word file

	public String getText() {
		StringBuffer text = new StringBuffer();
		XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();

		// Start out with all headers
                extractHeaders(text, hfPolicy);
		if(hfPolicy.getFirstPageHeader() != null) {
			text.append( hfPolicy.getFirstPageHeader().getText() );
		}
		if(hfPolicy.getEvenPageHeader() != null) {
			text.append( hfPolicy.getEvenPageHeader().getText() );
		}
		if(hfPolicy.getDefaultHeader() != null) {
			text.append( hfPolicy.getDefaultHeader().getText() );
		}
		
		// First up, all our paragraph based text
		Iterator<XWPFParagraph> i = document.getParagraphsIterator();
		while(i.hasNext()) {
                        XWPFParagraph paragraph = i.next();
					new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
			text.append(decorator.getText()+"\n");
		}


                        try {
                                CTSectPr ctSectPr = null;
                                if (paragraph.getCTP().getPPr()!=null) {
                                        ctSectPr = paragraph.getCTP().getPPr().getSectPr();
                                }

                                XWPFHeaderFooterPolicy headerFooterPolicy = null;

                                if (ctSectPr!=null) {
                                        headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);

                                        extractHeaders(text, headerFooterPolicy);
                                }

                                XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
                                                new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
                                text.append(decorator.getText()).append('\n');

                                if (ctSectPr!=null) {
                                        extractFooters(text, headerFooterPolicy);
                                }
                        } catch (IOException e) {
                                throw new POIXMLException(e);
                        } catch (XmlException e) {
                                throw new POIXMLException(e);
                        }
                }

		// Then our table based text
		Iterator<XWPFTable> j = document.getTablesIterator();
		while(j.hasNext()) {
                        text.append(j.next().getText()).append('\n');
		}
		
		// Finish up with all the footers
                extractFooters(text, hfPolicy);
		if(hfPolicy.getFirstPageFooter() != null) {
			text.append( hfPolicy.getFirstPageFooter().getText() );
		}
		if(hfPolicy.getEvenPageFooter() != null) {
			text.append( hfPolicy.getEvenPageFooter().getText() );
		}
		if(hfPolicy.getDefaultFooter() != null) {
			text.append( hfPolicy.getDefaultFooter().getText() );
		}
		
		return text.toString();
	}

        /**
         * Appends footer text from the given policy to the buffer.
         *
         * The first-page, even-page and default footers are visited in
         *  that order; any footer the policy reports as null is skipped.
         *
         * @param text     buffer the footer text is appended to
         * @param hfPolicy policy supplying the footers
         */
        private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
                // First-page footer
                if (null != hfPolicy.getFirstPageFooter()) {
                        String firstPageText = hfPolicy.getFirstPageFooter().getText();
                        text.append(firstPageText);
                }

                // Even-page footer
                if (null != hfPolicy.getEvenPageFooter()) {
                        String evenPageText = hfPolicy.getEvenPageFooter().getText();
                        text.append(evenPageText);
                }

                // Default footer
                if (null != hfPolicy.getDefaultFooter()) {
                        String defaultText = hfPolicy.getDefaultFooter().getText();
                        text.append(defaultText);
                }
        }

        /**
         * Appends header text from the given policy to the buffer.
         *
         * The first-page, even-page and default headers are visited in
         *  that order; any header the policy reports as null is skipped.
         *
         * @param text     buffer the header text is appended to
         * @param hfPolicy policy supplying the headers
         */
        private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
                // First-page header
                if (null != hfPolicy.getFirstPageHeader()) {
                        String firstPageText = hfPolicy.getFirstPageHeader().getText();
                        text.append(firstPageText);
                }

                // Even-page header
                if (null != hfPolicy.getEvenPageHeader()) {
                        String evenPageText = hfPolicy.getEvenPageHeader().getText();
                        text.append(evenPageText);
                }

                // Default header
                if (null != hfPolicy.getDefaultHeader()) {
                        String defaultText = hfPolicy.getDefaultHeader().getText();
                        text.append(defaultText);
                }
        }
}

Lines 198-201 Link Here

(-)src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java (+9 lines)
198	assertTrue(extractor.getText().contains("extremely well"));	198	assertTrue(extractor.getText().contains("extremely well"));
199	}	199	}
200		200
		201	public void testParagraphHeader() {
		202	XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Headers.docx");
		203	XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
		204
		205	assertTrue(extractor.getText().contains("Section 1"));
		206	assertTrue(extractor.getText().contains("Section 2"));
		207	assertTrue(extractor.getText().contains("Section 3"));
		208	}
		209
201	}	210	}

Return to bug 47727

Lines 21-26 Link Here

(-)src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java (-26 / +60 lines)
21		21
22	import org.apache.poi.POIXMLDocument;	22	import org.apache.poi.POIXMLDocument;
23	import org.apache.poi.POIXMLTextExtractor;	23	import org.apache.poi.POIXMLTextExtractor;
		24	import org.apache.poi.POIXMLException;
24	import org.apache.poi.openxml4j.exceptions.OpenXML4JException;	25	import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
25	import org.apache.poi.openxml4j.opc.OPCPackage;	26	import org.apache.poi.openxml4j.opc.OPCPackage;
26	import org.apache.poi.xwpf.model.XWPFCommentsDecorator;	27	import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
Lines 31-36 Link Here
31	import org.apache.poi.xwpf.usermodel.XWPFParagraph;	32	import org.apache.poi.xwpf.usermodel.XWPFParagraph;
32	import org.apache.poi.xwpf.usermodel.XWPFTable;	33	import org.apache.poi.xwpf.usermodel.XWPFTable;
33	import org.apache.xmlbeans.XmlException;	34	import org.apache.xmlbeans.XmlException;
		35	import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
34		36
35	/**	37	/**
36	* Helper class to extract text from an OOXML Word file	38	* Helper class to extract text from an OOXML Word file
Lines 72-116 Link Here
72	public String getText() {	74	public String getText() {
73	StringBuffer text = new StringBuffer();	75	StringBuffer text = new StringBuffer();
74	XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();	76	XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
75		77
76	// Start out with all headers	78	// Start out with all headers
77	// TODO - put them in where they're needed	79	extractHeaders(text, hfPolicy);
78	if(hfPolicy.getFirstPageHeader() != null) {
79	text.append( hfPolicy.getFirstPageHeader().getText() );
80	}
81	if(hfPolicy.getEvenPageHeader() != null) {
82	text.append( hfPolicy.getEvenPageHeader().getText() );
83	}
84	if(hfPolicy.getDefaultHeader() != null) {
85	text.append( hfPolicy.getDefaultHeader().getText() );
86	}
87		80
88	// First up, all our paragraph based text	81	// First up, all our paragraph based text
89	Iterator<XWPFParagraph> i = document.getParagraphsIterator();	82	Iterator<XWPFParagraph> i = document.getParagraphsIterator();
90	while(i.hasNext()) {	83	while(i.hasNext()) {
91	XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(	84	XWPFParagraph paragraph = i.next();
92	new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
93	text.append(decorator.getText()+"\n");
94	}
95		85
		86
		87	try {
		88	CTSectPr ctSectPr = null;
		89	if (paragraph.getCTP().getPPr()!=null) {
		90	ctSectPr = paragraph.getCTP().getPPr().getSectPr();
		91	}
		92
		93	XWPFHeaderFooterPolicy headerFooterPolicy = null;
		94
		95	if (ctSectPr!=null) {
		96	headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
		97
		98	extractHeaders(text, headerFooterPolicy);
		99	}
		100
		101	XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
		102	new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
		103	text.append(decorator.getText()).append('\n');
		104
		105	if (ctSectPr!=null) {
		106	extractFooters(text, headerFooterPolicy);
		107	}
		108	} catch (IOException e) {
		109	throw new POIXMLException(e);
		110	} catch (XmlException e) {
		111	throw new POIXMLException(e);
		112	}
		113	}
		114
96	// Then our table based text	115	// Then our table based text
97	Iterator<XWPFTable> j = document.getTablesIterator();	116	Iterator<XWPFTable> j = document.getTablesIterator();
98	while(j.hasNext()) {	117	while(j.hasNext()) {
99	text.append(j.next().getText()+"\n");	118	text.append(j.next().getText()).append('\n');
100	}	119	}
101		120
102	// Finish up with all the footers	121	// Finish up with all the footers
103	// TODO - put them in where they're needed	122	extractFooters(text, hfPolicy);
104	if(hfPolicy.getFirstPageFooter() != null) {
105	text.append( hfPolicy.getFirstPageFooter().getText() );
106	}
107	if(hfPolicy.getEvenPageFooter() != null) {
108	text.append( hfPolicy.getEvenPageFooter().getText() );
109	}
110	if(hfPolicy.getDefaultFooter() != null) {
111	text.append( hfPolicy.getDefaultFooter().getText() );
112	}
113		123
114	return text.toString();	124	return text.toString();
115	}	125	}
		126
		127	private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
		128	if(hfPolicy.getFirstPageFooter() != null) {
		129	text.append( hfPolicy.getFirstPageFooter().getText() );
		130	}
		131	if(hfPolicy.getEvenPageFooter() != null) {
		132	text.append( hfPolicy.getEvenPageFooter().getText() );
		133	}
		134	if(hfPolicy.getDefaultFooter() != null) {
		135	text.append( hfPolicy.getDefaultFooter().getText() );
		136	}
		137	}
		138
		139	private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
		140	if(hfPolicy.getFirstPageHeader() != null) {
		141	text.append( hfPolicy.getFirstPageHeader().getText() );
		142	}
		143	if(hfPolicy.getEvenPageHeader() != null) {
		144	text.append( hfPolicy.getEvenPageHeader().getText() );
		145	}
		146	if(hfPolicy.getDefaultHeader() != null) {
		147	text.append( hfPolicy.getDefaultHeader().getText() );
		148	}
		149	}
116	}	150	}

Lines 83-101 Link Here

(-)src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java (-3 / +10 lines)
83	private XWPFHeader defaultHeader;	83	private XWPFHeader defaultHeader;
84	private XWPFFooter defaultFooter;	84	private XWPFFooter defaultFooter;
85		85
86		86	/**
		87	* Figures out the policy for the given document,
		88	* and creates any header and footer objects
		89	* as required.
		90	*/
		91	public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {
		92	this(doc, doc.getDocument().getBody().getSectPr());
		93	}
		94
87	/**	95	/**
88	* Figures out the policy for the given document,	96	* Figures out the policy for the given document,
89	* and creates any header and footer objects	97	* and creates any header and footer objects
90	* as required.	98	* as required.
91	*/	99	*/
92	public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {	100	public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException {
93	// Grab what headers and footers have been defined	101	// Grab what headers and footers have been defined
94	// For now, we don't care about different ranges, as it	102	// For now, we don't care about different ranges, as it
95	// doesn't seem that .docx properly supports that	103	// doesn't seem that .docx properly supports that
96	// feature of the file format yet	104	// feature of the file format yet
97	this.doc = doc;	105	this.doc = doc;
98	CTSectPr sectPr = doc.getDocument().getBody().getSectPr();
99	for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) {	106	for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) {
100	// Get the header	107	// Get the header
101	CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i);	108	CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i);