Bug 45223

Summary:	NegativeArraySizeException in WordDocument constructor.
Product:	POI	Reporter:	Andrei V Maksimov <maksimov.andrei>
Component:	HDF	Assignee:	POI Developers List <dev>
Status:	RESOLVED WONTFIX
Severity:	normal
Priority:	P2
Version:	3.0-FINAL
Target Milestone:	---
Hardware:	PC
OS:	Linux

Description Andrei V Maksimov 2008-06-17 14:47:05 UTC

DOCParser class: 

package resumecrawler.utils.parser;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.poi.hdf.extractor.WordDocument;

/**
 * {@link Parser} that extracts the text of a Word file with POI's HDF
 * {@code WordDocument} and adds it to a Lucene {@link Document} as a single
 * stored, tokenized field.
 *
 * @author solaris
 */
public class DOCParser implements Parser {
    /** Word file whose text is extracted by {@link #parse(String, Document)}. */
    File file = null;

    /**
     * Creates a parser for the given file.
     *
     * @param f the Word file to read
     */
    public DOCParser(File f) {
        file = f;
    }

    /**
     * Creates a parser for the file at the given path.
     *
     * @param file path of the Word file to read
     */
    public DOCParser(String file) {
        this.file = new File(file);
    }

    /**
     * Extracts all text from the Word file and adds it to {@code doc} under
     * {@code fieldName}.
     *
     * <p>If an {@link IOException} occurs, it is logged and the field is added
     * with empty content. Unchecked exceptions raised while constructing or
     * reading the {@code WordDocument} are not caught here and propagate to
     * the caller.
     *
     * @param fieldName name of the Lucene field that receives the text
     * @param doc       Lucene document the field is added to
     */
    public void parse(String fieldName, Document doc) {
        String content = "";
        try {
            WordDocument wd = new WordDocument(file.toString());
            StringWriter docTextWriter = new StringWriter();
            PrintWriter out = new PrintWriter(docTextWriter);
            wd.writeAllText(out);
            // Push everything through to the StringWriter before reading it back.
            out.flush();
            docTextWriter.close();
            content = docTextWriter.toString();
        } catch (IOException ex) {
            // Log under this class (not PDFParser) and say which file failed.
            Logger.getLogger(DOCParser.class.getName())
                    .log(Level.SEVERE, "Failed to extract text from " + file, ex);
        }
        doc.add(new Field(fieldName, content, Field.Store.YES, Field.Index.TOKENIZED));
    }
}

Error stack trace:
Exception in thread "main" java.lang.NegativeArraySizeException
        at org.apache.poi.hdf.extractor.data.ListTables.createLVL(ListTables.java:171)
        at org.apache.poi.hdf.extractor.data.ListTables.initLFO(ListTables.java:149)
        at org.apache.poi.hdf.extractor.data.ListTables.<init>(ListTables.java:43)
        at org.apache.poi.hdf.extractor.WordDocument.createListTables(WordDocument.java:1640)
        at org.apache.poi.hdf.extractor.WordDocument.findFormatting(WordDocument.java:365)
        at org.apache.poi.hdf.extractor.WordDocument.processComplexFile(WordDocument.java:292)
        at org.apache.poi.hdf.extractor.WordDocument.readFIB(WordDocument.java:244)
        at org.apache.poi.hdf.extractor.WordDocument.<init>(WordDocument.java:194)
        at org.apache.poi.hdf.extractor.WordDocument.<init>(WordDocument.java:183)
        at resumecrawler.utils.parser.DOCParser.parse(DOCParser.java:37)
.......

file.toString() returns something like this: /home/solaris/crawler/StoreDocuments/9cbb0d2ab441c5a900b7e072915ba298.doc

Comment 1 Nick Burch 2008-06-19 04:43:01 UTC

You're likely to have much more luck with HWPF than with HDF; HDF is unsupported.

For Word text extraction, try org.apache.poi.hwpf.extractor.WordExtractor -
http://poi.apache.org/apidocs/org/apache/poi/hwpf/extractor/WordExtractor.html