// DOCParser class:
package resumecrawler.utils.parser;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.poi.hdf.extractor.WordDocument;

/**
 * Parser for Word (.doc) files: extracts the text of the file with POI's
 * hdf {@code WordDocument} and adds it to a Lucene document as a field.
 *
 * @author solaris
 */
public class DOCParser implements Parser {

    /** Logger for extraction failures, named after this class. */
    private static final Logger LOGGER = Logger.getLogger(DOCParser.class.getName());

    /** The Word file this parser reads. */
    File file = null;

    /**
     * Creates a parser for the given file.
     *
     * @param f the Word file to parse
     */
    public DOCParser(File f) {
        file = f;
    }

    /**
     * Creates a parser for the file at the given path.
     *
     * @param file path of the Word file to parse
     */
    public DOCParser(String file) {
        this.file = new File(file);
    }

    /**
     * Extracts the text of the Word file and adds it to {@code doc} as a
     * stored, tokenized field. If extraction fails, the failure is logged and
     * the field is added with empty content.
     *
     * @param fieldName name of the field to add
     * @param doc       Lucene document that receives the field
     */
    public void parse(String fieldName, Document doc) {
        String content = "";
        try {
            WordDocument wd = new WordDocument(file.toString());
            StringWriter docTextWriter = new StringWriter();
            PrintWriter textOut = new PrintWriter(docTextWriter);
            wd.writeAllText(textOut);
            // Make sure everything written through the PrintWriter has reached the buffer.
            textOut.flush();
            content = docTextWriter.toString();
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not read Word document " + file, ex);
        } catch (RuntimeException ex) {
            // The hdf extractor can fail with unchecked exceptions on some files
            // (e.g. NegativeArraySizeException from ListTables.createLVL, see the
            // trace below). Treat that like an I/O failure instead of aborting
            // the caller.
            LOGGER.log(Level.SEVERE, "Could not extract text from Word document " + file, ex);
        }
        doc.add(new Field(fieldName, content, Field.Store.YES, Field.Index.TOKENIZED));
    }
}

/*
Error stack trace:
Exception in thread "main" java.lang.NegativeArraySizeException
    at org.apache.poi.hdf.extractor.data.ListTables.createLVL(ListTables.java:171)
    at org.apache.poi.hdf.extractor.data.ListTables.initLFO(ListTables.java:149)
    at org.apache.poi.hdf.extractor.data.ListTables.<init>(ListTables.java:43)
    at org.apache.poi.hdf.extractor.WordDocument.createListTables(WordDocument.java:1640)
    at org.apache.poi.hdf.extractor.WordDocument.findFormatting(WordDocument.java:365)
    at org.apache.poi.hdf.extractor.WordDocument.processComplexFile(WordDocument.java:292)
    at org.apache.poi.hdf.extractor.WordDocument.readFIB(WordDocument.java:244)
    at org.apache.poi.hdf.extractor.WordDocument.<init>(WordDocument.java:194)
    at org.apache.poi.hdf.extractor.WordDocument.<init>(WordDocument.java:183)
    at resumecrawler.utils.parser.DOCParser.parse(DOCParser.java:37)
    .......
*/
file.toString() returns something like this: /home/solaris/crawler/StoreDocuments/9cbb0d2ab441c5a900b7e072915ba298.doc
You're likely to have much more luck with hwpf than with hdf; hdf is unsupported. For extracting text from Word documents, try org.apache.poi.hwpf.extractor.WordExtractor - http://poi.apache.org/apidocs/org/apache/poi/hwpf/extractor/WordExtractor.html