/* ==================================================================== Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==================================================================== */ package org.apache.poi.hssf.eventusermodel.examples; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener; import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener; import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord; import org.apache.poi.hssf.model.HSSFFormulaParser; import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.BlankRecord; import org.apache.poi.hssf.record.BoolErrRecord; import org.apache.poi.hssf.record.BoundSheetRecord; import org.apache.poi.hssf.record.FormulaRecord; import org.apache.poi.hssf.record.LabelRecord; import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NoteRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * A XLS -> CSV processor, that uses the MissingRecordAware * EventModel code to ensure it outputs all columns and rows. * * @author Nick Burch */ public class XLS2CSVmra implements HSSFListener { private int minColumns; private POIFSFileSystem fs; private PrintStream output; private int lastRowNumber; private int lastColumnNumber; /** Should we output the formula, or the value it has? */ private boolean outputFormulaValues = true; /** For parsing Formulas */ private SheetRecordCollectingListener workbookBuildingListener; private HSSFWorkbook stubWorkbook; // Records we pick up as we process private SSTRecord sstRecord; private FormatTrackingHSSFListener formatListener; /** So we known which sheet we're on */ private int sheetIndex = -1; private BoundSheetRecord[] orderedBSRs; private ArrayList boundSheetRecords = new ArrayList(); // For handling formulas with string results private int nextRow; private int nextColumn; private boolean outputNextStringRecord; //List to store value of cells in a row private List rowContent = new ArrayList(); //is the data in cell usefull private boolean validData; //is it the begining of new sheet private boolean isFirstRowFirstColumn; /** * Creates a new XLS -> CSV converter * * @param fs * The POIFSFileSystem to process * @param output * The PrintStream to output the CSV to * @param minColumns * The minimum number of columns to output, or -1 for no minimum */ public XLS2CSVmra(POIFSFileSystem fs, PrintStream output, int minColumns) { this.fs = fs; this.output = output; this.minColumns = minColumns; } /** * Creates a new XLS -> CSV converter * * @param filename * The file to process * @param minColumns * The minimum number of columns to output, or -1 for no minimum * @throws IOException * @throws FileNotFoundException */ public XLS2CSVmra(String filename, int minColumns) throws IOException, FileNotFoundException { this(new POIFSFileSystem(new FileInputStream(filename)), System.out, minColumns); } /** * Initiates the processing of the XLS file to CSV */ public void process() throws IOException { MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this); formatListener = new FormatTrackingHSSFListener(listener); HSSFEventFactory factory = new HSSFEventFactory(); HSSFRequest request = new HSSFRequest(); if (outputFormulaValues) { request.addListenerForAllRecords(formatListener); } else { workbookBuildingListener = new SheetRecordCollectingListener(formatListener); request.addListenerForAllRecords(workbookBuildingListener); } factory.processWorkbookEvents(request, fs); } /** * Main HSSFListener method, processes events, and outputs the * CSV as the file is processed. */ public void processRecord(Record record) { int thisRow = -1; int thisColumn = -1; String thisStr = null; validData = false; switch (record.getSid()) { case BoundSheetRecord.sid: boundSheetRecords.add((BoundSheetRecord)record); break; case BOFRecord.sid: BOFRecord br = (BOFRecord) record; if (br.getType() == BOFRecord.TYPE_WORKSHEET) { // Print last row of previous sheet if (rowContent.size() > 0) { // Print out any missing commas if needed if (minColumns > 0) { // Columns are 0 based if (lastColumnNumber == -1) { lastColumnNumber = 0; } for (int i = lastColumnNumber; i < (minColumns); i++) { rowContent.add(""); } } // print at the End the row printRow(rowContent); if(rowContent.size()>0) output.println(); rowContent = new ArrayList(); } isFirstRowFirstColumn = true; // Create sub workbook if required if (workbookBuildingListener != null && stubWorkbook == null) { stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook(); } // Output the worksheet name // Works by ordering the BSRs by the location of // their BOFRecords, and then knowing that we // process BOFRecords in byte offset order sheetIndex++; if (orderedBSRs == null) { orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords); } output.println(); output.println(orderedBSRs[sheetIndex].getSheetname() + " [" + (sheetIndex + 1) + "]:"); } break; case SSTRecord.sid: sstRecord = (SSTRecord) record; break; case BlankRecord.sid: BlankRecord brec = (BlankRecord) record; thisRow = brec.getRow(); thisColumn = brec.getColumn(); thisStr = ""; break; case BoolErrRecord.sid: BoolErrRecord berec = (BoolErrRecord) record; thisRow = berec.getRow(); thisColumn = berec.getColumn(); thisStr = ""; break; case FormulaRecord.sid: FormulaRecord frec = (FormulaRecord) record; thisRow = frec.getRow(); thisColumn = frec.getColumn(); if (outputFormulaValues) { if (frec.hasCachedResultString()) { // Formula result is a string // This is stored in the next record outputNextStringRecord = true; nextRow = frec.getRow(); nextColumn = frec.getColumn(); } else { thisStr = formatListener.formatNumberDateCell(frec); validData = true; } } else { thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"'; validData = true; } break; case StringRecord.sid: if (outputNextStringRecord) { // String for formula StringRecord srec = (StringRecord) record; thisStr = srec.getString(); thisRow = nextRow; thisColumn = nextColumn; outputNextStringRecord = false; validData = true; } break; case LabelRecord.sid: LabelRecord lrec = (LabelRecord) record; thisRow = lrec.getRow(); thisColumn = lrec.getColumn(); thisStr = '"' + lrec.getValue() + '"'; validData = true; break; case LabelSSTRecord.sid: LabelSSTRecord lsrec = (LabelSSTRecord) record; thisRow = lsrec.getRow(); thisColumn = lsrec.getColumn(); if (sstRecord == null) { thisStr = '"' + "(No SST Record, can't identify string)" + '"'; } else { thisStr = '"' + sstRecord.getString(lsrec.getSSTIndex()).toString() + '"'; validData = true; } break; case NoteRecord.sid: NoteRecord nrec = (NoteRecord) record; thisRow = nrec.getRow(); thisColumn = nrec.getColumn(); // TODO: Find object to match nrec.getShapeId() thisStr = '"' + "(TODO)" + '"'; break; case NumberRecord.sid: NumberRecord numrec = (NumberRecord) record; thisRow = numrec.getRow(); thisColumn = numrec.getColumn(); // Format thisStr = formatListener.formatNumberDateCell(numrec); validData = true; break; case RKRecord.sid: RKRecord rkrec = (RKRecord) record; thisRow = rkrec.getRow(); thisColumn = rkrec.getColumn(); thisStr = '"' + "(TODO)" + '"'; break; default: break; } // Handle missing column if (record instanceof MissingCellDummyRecord) { MissingCellDummyRecord mc = (MissingCellDummyRecord) record; thisRow = mc.getRow(); thisColumn = mc.getColumn(); thisStr = ""; } // handle beginning of a sheet if (isFirstRowFirstColumn) { rowContent = new ArrayList(); isFirstRowFirstColumn = false; lastRowNumber = -1; lastColumnNumber = -1; } // Handle new row if (thisRow != -1 && thisRow != lastRowNumber) { lastColumnNumber = -1; } // Handle end of row if (thisRow > lastRowNumber) { // Print out any missing commas if needed if (minColumns > 0) { // Columns are 0 based if (lastColumnNumber == -1) { lastColumnNumber = 0; } for (int i = lastColumnNumber; i < (minColumns); i++) { rowContent.add(""); } } // We're onto a new row lastColumnNumber = -1; // print at the End the row printRow(rowContent); if(rowContent.size()>0) output.println(); rowContent = new ArrayList(); } if (validData) { rowContent.add(thisStr); } else { if (thisRow != -1 && thisColumn == lastColumnNumber + 1 && !(record instanceof FormulaRecord)) { rowContent.add(""); } else { thisRow = thisColumn = -1; } } // Update column and row count if (thisRow > -1) lastRowNumber = thisRow; if (thisColumn > -1) lastColumnNumber = thisColumn; } public static void main(String[] args) throws Exception { if (args.length < 1) { System.err.println("Use:"); System.err.println(" XLS2CSVmra [min columns]"); System.exit(1); } int minColumns = -1; if (args.length >= 2) { minColumns = Integer.parseInt(args[1]); } XLS2CSVmra xls2csv = new XLS2CSVmra(args[0], minColumns); xls2csv.process(); } private void printRow(List rowContent) { int i; for (i=0;i