--- src/ooxml/java/org/apache/poi/xssf/streaming/SXSSFWorkbook.java (revision 8bf3ebfcee2e2799486de4ed8a5e063616b4083a) +++ src/ooxml/java/org/apache/poi/xssf/streaming/SXSSFWorkbook.java (revision ) @@ -17,37 +17,26 @@ package org.apache.poi.xssf.streaming; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipFile; -import java.util.zip.ZipOutputStream; - import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.formula.udf.UDFFinder; -import org.apache.poi.ss.usermodel.CellStyle; -import org.apache.poi.ss.usermodel.CreationHelper; -import org.apache.poi.ss.usermodel.DataFormat; -import org.apache.poi.ss.usermodel.Font; -import org.apache.poi.ss.usermodel.Name; -import org.apache.poi.ss.usermodel.PictureData; +import org.apache.poi.ss.usermodel.*; import org.apache.poi.ss.usermodel.Row.MissingCellPolicy; -import org.apache.poi.ss.usermodel.Sheet; -import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.util.CellRangeAddress; import org.apache.poi.util.TempFile; +import org.apache.poi.xssf.model.DBMappedSharedStringsTable; import org.apache.poi.xssf.model.SharedStringsTable; +import org.apache.poi.xssf.model.SharedStringsTableType; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import java.io.*; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.List; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipOutputStream; + /** * Streaming version of XSSFWorkbook implementing the "BigGridDemo" strategy. * @@ -231,7 +220,38 @@ } } } + /** + * Constructs an workbook from an existing workbook. + *

+ * When a new node is created via createRow() and the total number + * of unflushed records would exceed the specified value, then the + * row with the lowest index value is flushed and cannot be accessed + * via getRow() anymore. + *

+ *

+ * A value of -1 indicates unlimited access. In this case all + * records that have not been flushed by a call to flush() are available + * for random access. + *

+ *

+ * A value of 0 is not allowed because it would flush any newly created row + * without having a chance to specify any cells. + *

+ * + * @param workbook the template workbook + * @param rowAccessWindowSize + * @param compressTmpFiles whether to use gzip compression for temporary files + *@param sharedStringsTableType whether to use default shared string or custom + */ + public SXSSFWorkbook(XSSFWorkbook workbook, int rowAccessWindowSize, boolean compressTmpFiles, SharedStringsTableType sharedStringsTableType) { + this(workbook, rowAccessWindowSize, compressTmpFiles, true); + if (sharedStringsTableType == SharedStringsTableType.LOW_FOOTPRINT_MAP_DB_SST) { + this._sharedStringSource = new DBMappedSharedStringsTable(); + } + } + + /** * Construct an empty workbook and specify the window for row access. *

* When a new node is created via createRow() and the total number @@ -350,11 +370,13 @@ { xis.close(); } - } - else - { + } else { + if (isDBMappedSharedStringsTableEntry(ze)) { + injectSharedStringFromTempFile(zos); + } else { - copyStream(is, zos); - } + copyStream(is, zos); + } + } is.close(); } } @@ -368,6 +390,22 @@ zip.close(); } } + + private boolean isDBMappedSharedStringsTableEntry(ZipEntry ze) { + return ze.getName().equals("xl/sharedStrings.xml") && DBMappedSharedStringsTable.class.isInstance(_sharedStringSource); + } + + private void injectSharedStringFromTempFile(ZipOutputStream zos) throws IOException { + DBMappedSharedStringsTable _sst = (DBMappedSharedStringsTable) _sharedStringSource; + _sst.commit();//creating DBMapped SharedStringsTable.xml file + InputStream is = _sst.getSharedStringInputStream(); //injecting DBMapped SharedStringsTable.xml file in target output + copyStream(is, zos); + is.close(); + if (!_sst.getTemp_shared_string_file().delete()) { + throw new IOException("Could not delete temporary file after processing: " + _sst.getTemp_shared_string_file().getName()); + } + } + private static void copyStream(InputStream in, OutputStream out) throws IOException { byte[] chunk = new byte[1024]; int count; --- src/ooxml/java/org/apache/poi/xssf/model/SharedStringsTableType.java (revision ) +++ src/ooxml/java/org/apache/poi/xssf/model/SharedStringsTableType.java (revision ) @@ -0,0 +1,37 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xssf.model; + +/** + * enum to specify shared strings table to use + */ +public enum SharedStringsTableType { + DEFAULT_SST(SharedStringsTable.class),//in memory shared strings string table + LOW_FOOTPRINT_MAP_DB_SST(DBMappedSharedStringsTable.class); //streaming version low foot print shared strings table + /** + * Defines what object is used to construct instances of this relationship + */ + private Class instance; + + private SharedStringsTableType(Class sharedStringsTableInstance) { + instance = sharedStringsTableInstance; + } + + public Class getInstance() { + return instance; + } +} --- src/ooxml/testcases/org/apache/poi/xssf/streaming/TestDBMappedSharedStringsTableOption.java (revision ) +++ src/ooxml/testcases/org/apache/poi/xssf/streaming/TestDBMappedSharedStringsTableOption.java (revision ) @@ -0,0 +1,216 @@ +package org.apache.poi.xssf.streaming; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.xssf.model.SharedStringsTableType; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.*; +import java.util.Random; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class TestDBMappedSharedStringsTableOption { + //Streaming version of workbook + private SXSSFWorkbook workbook; + + private SXSSFSheet sheet; + + private File outputFile; + public static final String TEST_OUTPUT_DIR = "poi.test.xssf.output.dir"; + + @Before + public void setUp() { + outputFile = new File(System.getProperty(TEST_OUTPUT_DIR), "output.xlsx"); + setupWorkBook(); + setupBlankSheet(); + } + + private void setupWorkBook() { + XSSFWorkbook wb = new XSSFWorkbook(); + workbook = new SXSSFWorkbook(wb, 2, false, SharedStringsTableType.LOW_FOOTPRINT_MAP_DB_SST); + } + + private void setupBlankSheet() { + sheet = (SXSSFSheet) workbook.createSheet("Employee Data"); + } + + @After + public void cleanup() { + outputFile.delete(); + } + + @Test + public void testWrite100UniqueRecordsOf10Char() throws IOException { + int recordCount = 100; + addUniqueRecordsToSheet(0, 100, 10); + writeAndAssertRecord(recordCount); + } + + @Test + public void testWrite1MUniqueRecordsOf100Char() { + int recordCount = 1000000; + addUniqueRecordsToSheet(0, recordCount, 100); + writeAndAssertRecord(recordCount); + } + + @Test + public void testWriteFromTextFile() { + int recordCount = 3; + File textInputFile = new File(System.getProperty(TEST_OUTPUT_DIR), "temp.txt"); + try { + FileWriter w = new FileWriter(textInputFile); + for (int i = 1; i <= recordCount; i++) { + w.write("Line" + i + ",FirstColumn,SecondColumn,ThirdColumn\r\n"); + } + w.close(); + } catch (IOException e) { + } + addRecordsFromFile("temp.txt"); + writeAndAssertRecord(recordCount); + textInputFile.delete(); + } + + @Test + public void testWrite1MRandomRecordsOf10Char() { + int recordCount = 100000; + addRandomRecordsToSheet(0, recordCount, 200000, 10); + writeAndAssertRecord(recordCount); + } + + @Test + public void test1MRecordHavingRepetitiveRecordsOf10Char() { + int recordCount = 1000000; + addUniqueRecordsToSheet(0, 200000, 10); + addUniqueRecordsToSheet(200000, 200000, 10); + addUniqueRecordsToSheet(400000, 200000, 10); + addUniqueRecordsToSheet(600000, 200000, 10); + addUniqueRecordsToSheet(800000, 200000, 10); + writeAndAssertRecord(recordCount); + } + + @Test + public void testWriteAllDuplicateRecord() { + int recordCount = 100000; + addRepeatingRecordsToSheet(recordCount); + writeAndAssertRecord(recordCount); + } + + private void writeAndAssertRecord(int recordCount) { + System.out.print("Started writing....."); + try { + FileOutputStream out = new FileOutputStream(outputFile); + workbook.write(out); + out.close(); + } catch (Exception e) { + assertTrue("Excel File Creation Failed: " + e.getMessage(), false); + } + System.out.println("File creation done...Asserting"); + + //NOTE: all tests can be executed within -Xmx100M by commenting out assertion code below + + try { + assertRows(new XSSFWorkbook(outputFile), recordCount); + } catch (Exception e) { + assertTrue("Invalid Output File: " + e.getMessage(), false); + } + //---- + } + + private void addUniqueRecordsToSheet(int fromRowNum, int numberOfRecords, int constantStringLength) { + System.out.print("adding records to sheet....."); + int i = 0; + String constantString = getStringOf(constantStringLength); + while (i++ < numberOfRecords) { + if (i % 10000 == 0) System.out.print(i + ","); + Row row = sheet.createRow(fromRowNum++); + Object[] objArr = new Object[]{constantString + i}; + int cellNum = 0; + for (Object obj : objArr) { + Cell cell = row.createCell(cellNum++); + if (obj instanceof String) { + cell.setCellValue((String) obj); + } else if (obj instanceof Integer) + cell.setCellValue((Integer) obj); + } + } + } + + private String getStringOf(int length) { + StringBuilder str = new StringBuilder(); + for (int j = 0; j < length; j++) { + str.append("a"); + } + return str.toString(); + } + + private void addRandomRecordsToSheet(int fromRowNum, int numberOfRecords, int recordLength, int constantStringLength) { + int i = 0; + String constantString = getStringOf(constantStringLength); + while (i++ < numberOfRecords) { + if (i % 1000 == 0) System.out.print(i + ","); + Row row = sheet.createRow(fromRowNum++); + Object[] objArr = new Object[]{constantString + new Random().nextInt(recordLength)}; + int cellNum = 0; + for (Object obj : objArr) { + Cell cell = row.createCell(cellNum++); + if (obj instanceof String) + cell.setCellValue((String) obj); + else if (obj instanceof Integer) + cell.setCellValue((Integer) obj); + } + } + } + + private void addRecordsFromFile(String fileName) { + System.out.print("adding records to sheet....."); + try { + int fromRowNum = 0; + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName))); + String line = null; + while ((line = br.readLine()) != null) { + Row row = sheet.createRow(fromRowNum++); + Object[] objArr = line.split(","); + int cellNum = 0; + for (Object obj : objArr) { + Cell cell = row.createCell(cellNum++); + if (obj instanceof String) + cell.setCellValue((String) obj); + else if (obj instanceof Integer) + cell.setCellValue((Integer) obj); + } + } + br.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private void addRepeatingRecordsToSheet(int count) { + int rownum = 0; + int i = 0; + String constantString = getStringOf(10); + while (i++ < count) { + Row row = sheet.createRow(rownum++); + Object[] objArr = new Object[]{constantString}; + int cellnum = 0; + for (Object obj : objArr) { + Cell cell = row.createCell(cellnum++); + if (obj instanceof String) + cell.setCellValue((String) obj); + else if (obj instanceof Integer) + cell.setCellValue((Integer) obj); + } + } + } + + public void assertRows(Workbook wb, int expectedRecordCount) { + assertEquals(expectedRecordCount, wb.getSheetAt(0).getLastRowNum() + 1); + } + +} --- src/ooxml/java/org/apache/poi/xssf/model/DBMappedSharedStringsTable.java (revision ) +++ src/ooxml/java/org/apache/poi/xssf/model/DBMappedSharedStringsTable.java (revision ) @@ -0,0 +1,236 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.xssf.model; + +import org.apache.poi.util.TempFile; +import org.apache.xmlbeans.XmlOptions; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.HTreeMap; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst; + +import javax.xml.stream.XMLStreamException; +import java.io.*; +import java.math.BigInteger; +import java.security.SecureRandom; +import java.util.Collections; + +/** + * SharedStringsTable With Map DB implementation + * To reduce memory footprint of POI’s shared strings table + * It flows data to disk as per availability of memory (Reference) + */ +public class DBMappedSharedStringsTable extends SharedStringsTable implements AutoCloseable { + + /** + * Maps strings and their indexes in the stringVsIndexSTMap map db + */ + private DB stringVsIndexMapDB; + //string vs index map to lookup existing record in stTable look at add entry method + private HTreeMap stringVsIndexSTMap; + /** + * Maps strings and their indexes in the stringVsIndexSTMap map db + */ + private DB indexVsStringMapDB; + //index vs string map to retrieve record with index + private HTreeMap indexVsStringSTMap; + + private final File temp_shared_string_file; + + /** + * An integer representing the total count of strings in the workbook. This count does not + * include any numbers, it counts only the total of text strings in the workbook. + */ + private int count; + + /** + * An integer representing the total count of unique strings in the Shared String Table. + * A string is unique even if it is a copy of another string, but has different formatting applied + * at the character level. + */ + private int uniqueCount; + + private final static XmlOptions options = new XmlOptions(); + + + static { + options.put(XmlOptions.SAVE_INNER); + options.put(XmlOptions.SAVE_AGGRESSIVE_NAMESPACES); + options.put(XmlOptions.SAVE_USE_DEFAULT_NAMESPACE); + options.setSaveImplicitNamespaces(Collections.singletonMap("", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")); + } + + public DBMappedSharedStringsTable() { + super(); + temp_shared_string_file = createTempFile("poi-shared-string-table", ".xml"); + initMapDbBasedSharedStringTableMap(); + } + + public FileInputStream getSharedStringInputStream() throws IOException { + return new FileInputStream(temp_shared_string_file); + } + + public FileOutputStream getSharedStringsTableOutputStream() throws IOException { + return new FileOutputStream(temp_shared_string_file); + } + + public File getTemp_shared_string_file() { + return temp_shared_string_file; + } + + private File createTempFile(String prefix, String suffix) { + try { + return TempFile.createTempFile(prefix, suffix); + } catch (IOException e) { + throw new RuntimeException("Couldn't create required temp file", e); + } + } + + private void initMapDbBasedSharedStringTableMap() { + initStringVsIndexBasedMapDB(); + initIndexVsStringBasedMapDB(); + } + + private void initStringVsIndexBasedMapDB() { + int HARD_REF_CACHE_INITIAL_CAPACITY = 65536;//for HardRef cache it is initial capacity of underlying table (HashMap) Default cache size is 32768 setting it to 65536 + stringVsIndexMapDB = DBMaker.newFileDB(createTempFile("stringVsIndexMapDBFile", "")) + .transactionDisable() + .cacheHardRefEnable() + .cacheSize(HARD_REF_CACHE_INITIAL_CAPACITY) + .deleteFilesAfterClose() + .mmapFileEnablePartial() + .closeOnJvmShutdown().make(); + stringVsIndexSTMap = stringVsIndexMapDB.createHashMap(new BigInteger(130, new SecureRandom()).toString(32)).make(); + } + + private void initIndexVsStringBasedMapDB() { + indexVsStringMapDB = DBMaker.newFileDB(createTempFile("indexVsStringMapDBFile", "")) + .transactionDisable() + .cacheDisable() //caching not required indexVsStringSTMap will be used to write all existing values + .deleteFilesAfterClose() + .mmapFileEnablePartial() + .closeOnJvmShutdown().make(); + indexVsStringSTMap = indexVsStringMapDB.createHashMap(new BigInteger(130, new SecureRandom()).toString(32)).make(); + } + + private String getKey(CTRst st) { + return st.xmlText(options); + } + + /** + * Return an integer representing the total count of strings in the workbook. This count does not + * include any numbers, it counts only the total of text strings in the workbook. + * + * @return the total count of strings in the workbook + */ + @Override + public int getCount() { + return count; + } + + /** + * Returns an integer representing the total count of unique strings in the Shared String Table. + * A string is unique even if it is a copy of another string, but has different formatting applied + * at the character level. + * + * @return the total count of unique strings in the workbook + */ + @Override + public int getUniqueCount() { + return uniqueCount; + } + + /** + * Add an entry to this Shared String table (a new value is appened to the end). + *

+ *

+ * If the Shared String table already contains this CTRst bean, its index is returned. + * Otherwise a new entry is aded. + *

+ * + * @param st the entry to add + * @return index the index of added entry + */ + @Override + public int addEntry(CTRst st) { + String s = getKey(st); + count++; + if (stringVsIndexSTMap.containsKey(s)) { + return stringVsIndexSTMap.get(s); + } + //new unique record + stringVsIndexSTMap.put(s, uniqueCount); + indexVsStringSTMap.put(uniqueCount, s); + return uniqueCount++; + } + + @Override + public void commit() throws IOException { + FileOutputStream sharedStringOutputStream = getSharedStringsTableOutputStream(); + writeTo(sharedStringOutputStream); + sharedStringOutputStream.close(); + } + + @Override + public void close() throws Exception { + stringVsIndexSTMap.clear(); + indexVsStringSTMap.clear(); + stringVsIndexMapDB.close(); + indexVsStringMapDB.close(); + } + + /** + * Write this table out as XML. + * + * @param out The stream to write to. + * @throws java.io.IOException if an error occurs while writing. + */ + public void writeTo(OutputStream out) throws IOException { + //re-create the sst table every time saving a workbook at the end after adding all record using map DB + try { + Writer writer = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); + addDefaultXmlOptions(writer); + if (uniqueCount != 0) { + addStringItems(writer); + addEndDocument(writer); + } + writer.flush(); + } catch (XMLStreamException e) { + throw new RuntimeException("Couldn't write to SharedStringsTable", e); + } + } + + private void addDefaultXmlOptions(Writer writer) throws XMLStreamException, IOException { + writer.write("\r\n"); + String isNoSIElements = uniqueCount == 0 ? "/" : ""; + writer.write(""); + } + + private void addStringItems(Writer writer) throws XMLStreamException, IOException { + for (int i = 0; i < uniqueCount; i++) { + String s = indexVsStringSTMap.get(i); + writer.write(""); + writer.write(s); + writer.write(""); + } + } + + private void addEndDocument(Writer writer) throws XMLStreamException, IOException { + writer.write(""); + } +} --- maven/poi-ooxml.pom (revision 8bf3ebfcee2e2799486de4ed8a5e063616b4083a) +++ maven/poi-ooxml.pom (revision ) @@ -69,5 +69,10 @@ poi-ooxml-schemas @VERSION@ + + org.mapdb + mapdb + 1.0.6 +