--- src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java (working copy) @@ -30,10 +30,10 @@ * @param fcStart The start of the text for this property, in _bytes_ * @param fcEnd The end of the text for this property, in _bytes_ */ - public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) { + public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf, boolean isUnicode) { super( - generateCp(fcStart, isUnicode), - generateCp(fcEnd, isUnicode), + translator.getCharIndex(fcStart), + translator.getCharIndex(fcEnd), buf ); this.isUnicode = isUnicode; @@ -45,14 +45,17 @@ } public boolean isUnicode() { + //XXX Is this method useful? return isUnicode; } public int getStartBytes() { + //XXX That is wrong in most cases! if(isUnicode) return getStart()*2; return getStart(); } public int getEndBytes() { + //XXX That is wrong in most cases! if(isUnicode) return getEnd()*2; return getEnd(); --- src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java (revision 0) +++ src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java (revision 0) @@ -0,0 +1,14 @@ +package org.apache.poi.hwpf.model; + +public interface CharIndexTranslator { + + /** + * Calculates the char index of the given byte index. 
+ * + * @param bytePos the byte position to translate + * @return the char index corresponding to that byte position, + * i.e. the number of chars that precede it + */ + int getCharIndex(int bytePos); + +} --- src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java (working copy) @@ -121,7 +121,7 @@ { boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); - CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode); + CHPX insertChpx = new CHPX(0, 0, tpt,buf, needsToBeUnicode); // Ensure character offsets are really characters insertChpx.setStart(cpStart); @@ -141,7 +141,7 @@ // Original, until insert at point // New one // Clone of original, on to the old end - CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode); + CHPX clone = new CHPX(0, 0, tpt,chpx.getSprmBuf(), needsToBeUnicode); // Again ensure contains character based offsets no matter what clone.setStart(cpStart); clone.setEnd(chpx.getEnd()); --- src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java (working copy) @@ -61,8 +61,10 @@ for (int x = 0; x < _crun; x++) { - boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) ); - _chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode)); + int startAt = getStart(x) - fcMin; + boolean isUnicode = tpt.isUnicodeAtByteOffset( startAt ); + int endAt = getEnd(x) - fcMin; + _chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x), isUnicode)); } } --- src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java (working copy) @@ -37,14 +37,14 @@ public class CHPX extends BytePropertyNode { - public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode) + public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl, boolean isUnicode) { - super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode); + super(fcStart, fcEnd, 
translator, new SprmBuffer(grpprl), isUnicode); } - public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode) + public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, boolean isUnicode) { - super(fcStart, fcEnd, buf, isUnicode); + super(fcStart, fcEnd, translator ,buf, isUnicode); } --- src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java (working copy) @@ -78,7 +78,7 @@ { boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); - PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode); + PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream, needsToBeUnicode); // Ensure character offsets are really characters forInsert.setStart(cpStart); @@ -108,7 +108,7 @@ // Original, until insert at point // New one // Clone of original, on to the old end - PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode); + PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream, needsToBeUnicode); // Again ensure contains character based offsets no matter what clone.setStart(cpStart); clone.setEnd(currentPap.getEnd()); --- src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java (working copy) @@ -70,7 +70,7 @@ boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt); //System.err.println(startAt + " -> " + endAt + " = " + isUnicode); - _papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode)); + _papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode)); } _fkp = null; _dataStream = dataStream; --- src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java (working copy) @@ -43,18 +43,18 @@ private ParagraphHeight _phe; private int _hugeGrpprlOffset = -1; - 
public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode) + public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode) { - super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode); + super(fcStart, fcEnd, translator, new SprmBuffer(papx), isUnicode); _phe = phe; SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream); if(buf != null) _buf = buf; } - public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode) + public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream, boolean isUnicode) { - super(fcStart, fcEnd, buf, isUnicode); + super(fcStart, fcEnd, translator, buf, isUnicode); _phe = new ParagraphHeight(); buf = findHuge(buf, dataStream); if(buf != null) --- src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java (working copy) @@ -67,7 +67,7 @@ // check for the optimization if (fileOffset == 0xffffffff) { - _sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart)); + _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0], isUnicodeAtStart)); } else { @@ -76,7 +76,7 @@ byte[] buf = new byte[sepxSize]; fileOffset += LittleEndian.SHORT_SIZE; System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); - _sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart)); + _sections.add(new SEPX(sed, startAt, endAt, tpt, buf, isUnicodeAtStart)); } } --- src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java (working copy) @@ -31,9 +31,9 @@ SectionDescriptor _sed; - public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode) + public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl, boolean 
isUnicode) { - super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode); + super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode); _sed = sed; } --- src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (revision 738040) +++ src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (working copy) @@ -38,7 +38,7 @@ * convertion. * @author Ryan Ackley */ -public class TextPieceTable +public class TextPieceTable implements CharIndexTranslator { protected ArrayList _textPieces = new ArrayList(); //int _multiple; @@ -269,4 +269,32 @@ } return false; } + /* (non-Javadoc) + * @see org.apache.poi.hwpf.model.CharIndexTranslator#getCharIndex(int) + */ + public int getCharIndex(int bytePos) { + int charCount = 0; + int curByte = 0; + + Iterator it = _textPieces.iterator(); + while (it.hasNext() && curByte < bytePos) { + TextPiece tp = (TextPiece) it.next(); + + int bytesLength = tp.bytesLength(); + int nextByte = curByte + bytesLength; + + int toAdd = bytePos > nextByte ? bytesLength : bytesLength + - (nextByte - bytePos); + + if (tp.isUnicode()) { + charCount += toAdd / 2; + } else { + charCount += toAdd; + } + + curByte = nextByte; + } + + return charCount; + } }