Index: src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java (working copy) @@ -34,14 +34,14 @@ public final class CHPX extends BytePropertyNode { - public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode) + public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl) { - super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode); + super(fcStart, fcEnd, translator, new SprmBuffer(grpprl)); } - public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode) + public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf) { - super(fcStart, fcEnd, buf, isUnicode); + super(fcStart, fcEnd, translator ,buf); } Index: src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (working copy) @@ -37,7 +37,7 @@ * convertion. * @author Ryan Ackley */ -public final class TextPieceTable +public final class TextPieceTable implements CharIndexTranslator { protected ArrayList _textPieces = new ArrayList(); //int _multiple; @@ -150,31 +150,25 @@ // If they ask off the end, just go with the last one... return lastWas; } - /** - * Is the text at the given byte offset - * unicode, or plain old ascii? 
- * In a very evil fashion, you have to actually - * know this to make sense of character and - * paragraph properties :( - * @param bytePos The character offset to check about - */ + public boolean isUnicodeAtByteOffset(int bytePos) { boolean lastWas = false; - int curByte = 0; + Iterator it = _textPieces.iterator(); while(it.hasNext()) { TextPiece tp = (TextPiece)it.next(); - int nextByte = curByte + tp.bytesLength(); + int curByte = tp.getPieceDescriptor().getFilePosition(); + int pieceEnd = curByte + tp.bytesLength(); // If the text piece covers the character, all good - if(curByte <= bytePos && nextByte >= bytePos) { + if(curByte <= bytePos && pieceEnd > bytePos) { return tp.isUnicode(); } // Otherwise keep track for the last one lastWas = tp.isUnicode(); // Move along - curByte = nextByte; + curByte = pieceEnd; } // If they ask off the end, just go with the last one... @@ -268,4 +262,34 @@ } return false; } + /* (non-Javadoc) + * @see org.apache.poi.hwpf.model.CharIndexTranslator#getCharIndex(int) + */ + public int getCharIndex(int bytePos) { + int charCount = 0; + + Iterator it = _textPieces.iterator(); + while (it.hasNext()) { + TextPiece tp = (TextPiece) it.next(); + int pieceStart = tp.getPieceDescriptor().getFilePosition(); + if(pieceStart >= bytePos) { + break; + } + + int bytesLength = tp.bytesLength(); + int pieceEnd = pieceStart + bytesLength; + + int toAdd = bytePos > pieceEnd ? 
bytesLength : bytesLength + - (pieceEnd - bytePos); + + if (tp.isUnicode()) { + charCount += toAdd / 2; + } else { + charCount += toAdd; + } + } + + return charCount; + } + } Index: src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java (working copy) @@ -119,9 +119,8 @@ public void insert(int listIndex, int cpStart, SprmBuffer buf) { - boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); - CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode); + CHPX insertChpx = new CHPX(0, 0, tpt,buf); // Ensure character offsets are really characters insertChpx.setStart(cpStart); @@ -141,7 +140,7 @@ // Original, until insert at point // New one // Clone of original, on to the old end - CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode); + CHPX clone = new CHPX(0, 0, tpt,chpx.getSprmBuf()); // Again ensure contains character based offsets no matter what clone.setStart(cpStart); clone.setEnd(chpx.getEnd()); Index: src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java (working copy) @@ -62,14 +62,10 @@ public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt) { super(documentStream, offset); - for (int x = 0; x < _crun; x++) { - int startAt = getStart(x) - fcMin; - int endAt = getEnd(x) - fcMin; - boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt); - //System.err.println(startAt + " -> " + endAt + " = " + isUnicode); - - _papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode)); + int 
startAt = getStart(x); + int endAt = getEnd(x); + _papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream)); } _fkp = null; _dataStream = dataStream; Index: src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java (working copy) @@ -40,18 +40,18 @@ private ParagraphHeight _phe; private int _hugeGrpprlOffset = -1; - public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode) + public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream) { - super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode); + super(fcStart, fcEnd, translator, new SprmBuffer(papx)); _phe = phe; SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream); if(buf != null) _buf = buf; } - public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode) + public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream) { - super(fcStart, fcEnd, buf, isUnicode); + super(fcStart, fcEnd, translator, buf); _phe = new ParagraphHeight(); buf = findHuge(buf, dataStream); if(buf != null) Index: src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java (working copy) @@ -61,13 +61,10 @@ int startAt = CPtoFC(node.getStart()); int endAt = CPtoFC(node.getEnd()); - boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt ); -// System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart); - // check for the optimization if (fileOffset == 0xffffffff) { - _sections.add(new 
SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart)); + _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0])); } else { @@ -76,7 +73,7 @@ byte[] buf = new byte[sepxSize]; fileOffset += LittleEndian.SHORT_SIZE; System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); - _sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart)); + _sections.add(new SEPX(sed, startAt, endAt, tpt, buf)); } } @@ -138,33 +135,13 @@ } int FC = TP.getPieceDescriptor().getFilePosition(); int offset = CP - TP.getCP(); - FC = FC+offset-((TextPiece)_text.get(0)).getPieceDescriptor().getFilePosition(); + if (TP.isUnicode()) { + offset = offset*2; + } + FC = FC+offset; return FC; } - // Ryans code - private int FCtoCP(int fc) - { - int size = _text.size(); - int cp = 0; - for (int x = 0; x < size; x++) - { - TextPiece piece = (TextPiece)_text.get(x); - - if (fc <= piece.getEnd()) - { - cp += (fc - piece.getStart()); - break; - } - else - { - cp += (piece.getEnd() - piece.getStart()); - } - } - return cp; - } - - public ArrayList getSections() { return _sections; @@ -205,7 +182,7 @@ // Line using Ryan's FCtoCP() conversion method - // unable to observe any effect on our testcases when using this code - piers - GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray()); + GenericPropertyNode property = new GenericPropertyNode(tpt.getCharIndex(sepx.getStartBytes()), tpt.getCharIndex(sepx.getEndBytes()), sed.toByteArray()); plex.addProperty(property); Index: src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java (working copy) @@ -25,37 +25,28 @@ * and characters. 
*/ public abstract class BytePropertyNode extends PropertyNode { - private boolean isUnicode; + private final int startBytes; + private final int endBytes; /** * @param fcStart The start of the text for this property, in _bytes_ * @param fcEnd The end of the text for this property, in _bytes_ */ - public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) { + public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf) { super( - generateCp(fcStart, isUnicode), - generateCp(fcEnd, isUnicode), + translator.getCharIndex(fcStart), + translator.getCharIndex(fcEnd), buf ); - this.isUnicode = isUnicode; + this.startBytes = fcStart; + this.endBytes = fcEnd; } - private static int generateCp(int val, boolean isUnicode) { - if(isUnicode) - return val/2; - return val; - } - public boolean isUnicode() { - return isUnicode; - } public int getStartBytes() { - if(isUnicode) - return getStart()*2; - return getStart(); + return startBytes; } + public int getEndBytes() { - if(isUnicode) - return getEnd()*2; - return getEnd(); + return endBytes; } } Index: src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java (working copy) @@ -28,9 +28,9 @@ SectionDescriptor _sed; - public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode) + public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl) { - super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode); + super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0)); _sed = sed; } Index: src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java =================================================================== --- 
src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java (working copy) @@ -76,9 +76,8 @@ public void insert(int listIndex, int cpStart, SprmBuffer buf) { - boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); - PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode); + PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream); // Ensure character offsets are really characters forInsert.setStart(cpStart); @@ -108,7 +107,7 @@ // Original, until insert at point // New one // Clone of original, on to the old end - PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode); + PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream); // Again ensure contains character based offsets no matter what clone.setStart(cpStart); clone.setEnd(currentPap.getEnd()); Index: src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java =================================================================== --- src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java (revision 786463) +++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java (working copy) @@ -60,8 +60,9 @@ for (int x = 0; x < _crun; x++) { - boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) ); - _chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode)); + int startAt = getStart(x); + int endAt = getEnd(x); + _chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x))); } }