ASF Bugzilla – Attachment 23833 Details for
Bug 46610
[PATCH] Problems accessing documents containing unicode
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
Unicode patch v.2
unicode.patch (text/plain), 13.23 KB, created by
Maxim Valyanskiy
on 2009-06-19 04:58:24 UTC
(
hide
)
Description:
Unicode patch v.2
Filename:
MIME Type:
Creator:
Maxim Valyanskiy
Created:
2009-06-19 04:58:24 UTC
Size:
13.23 KB
patch
obsolete
>Index: src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java (working copy) >@@ -34,14 +34,14 @@ > public final class CHPX extends BytePropertyNode > { > >- public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode) >+ public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl) > { >- super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode); >+ super(fcStart, fcEnd, translator, new SprmBuffer(grpprl)); > } > >- public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode) >+ public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf) > { >- super(fcStart, fcEnd, buf, isUnicode); >+ super(fcStart, fcEnd, translator ,buf); > } > > >Index: src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java (working copy) >@@ -37,7 +37,7 @@ > * convertion. > * @author Ryan Ackley > */ >-public final class TextPieceTable >+public final class TextPieceTable implements CharIndexTranslator > { > protected ArrayList _textPieces = new ArrayList(); > //int _multiple; >@@ -150,31 +150,25 @@ > // If they ask off the end, just go with the last one... > return lastWas; > } >- /** >- * Is the text at the given byte offset >- * unicode, or plain old ascii? >- * In a very evil fashion, you have to actually >- * know this to make sense of character and >- * paragraph properties :( >- * @param bytePos The character offset to check about >- */ >+ > public boolean isUnicodeAtByteOffset(int bytePos) { > boolean lastWas = false; >- int curByte = 0; >+ > > Iterator it = _textPieces.iterator(); > while(it.hasNext()) { > TextPiece tp = (TextPiece)it.next(); >- int nextByte = curByte + tp.bytesLength(); >+ int curByte = tp.getPieceDescriptor().getFilePosition(); >+ int pieceEnd = curByte + tp.bytesLength(); > > // If the text piece covers the character, all good >- if(curByte <= bytePos && nextByte >= bytePos) { >+ if(curByte <= bytePos && pieceEnd > bytePos) { > return tp.isUnicode(); > } > // Otherwise keep track for the last one > lastWas = tp.isUnicode(); > // Move along >- curByte = nextByte; >+ curByte = pieceEnd; > } > > // If they ask off the end, just go with the last one... >@@ -268,4 +262,34 @@ > } > return false; > } >+ /* (non-Javadoc) >+ * @see org.apache.poi.hwpf.model.CharIndexTranslator#getLengthInChars(int) >+ */ >+ public int getCharIndex(int bytePos) { >+ int charCount = 0; >+ >+ Iterator it = _textPieces.iterator(); >+ while (it.hasNext()) { >+ TextPiece tp = (TextPiece) it.next(); >+ int pieceStart = tp.getPieceDescriptor().getFilePosition(); >+ if(pieceStart >= bytePos) { >+ break; >+ } >+ >+ int bytesLength = tp.bytesLength(); >+ int pieceEnd = pieceStart + bytesLength; >+ >+ int toAdd = bytePos > pieceEnd ? bytesLength : bytesLength >+ - (pieceEnd - bytePos); >+ >+ if (tp.isUnicode()) { >+ charCount += toAdd / 2; >+ } else { >+ charCount += toAdd; >+ } >+ } >+ >+ return charCount; >+ } >+ > } >Index: src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java (working copy) >@@ -119,9 +119,8 @@ > > public void insert(int listIndex, int cpStart, SprmBuffer buf) > { >- boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); > >- CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode); >+ CHPX insertChpx = new CHPX(0, 0, tpt,buf); > > // Ensure character offsets are really characters > insertChpx.setStart(cpStart); >@@ -141,7 +140,7 @@ > // Original, until insert at point > // New one > // Clone of original, on to the old end >- CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode); >+ CHPX clone = new CHPX(0, 0, tpt,chpx.getSprmBuf()); > // Again ensure contains character based offsets no matter what > clone.setStart(cpStart); > clone.setEnd(chpx.getEnd()); >Index: src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java (working copy) >@@ -62,14 +62,10 @@ > public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt) > { > super(documentStream, offset); >- > for (int x = 0; x < _crun; x++) { >- int startAt = getStart(x) - fcMin; >- int endAt = getEnd(x) - fcMin; >- boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt); >- //System.err.println(startAt + " -> " + endAt + " = " + isUnicode); >- >- _papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode)); >+ int startAt = getStart(x); >+ int endAt = getEnd(x); >+ _papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream)); > } > _fkp = null; > _dataStream = dataStream; >Index: src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java (working copy) >@@ -40,18 +40,18 @@ > private ParagraphHeight _phe; > private int _hugeGrpprlOffset = -1; > >- public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode) >+ public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream) > { >- super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode); >+ super(fcStart, fcEnd, translator, new SprmBuffer(papx)); > _phe = phe; > SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream); > if(buf != null) > _buf = buf; > } > >- public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode) >+ public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream) > { >- super(fcStart, fcEnd, buf, isUnicode); >+ super(fcStart, fcEnd, translator, buf); > _phe = new ParagraphHeight(); > buf = findHuge(buf, dataStream); > if(buf != null) >Index: src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java (working copy) >@@ -61,13 +61,10 @@ > int startAt = CPtoFC(node.getStart()); > int endAt = CPtoFC(node.getEnd()); > >- boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt ); >-// System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart); >- > // check for the optimization > if (fileOffset == 0xffffffff) > { >- _sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart)); >+ _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0])); > } > else > { >@@ -76,7 +73,7 @@ > byte[] buf = new byte[sepxSize]; > fileOffset += LittleEndian.SHORT_SIZE; > System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); >- _sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart)); >+ _sections.add(new SEPX(sed, startAt, endAt, tpt, buf)); > } > } > >@@ -138,33 +135,13 @@ > } > int FC = TP.getPieceDescriptor().getFilePosition(); > int offset = CP - TP.getCP(); >- FC = FC+offset-((TextPiece)_text.get(0)).getPieceDescriptor().getFilePosition(); >+ if (TP.isUnicode()) { >+ offset = offset*2; >+ } >+ FC = FC+offset; > return FC; > } > >- // Ryans code >- private int FCtoCP(int fc) >- { >- int size = _text.size(); >- int cp = 0; >- for (int x = 0; x < size; x++) >- { >- TextPiece piece = (TextPiece)_text.get(x); >- >- if (fc <= piece.getEnd()) >- { >- cp += (fc - piece.getStart()); >- break; >- } >- else >- { >- cp += (piece.getEnd() - piece.getStart()); >- } >- } >- return cp; >- } >- >- > public ArrayList getSections() > { > return _sections; >@@ -205,7 +182,7 @@ > > // Line using Ryan's FCtoCP() conversion method - > // unable to observe any effect on our testcases when using this code - piers >- GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray()); >+ GenericPropertyNode property = new GenericPropertyNode(tpt.getCharIndex(sepx.getStartBytes()), tpt.getCharIndex(sepx.getEndBytes()), sed.toByteArray()); > > > plex.addProperty(property); >Index: src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java (working copy) >@@ -25,37 +25,28 @@ > * and characters. > */ > public abstract class BytePropertyNode extends PropertyNode { >- private boolean isUnicode; >+ private final int startBytes; >+ private final int endBytes; > > /** > * @param fcStart The start of the text for this property, in _bytes_ > * @param fcEnd The end of the text for this property, in _bytes_ > */ >- public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) { >+ public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf) { > super( >- generateCp(fcStart, isUnicode), >- generateCp(fcEnd, isUnicode), >+ translator.getCharIndex(fcStart), >+ translator.getCharIndex(fcEnd), > buf > ); >- this.isUnicode = isUnicode; >+ this.startBytes = fcStart; >+ this.endBytes = fcEnd; > } >- private static int generateCp(int val, boolean isUnicode) { >- if(isUnicode) >- return val/2; >- return val; >- } > >- public boolean isUnicode() { >- return isUnicode; >- } > public int getStartBytes() { >- if(isUnicode) >- return getStart()*2; >- return getStart(); >+ return startBytes; > } >+ > public int getEndBytes() { >- if(isUnicode) >- return getEnd()*2; >- return getEnd(); >+ return endBytes; > } > } >Index: src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java (working copy) >@@ -28,9 +28,9 @@ > > SectionDescriptor _sed; > >- public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode) >+ public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl) > { >- super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode); >+ super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0)); > _sed = sed; > } > >Index: src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java (working copy) >@@ -76,9 +76,8 @@ > > public void insert(int listIndex, int cpStart, SprmBuffer buf) > { >- boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); > >- PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode); >+ PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream); > > // Ensure character offsets are really characters > forInsert.setStart(cpStart); >@@ -108,7 +107,7 @@ > // Original, until insert at point > // New one > // Clone of original, on to the old end >- PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode); >+ PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream); > // Again ensure contains character based offsets no matter what > clone.setStart(cpStart); > clone.setEnd(currentPap.getEnd()); >Index: src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java >=================================================================== >--- src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java (revision 786463) >+++ src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java (working copy) >@@ -60,8 +60,9 @@ > > for (int x = 0; x < _crun; x++) > { >- boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) ); >- _chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode)); >+ int startAt = getStart(x); >+ int endAt = getEnd(x); >+ _chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x))); > } > } >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 46610
:
23178
|
23179
|
23180
|
23181
|
23184
|
23829
| 23833 |
23834
|
23835