Index: WordExtractor.java =================================================================== --- WordExtractor.java (revision 522571) +++ WordExtractor.java (working copy) @@ -135,73 +135,47 @@ List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); - - TextPiece currentPiece = (TextPiece)textIt.next(); - int currentTextStart = currentPiece.getStart(); - int currentTextEnd = currentPiece.getEnd(); - + + if(!runIt.hasNext()) + return ""; + WordTextBuffer finalTextBuf = new WordTextBuffer(); - - // iterate through all text runs extract the text only if they haven't been - // deleted - while (runIt.hasNext()) - { - CHPX chpx = (CHPX)runIt.next(); - boolean deleted = isDeleted(chpx.getGrpprl()); - if (deleted) - { - continue; - } - - int runStart = chpx.getStart(); - int runEnd = chpx.getEnd(); - - while (runStart >= currentTextEnd) - { - currentPiece = (TextPiece) textIt.next (); - currentTextStart = currentPiece.getStart (); - currentTextEnd = currentPiece.getEnd (); - } - - if (runEnd < currentTextEnd) - { - String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); - finalTextBuf.append(str); - } - else if (runEnd > currentTextEnd) - { - while (runEnd > currentTextEnd) - { - String str = currentPiece.substring(runStart - currentTextStart, - currentTextEnd - currentTextStart); - finalTextBuf.append(str); - if (textIt.hasNext()) - { - currentPiece = (TextPiece) textIt.next (); - currentTextStart = currentPiece.getStart (); - runStart = currentTextStart; - currentTextEnd = currentPiece.getEnd (); - } - else - { - return finalTextBuf.toString(); - } - } - String str = currentPiece.substring(0, runEnd - currentTextStart); - finalTextBuf.append(str); - } - else - { - String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); - if (textIt.hasNext()) - { - currentPiece = (TextPiece) textIt.next(); - currentTextStart = currentPiece.getStart(); - currentTextEnd = currentPiece.getEnd(); - } - finalTextBuf.append(str); - } + + // This code is built to handle all 6 cases of pieces and runs: + // The two cases where there is no overlap. + // The two cases where one is completely contained in the other. + // The two cases where there is partial overlap. + + CHPX currRun = (CHPX) runIt.next(); + outer: + while(textIt.hasNext()) { + TextPiece currPiece = (TextPiece) textIt.next(); + do { + // If all of the current run is after the current piece, go on to the next piece. + if(currRun.getStart() >= currPiece.getEnd()) { + continue outer; + } + // If the current text run isn't deleted and this piece starts before the + // current run ends, there must be some overlap between these objects. + if(!isDeleted(currRun.getGrpprl()) && currPiece.getStart() < currRun.getEnd()) { + int startIndex = Math.max(currRun.getStart() - currPiece.getStart(), 0); + int endIndex = Math.min(currRun.getEnd(), currPiece.getEnd()) - currPiece.getStart(); + String str = currPiece.substring(startIndex, endIndex); + finalTextBuf.append(str); + + // if this run ends after the current piece ends, go on to the next piece + // while still using the current run. + if(currRun.getEnd() >= currPiece.getEnd()) { + continue outer; + } + } + + if(runIt.hasNext()) { + currRun = (CHPX) runIt.next(); + } + }while(runIt.hasNext()); } + return finalTextBuf.toString(); }