Link Here
|
76 |
* works for non-complex files |
76 |
* works for non-complex files |
77 |
* |
77 |
* |
78 |
* @author Ryan Ackley |
78 |
* @author Ryan Ackley |
|
|
79 |
* @author Serge Huber |
79 |
*/ |
80 |
*/ |
80 |
|
81 |
|
81 |
public class WordDocument |
82 |
public class WordDocument |
Link Here
|
183 |
int textStart = Utils.convertBytesToInt(_header, 0x18); |
184 |
int textStart = Utils.convertBytesToInt(_header, 0x18); |
184 |
int textEnd = Utils.convertBytesToInt(_header, 0x1c); |
185 |
int textEnd = Utils.convertBytesToInt(_header, 0x1c); |
185 |
ArrayList textPieces = findProperties(textStart, textEnd, _text.root); |
186 |
ArrayList textPieces = findProperties(textStart, textEnd, _text.root); |
|
|
187 |
|
188 |
if (textPieces.size() == 0) { |
189 |
// fall-back to full text extraction, hoping it won't be too messy... |
190 |
TextPiece fullText = new TextPiece(textStart, textEnd - textStart, false); |
191 |
textPieces.add(fullText); |
192 |
} |
193 |
|
186 |
int size = textPieces.size(); |
194 |
int size = textPieces.size(); |
187 |
|
195 |
|
188 |
for(int x = 0; x < size; x++) |
196 |
for(int x = 0; x < size; x++) |
Link Here
|
205 |
} |
213 |
} |
206 |
else |
214 |
else |
207 |
{ |
215 |
{ |
208 |
String sText = new String(_header, start, end-start); |
216 |
// String sText = new String(_header, start, end-start); |
|
|
217 |
String sText = new String(_header, start, end-start, "Cp1252"); |
209 |
out.write(sText); |
218 |
out.write(sText); |
210 |
} |
219 |
} |
211 |
} |
220 |
} |
Link Here
|
221 |
{ |
230 |
{ |
222 |
this(new FileInputStream(fileName)); |
231 |
this(new FileInputStream(fileName)); |
223 |
} |
232 |
} |
224 |
|
233 |
|
225 |
public WordDocument(InputStream inputStream) throws IOException |
234 |
public WordDocument(InputStream inputStream) throws IOException |
226 |
{ |
235 |
{ |
227 |
//do Ole stuff |
236 |
//do Ole stuff |
Link Here
|
264 |
|
273 |
|
265 |
//I call it the header but its also the main document stream |
274 |
//I call it the header but its also the main document stream |
266 |
_header = new byte[headerProps.getSize()]; |
275 |
_header = new byte[headerProps.getSize()]; |
267 |
filesystem.createDocumentInputStream("WordDocument").read(_header); |
276 |
int bytesRead = filesystem.createDocumentInputStream("WordDocument").read(_header); |
|
|
277 |
|
278 |
int fibVersion = LittleEndian.getShort(_header, 0x2) & 0xFFFF; |
279 |
int productVersion = LittleEndian.getShort(_header, 0x4) & 0xFFFF; |
268 |
|
280 |
|
269 |
//Get the information we need from the header |
281 |
//Get the information we need from the header |
270 |
int info = LittleEndian.getShort(_header, 0xa); |
282 |
int info = LittleEndian.getShort(_header, 0xa); |
271 |
|
283 |
|
272 |
_fcMin = LittleEndian.getInt(_header, 0x18); |
284 |
_fcMin = LittleEndian.getInt(_header, 0x18); |
|
|
285 |
int _fcMax = LittleEndian.getInt(_header, 0x1C); |
273 |
_ccpText = LittleEndian.getInt(_header, 0x4c); |
286 |
_ccpText = LittleEndian.getInt(_header, 0x4c); |
274 |
_ccpFtn = LittleEndian.getInt(_header, 0x50); |
287 |
_ccpFtn = LittleEndian.getInt(_header, 0x50); |
|
|
288 |
int _ccpHdd = LittleEndian.getInt(_header, 0x54); |
275 |
|
289 |
|
276 |
int charPLC = LittleEndian.getInt(_header, 0xfa); |
290 |
int charPLC = LittleEndian.getInt(_header, 0xfa); |
277 |
int charPlcSize = LittleEndian.getInt(_header, 0xfe); |
291 |
int charPlcSize = LittleEndian.getInt(_header, 0xfe); |
278 |
int parPLC = LittleEndian.getInt(_header, 0x102); |
292 |
int parPLC = LittleEndian.getInt(_header, 0x102); |
279 |
int parPlcSize = LittleEndian.getInt(_header, 0x106); |
293 |
int parPlcSize = LittleEndian.getInt(_header, 0x106); |
|
|
294 |
|
295 |
int lcbClx = LittleEndian.getInt(_header, 0x1A6); |
296 |
|
280 |
boolean useTable1 = (info & 0x200) != 0; |
297 |
boolean useTable1 = (info & 0x200) != 0; |
|
|
298 |
boolean isComplex = (info & 0x4) != 0; |
281 |
|
299 |
|
282 |
//process the text and formatting properties |
300 |
if (isComplex) { |
283 |
processComplexFile(useTable1, charPLC, charPlcSize, parPLC, parPlcSize); |
301 |
// now let's verify the existence of a table stream |
|
|
302 |
String tablename=null; |
303 |
DocumentEntry tableEntry = null; |
304 |
if (useTable1) { |
305 |
tablename="1Table"; |
306 |
} else { |
307 |
tablename="0Table"; |
308 |
} |
309 |
try { |
310 |
tableEntry = (DocumentEntry) filesystem.getRoot().getEntry( |
311 |
tablename); |
312 |
} catch (FileNotFoundException fnfe) { |
313 |
isComplex = false; |
314 |
} |
315 |
} |
316 |
|
317 |
if ((isComplex)) { |
318 |
//process the text and formatting properties |
319 |
processComplexFile(useTable1, charPLC, charPlcSize, parPLC, |
320 |
parPlcSize); |
321 |
} else { |
322 |
TextPiece piece = new TextPiece(_fcMin, _fcMax - _fcMin, false); |
323 |
_text.add(piece); |
324 |
} |
284 |
} |
325 |
} |
285 |
|
326 |
|
286 |
/** |
327 |
/** |
Link Here
|
328 |
//parse out the text locations |
369 |
//parse out the text locations |
329 |
findText(tableStream, complexOffset); |
370 |
findText(tableStream, complexOffset); |
330 |
//parse out text formatting |
371 |
//parse out text formatting |
331 |
findFormatting(tableStream, charTable, charPlcSize, parTable, parPlcSize); |
372 |
// findFormatting(tableStream, charTable, charPlcSize, parTable, parPlcSize); |
332 |
|
373 |
|
333 |
} |
374 |
} |
334 |
/** |
375 |
/** |