Lines 376-383
Link Here
|
376 |
* <p>Bug #49441 has more on why this is needed</p> |
376 |
* <p>Bug #49441 has more on why this is needed</p> |
377 |
*/ |
377 |
*/ |
378 |
public void guess7BitEncoding() { |
378 |
public void guess7BitEncoding() { |
|
|
379 |
String charset = null; |
379 |
// First choice is a codepage property |
380 |
// First choice is a codepage property |
380 |
for (MAPIProperty prop : new MAPIProperty[] { |
381 |
try{ |
|
|
382 |
charset = getCharsetFromProperties(); |
383 |
} catch (UnsupportedEncodingException e){ |
384 |
//this will only happen if the codepage is < 0 |
385 |
logger.log(POILogger.WARN, e.getMessage()); |
386 |
} |
387 |
|
388 |
// Second choice is a charset on a content type header |
389 |
if (charset == null){ |
390 |
charset = getCharsetFromHeaders(); |
391 |
} |
392 |
|
393 |
// Nothing suitable in the headers, try HTML |
394 |
if (charset == null){ |
395 |
charset = getCharsetFromHtmlBody(); |
396 |
} |
397 |
|
398 |
//At this point, charset is not guaranteed |
399 |
//to be a supported encoding |
400 |
//this can throw an UnsupportedEncodingException |
401 |
if (charset != null){ |
402 |
set7BitEncoding(charset); |
403 |
} |
404 |
} |
405 |
|
406 |
/** |
407 |
* Try to get the charset from the HTML metaheader. |
408 |
* This can return an unsupported encoding or null. |
409 |
* |
410 |
* This is mainly intended for diagnostic purposes. Most clients |
411 |
* will want to call {@link #guess7BitEncoding()} instead of this. |
412 |
* @return a charset extracted from the metaheader or null if no meta-header was found. |
413 |
*/ |
414 |
public String getCharsetFromHtmlBody() { |
415 |
try { |
416 |
String html = getHtmlBody(); |
417 |
if(html != null && html.length() > 0) { |
418 |
// Look for a content type in the meta headers |
419 |
Pattern p = Pattern.compile( |
420 |
"<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\"" |
421 |
); |
422 |
Matcher m = p.matcher(html); |
423 |
if(m.find()) { |
424 |
// Found it! Hand the charset from the meta tag back to the caller |
425 |
String charset = m.group(1); |
426 |
return charset; |
427 |
} |
428 |
} |
429 |
} catch(ChunkNotFoundException e) {} |
430 |
return null; |
431 |
} |
432 |
|
433 |
|
434 |
/** |
435 |
* Try to get the encoding from the headers. |
436 |
* This can return an unsupported encoding or null. |
437 |
* |
438 |
* This is mainly intended for diagnostic purposes. Most clients |
439 |
* will want to call {@link #guess7BitEncoding()} instead of this. |
440 |
* @return the encoding or null if not found |
441 |
*/ |
442 |
public String getCharsetFromHeaders() { |
443 |
try { |
444 |
String[] headers = getHeaders(); |
445 |
if(headers != null && headers.length > 0) { |
446 |
// Look for a content type with a charset |
447 |
Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); |
448 |
|
449 |
for(String header : headers) { |
450 |
if(header.startsWith("Content-Type")) { |
451 |
Matcher m = p.matcher(header); |
452 |
if(m.matches()) { |
453 |
// Found it! Hand the charset back to the caller, unless it is utf-8 |
454 |
String charset = m.group(1); |
455 |
|
456 |
if (!charset.equalsIgnoreCase("utf-8")) { |
457 |
return charset; |
458 |
} |
459 |
} |
460 |
} |
461 |
} |
462 |
} |
463 |
} catch(ChunkNotFoundException e) {} |
464 |
|
465 |
return null; |
466 |
} |
467 |
|
468 |
|
469 |
/** |
470 |
* Try to get the charset from the MESSAGE_CODEPAGE or |
471 |
* the INTERNET_CPID and then convert that to a Java encoding. |
472 |
* |
473 |
* This can return an unsupported encoding or null. |
474 |
* |
475 |
* This is mainly intended for diagnostic purposes. Most clients |
476 |
* will want to call {@link #guess7BitEncoding()} instead of this. |
477 |
* |
478 |
* @return encoding or null if a charset wasn't found. |
479 |
* @throws UnsupportedEncodingException if a codepage ID stored in one of those properties cannot be converted to an encoding |
480 |
*/ |
481 |
public String getCharsetFromProperties() throws UnsupportedEncodingException{ |
482 |
for (MAPIProperty prop : new MAPIProperty[] { |
381 |
MAPIProperty.MESSAGE_CODEPAGE, |
483 |
MAPIProperty.MESSAGE_CODEPAGE, |
382 |
MAPIProperty.INTERNET_CPID |
484 |
MAPIProperty.INTERNET_CPID |
383 |
}) { |
485 |
}) { |
Lines 385-445
Link Here
|
385 |
if (val != null && val.size() > 0) { |
487 |
if (val != null && val.size() > 0) { |
386 |
int codepage = ((LongPropertyValue)val.get(0)).getValue(); |
488 |
int codepage = ((LongPropertyValue)val.get(0)).getValue(); |
387 |
try { |
489 |
try { |
388 |
String encoding = CodePageUtil.codepageToEncoding(codepage, true); |
490 |
return CodePageUtil.codepageToEncoding(codepage, true); |
389 |
set7BitEncoding(encoding); |
|
|
390 |
return; |
391 |
} catch(UnsupportedEncodingException e) { |
491 |
} catch(UnsupportedEncodingException e) { |
392 |
logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, |
492 |
throw new UnsupportedEncodingException( |
393 |
" set for the message via ", prop, ", ignoring"); |
493 |
String.format("%s%d%s%s", "Invalid codepage ID ", |
|
|
494 |
codepage, " set for the message via ", prop.toString())); |
394 |
} |
495 |
} |
395 |
} |
496 |
} |
396 |
} |
497 |
} |
397 |
|
498 |
return null; |
398 |
|
499 |
} |
399 |
// Second choice is a charset on a content type header |
|
|
400 |
try { |
401 |
String[] headers = getHeaders(); |
402 |
if(headers != null && headers.length > 0) { |
403 |
// Look for a content type with a charset |
404 |
Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); |
405 |
|
500 |
|
406 |
for(String header : headers) { |
|
|
407 |
if(header.startsWith("Content-Type")) { |
408 |
Matcher m = p.matcher(header); |
409 |
if(m.matches()) { |
410 |
// Found it! Tell all the string chunks |
411 |
String charset = m.group(1); |
412 |
|
501 |
|
413 |
if (!charset.equalsIgnoreCase("utf-8")) { |
502 |
/** |
414 |
set7BitEncoding(charset); |
|
|
415 |
} |
416 |
return; |
417 |
} |
418 |
} |
419 |
} |
420 |
} |
421 |
} catch(ChunkNotFoundException e) {} |
422 |
|
423 |
// Nothing suitable in the headers, try HTML |
424 |
try { |
425 |
String html = getHmtlBody(); |
426 |
if(html != null && html.length() > 0) { |
427 |
// Look for a content type in the meta headers |
428 |
Pattern p = Pattern.compile( |
429 |
"<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\"" |
430 |
); |
431 |
Matcher m = p.matcher(html); |
432 |
if(m.find()) { |
433 |
// Found it! Tell all the string chunks |
434 |
String charset = m.group(1); |
435 |
set7BitEncoding(charset); |
436 |
return; |
437 |
} |
438 |
} |
439 |
} catch(ChunkNotFoundException e) {} |
440 |
} |
441 |
|
442 |
/** |
443 |
* Many messages store their strings as unicode, which is |
503 |
* Many messages store their strings as unicode, which is |
444 |
* nice and easy. Some use one-byte encodings for their |
504 |
* nice and easy. Some use one-byte encodings for their |
445 |
* strings, but don't easily store the encoding anywhere |
505 |
* strings, but don't easily store the encoding anywhere |