Index: src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java =================================================================== --- src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (revision 1566940) +++ src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (working copy) @@ -376,8 +376,110 @@ *

Bug #49441 has more on why this is needed

*/ public void guess7BitEncoding() { + String charset = null; // First choice is a codepage property - for (MAPIProperty prop : new MAPIProperty[] { + try{ + charset = getCharsetFromProperties(); + } catch (UnsupportedEncodingException e){ + //this will only happen if the codepage is < 0 + logger.log(POILogger.WARN, e.getMessage()); + } + + // Second choice is a charset on a content type header + if (charset == null){ + charset = getCharsetFromHeaders(); + } + + // Nothing suitable in the headers, try HTML + if (charset == null){ + charset = getCharsetFromHtmlBody(); + } + + //At this point, charset is not guaranteed + //to be a supported encoding + //this can throw an UnsupportedEncodingException + if (charset != null){ + set7BitEncoding(charset); + } + } + + /** + * Try to get the charset from the HTML metaheader. + * This can return an unsupported encoding or null. + * + * This is mainly intended for diagnostic purposes. Most clients + * will want to call {@link #guess7BitEncoding()} instead of this. + * @return a charset extracted from the metaheader or null if no meta-header was found. + */ + public String getCharsetFromHtmlBody() { + try { + String html = getHtmlBody(); + if(html != null && html.length() > 0) { + // Look for a content type in the meta headers + Pattern p = Pattern.compile( + " 0) { + // Look for a content type with a charset + Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); + + for(String header : headers) { + if(header.startsWith("Content-Type")) { + Matcher m = p.matcher(header); + if(m.matches()) { + // Found it! Tell all the string chunks + String charset = m.group(1); + + if (!charset.equalsIgnoreCase("utf-8")) { + return charset; + } + } + } + } + } + } catch(ChunkNotFoundException e) {} + + return null; + } + + + /** + * Try to get the charset from the MESSAGE_CODEPAGE or + * the INTERNET_CPID and then convert that to a Java encoding. + * + * This can return an unsupported encoding or null. + * + * This is mainly intended for diagnostic purposes. Most clients + * will want to call {@link #guess7BitEncoding()} instead of this. + * + * @return encoding or null if a charset wasn't found. + * @throws UnsupportedEncodingException + */ + public String getCharsetFromProperties() throws UnsupportedEncodingException{ + for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) { @@ -385,61 +487,19 @@ if (val != null && val.size() > 0) { int codepage = ((LongPropertyValue)val.get(0)).getValue(); try { - String encoding = CodePageUtil.codepageToEncoding(codepage, true); - set7BitEncoding(encoding); - return; + return CodePageUtil.codepageToEncoding(codepage, true); } catch(UnsupportedEncodingException e) { - logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, - " set for the message via ", prop, ", ignoring"); + throw new UnsupportedEncodingException( + String.format("%s%d%s%s", "Invalid codepage ID ", + codepage, " set for the message via ", prop.toString())); } } } - - - // Second choice is a charset on a content type header - try { - String[] headers = getHeaders(); - if(headers != null && headers.length > 0) { - // Look for a content type with a charset - Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); + return null; + } - for(String header : headers) { - if(header.startsWith("Content-Type")) { - Matcher m = p.matcher(header); - if(m.matches()) { - // Found it! Tell all the string chunks - String charset = m.group(1); - if (!charset.equalsIgnoreCase("utf-8")) { - set7BitEncoding(charset); - } - return; - } - } - } - } - } catch(ChunkNotFoundException e) {} - - // Nothing suitable in the headers, try HTML - try { - String html = getHmtlBody(); - if(html != null && html.length() > 0) { - // Look for a content type in the meta headers - Pattern p = Pattern.compile( - "