View | Details | Raw Unified | Return to bug 56130
Collapse All | Expand All

(-)src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (-50 / +110 lines)
Lines 376-383 Link Here
376
    * <p>Bug #49441 has more on why this is needed</p>
376
    * <p>Bug #49441 has more on why this is needed</p>
377
    */
377
    */
378
   public void guess7BitEncoding() {
378
   public void guess7BitEncoding() {
379
      String charset = null;
379
      // First choice is a codepage property
380
      // First choice is a codepage property
380
      for (MAPIProperty prop : new MAPIProperty[] {
381
      try{
382
          charset = getCharsetFromProperties();
383
      } catch (UnsupportedEncodingException e){
384
          //this will only happen if the codepage is < 0
385
          logger.log(POILogger.WARN, e.getMessage());
386
      }
387
      
388
      // Second choice is a charset on a content type header
389
      if (charset == null){
390
          charset = getCharsetFromHeaders();
391
      }
392
      
393
      // Nothing suitable in the headers, try HTML
394
      if (charset == null){
395
          charset = getCharsetFromHtmlBody();
396
      }
397
      
398
      //At this point, charset is not guaranteed 
399
      //to be a supported encoding
400
      //this can throw an UnsupportedEncodingException
401
      if (charset != null){
402
         set7BitEncoding(charset);
403
      }
404
   }
405
   
406
   /**
407
    * Try to get the charset from the HTML metaheader.
408
    * This can return an unsupported encoding or null.
409
    *
410
    * This is mainly intended for diagnostic purposes.  Most clients
411
    * will want to call {@link #guess7BitEncoding()} instead of this.
412
    * @return a charset extracted from the metaheader or null if no meta-header was found.
413
    */
414
   public String getCharsetFromHtmlBody() {
415
       try {
416
           String html = getHtmlBody();
417
           if(html != null && html.length() > 0) {
418
              // Look for a content type in the meta headers
419
              Pattern p = Pattern.compile(
420
                    "<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
421
              );
422
              Matcher m = p.matcher(html);
423
              if(m.find()) {
424
                 // Found it! Tell all the string chunks
425
                 String charset = m.group(1);
426
                 return charset;
427
              }
428
           }
429
        } catch(ChunkNotFoundException e) {}
430
       return null;
431
   }
432
433
434
   /**
435
    * Try to get the encoding from the headers.
436
    * This can return an unsupported encoding or null.
437
    *
438
    * This is mainly intended for diagnostic purposes.  Most clients
439
    * will want to call {@link #guess7BitEncoding()} instead of this.
440
    * @return the encoding or null if not found
441
    */
442
   public String getCharsetFromHeaders() {
443
      try {
444
          String[] headers = getHeaders();
445
          if(headers != null && headers.length > 0) {
446
             // Look for a content type with a charset
447
             Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
448
449
             for(String header : headers) {
450
                if(header.startsWith("Content-Type")) {
451
                   Matcher m = p.matcher(header);
452
                   if(m.matches()) {
453
                      // Found it! Tell all the string chunks
454
                      String charset = m.group(1);
455
456
                      if (!charset.equalsIgnoreCase("utf-8")) { 
457
                         return charset;
458
                      }
459
                   }
460
                }
461
             }
462
          }
463
       } catch(ChunkNotFoundException e) {}
464
465
      return null;
466
   }
467
468
469
   /**
470
    * Try to get the charset from the MESSAGE_CODEPAGE or 
471
    * the INTERNET_CPID and then convert that to a Java encoding.
472
    *
473
    * This can return an unsupported encoding or null.
474
    *
475
    * This is mainly intended for diagnostic purposes.  Most clients
476
    * will want to call {@link #guess7BitEncoding()} instead of this.
477
    * 
478
    * @return encoding or null if a charset wasn't found.
479
    * @throws UnsupportedEncodingException
480
    */
481
   public String getCharsetFromProperties() throws UnsupportedEncodingException{
482
       for (MAPIProperty prop : new MAPIProperty[] {
381
               MAPIProperty.MESSAGE_CODEPAGE,
483
               MAPIProperty.MESSAGE_CODEPAGE,
382
               MAPIProperty.INTERNET_CPID
484
               MAPIProperty.INTERNET_CPID
383
      }) {
485
      }) {
Lines 385-445 Link Here
385
        if (val != null && val.size() > 0) {
487
        if (val != null && val.size() > 0) {
386
           int codepage = ((LongPropertyValue)val.get(0)).getValue();
488
           int codepage = ((LongPropertyValue)val.get(0)).getValue();
387
           try {
489
           try {
388
               String encoding = CodePageUtil.codepageToEncoding(codepage, true);
490
               return CodePageUtil.codepageToEncoding(codepage, true);
389
               set7BitEncoding(encoding);
390
               return;
391
            } catch(UnsupportedEncodingException e) {
491
            } catch(UnsupportedEncodingException e) {
392
               logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, 
492
               throw new UnsupportedEncodingException(
393
                          " set for the message via ", prop, ", ignoring");
493
                       String.format("%s%d%s%s", "Invalid codepage ID ", 
494
                       codepage, " set for the message via ", prop.toString()));
394
            }
495
            }
395
         }
496
         }
396
      }
497
      }
397
     
498
      return null;
398
       
499
   }
399
      // Second choice is a charset on a content type header
400
      try {
401
         String[] headers = getHeaders();
402
         if(headers != null && headers.length > 0) {
403
            // Look for a content type with a charset
404
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
405
500
406
            for(String header : headers) {
407
               if(header.startsWith("Content-Type")) {
408
                  Matcher m = p.matcher(header);
409
                  if(m.matches()) {
410
                     // Found it! Tell all the string chunks
411
                     String charset = m.group(1);
412
501
413
                     if (!charset.equalsIgnoreCase("utf-8")) { 
502
/**
414
                        set7BitEncoding(charset);
415
                     }
416
                     return;
417
                  }
418
               }
419
            }
420
         }
421
      } catch(ChunkNotFoundException e) {}
422
      
423
      // Nothing suitable in the headers, try HTML
424
      try {
425
         String html = getHmtlBody();
426
         if(html != null && html.length() > 0) {
427
            // Look for a content type in the meta headers
428
            Pattern p = Pattern.compile(
429
                  "<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
430
            );
431
            Matcher m = p.matcher(html);
432
            if(m.find()) {
433
               // Found it! Tell all the string chunks
434
               String charset = m.group(1);
435
               set7BitEncoding(charset);
436
               return;
437
            }
438
         }
439
      } catch(ChunkNotFoundException e) {}
440
   }
441
442
   /**
443
    * Many messages store their strings as unicode, which is
503
    * Many messages store their strings as unicode, which is
444
    *  nice and easy. Some use one-byte encodings for their
504
    *  nice and easy. Some use one-byte encodings for their
445
    *  strings, but don't easily store the encoding anywhere
505
    *  strings, but don't easily store the encoding anywhere

Return to bug 56130