poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From n...@apache.org
Subject svn commit: r1087734 - in /poi/trunk/src: documentation/content/xdocs/ scratchpad/src/org/apache/poi/hsmf/datatypes/ scratchpad/src/org/apache/poi/hsmf/extractor/ scratchpad/testcases/org/apache/poi/hsmf/extractor/
Date Fri, 01 Apr 2011 15:02:14 GMT
Author: nick
Date: Fri Apr  1 15:02:14 2011
New Revision: 1087734

URL: http://svn.apache.org/viewvc?rev=1087734&view=rev
Log:
Update OutlookTextExtractor to request 7 bit encoding guessing

Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Apr  1 15:02:14 2011
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.8-beta3" date="2011-??-??">
+           <action dev="poi-developers" type="fix">OutlookTextExtractor now requests 7 bit encoding guessing</action>
            <action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
            <action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
         </release>

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java Fri Apr  1 15:02:14 2011
@@ -32,9 +32,8 @@ import org.apache.poi.util.StringUtil;
 public class StringChunk extends Chunk {
    private static final String DEFAULT_ENCODING = "CP1252"; 
    private String encoding7Bit = DEFAULT_ENCODING;
-   private String value;
-   /** Only kept around for 7 bit strings */
    private byte[] rawValue;
+   private String value;
 
 	/**
 	 * Creates a String Chunk.
@@ -72,23 +71,22 @@ public class StringChunk extends Chunk {
 
 	   // Re-read the String if we're a 7 bit one
 	   if(type == Types.ASCII_STRING) {
-	      parseString(rawValue);
+	      parseString();
 	   }
 	}
 
 	public void readValue(InputStream value) throws IOException {
-	   byte[] data = IOUtils.toByteArray(value);
-	   parseString(data);
+	   rawValue = IOUtils.toByteArray(value);
+	   parseString();
 	}
-	private void parseString(byte[] data) {
+	private void parseString() {
 	   String tmpValue;
 	   switch(type) {
 	   case Types.ASCII_STRING:
-	      tmpValue = parseAs7BitData(data, encoding7Bit);
-	      this.rawValue = data;
+	      tmpValue = parseAs7BitData(rawValue, encoding7Bit);
 	      break;
 	   case Types.UNICODE_STRING:
-	      tmpValue = StringUtil.getFromUnicodeLE(data);
+	      tmpValue = StringUtil.getFromUnicodeLE(rawValue);
 	      break;
 	   default:
 	      throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
@@ -99,34 +97,46 @@ public class StringChunk extends Chunk {
 	}
 	
 	public void writeValue(OutputStream out) throws IOException {
-	   byte[] data;
-	   
+	   out.write(rawValue);
+	}
+	private void storeString() {
       switch(type) {
       case Types.ASCII_STRING:
          try {
-            data = value.getBytes(encoding7Bit);
+            rawValue = value.getBytes(encoding7Bit);
          } catch (UnsupportedEncodingException e) {
             throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
          }
          break;
       case Types.UNICODE_STRING:
-         data = new byte[value.length()*2];
-         StringUtil.putUnicodeLE(value, data, 0);
+         rawValue = new byte[value.length()*2];
+         StringUtil.putUnicodeLE(value, rawValue, 0);
          break;
       default:
          throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
       }
-      
-      out.write(data);
 	}
 	
+	/**
+	 * Returns the Text value of the chunk
+	 */
    public String getValue() {
       return this.value;
    }
-	public String toString() {
-		return this.value;
-	}
+   
+   public byte[] getRawValue() {
+      return this.rawValue;
+   }
 
+   public void setValue(String str) {
+      this.value = str;
+      storeString();
+   }
+   
+   public String toString() {
+      return this.value;
+   }
+   
    /**
     * Parses as non-unicode, supposedly 7 bit CP1252 data
     *  and returns the string that that yields.

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java Fri Apr  1 15:02:14 2011
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.hsmf.extractor;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.SimpleDateFormat;
@@ -56,6 +57,15 @@ public class OutlookTextExtactor extends
    public OutlookTextExtactor(InputStream inp) throws IOException {
       this(new MAPIMessage(inp));
    }
+   
+   public static void main(String[] args) throws Exception {
+      for(String filename : args) {
+         OutlookTextExtactor extractor = new OutlookTextExtactor(
+               new NPOIFSFileSystem(new File(filename))
+         );
+         System.out.println( extractor.getText() );
+      }
+   }
 
    /**
     * Returns the underlying MAPI message
@@ -71,6 +81,11 @@ public class OutlookTextExtactor extends
       MAPIMessage msg = (MAPIMessage)document;
       StringBuffer s = new StringBuffer();
       
+      // See if we can get a suitable encoding for any
+      //  non unicode text in the file
+      msg.guess7BitEncoding();
+      
+      // Off we go
       StringsIterator emails;
       try {
          emails = new StringsIterator(

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java?rev=1087734&r1=1087733&r2=1087734&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java Fri Apr  1 15:02:14 2011
@@ -199,4 +199,21 @@ public final class TestOutlookTextExtrac
       // Embeded bits are checked in
       //  TestExtractorFactory
    }
+   
+   public void testEncodings() throws Exception {
+      POIFSFileSystem simple = new POIFSFileSystem(
+            new FileInputStream(samples.getFile("chinese-traditional.msg"))
+      );
+      MAPIMessage msg = new MAPIMessage(simple);
+      OutlookTextExtactor ext = new OutlookTextExtactor(msg);
+      String text = ext.getText();
+      
+      // Check the english bits
+      assertContains(text, "From: Tests Chang@FT");
+      assertContains(text, "tests.chang@fengttt.com");
+      
+      // And check some chinese bits
+      assertContains(text, "(\u5f35\u6bd3\u502b)");
+      assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
+   }
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message