poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From n...@apache.org
Subject svn commit: r951034 - in /poi/trunk/src: documentation/content/xdocs/ scratchpad/src/org/apache/poi/hsmf/ scratchpad/src/org/apache/poi/hsmf/datatypes/ scratchpad/src/org/apache/poi/hsmf/extractor/ scratchpad/testcases/org/apache/poi/hsmf/ scratchpad/t...
Date Thu, 03 Jun 2010 15:33:54 GMT
Author: nick
Date: Thu Jun  3 15:33:54 2010
New Revision: 951034

URL: http://svn.apache.org/viewvc?rev=951034&view=rev
Log:
Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed

Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=951034&r1=951033&r2=951034&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Thu Jun  3 15:33:54 2010
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed</action>
            <action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
            <action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
            <action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java?rev=951034&r1=951033&r2=951034&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java Thu Jun  3 15:33:54 2010
@@ -282,6 +282,17 @@ public class MAPIMessage extends POIDocu
       return names;
    }
 
+   
+   /**
+    * 
+    */
+   public String[] getHeaders() throws ChunkNotFoundException {
+      String headers = getStringFromChunk(mainChunks.messageHeaders);
+      if(headers == null) {
+         return null;
+      }
+      return headers.split("\\r?\\n");
+   }
 
    /**
     * Gets the conversation topic of the parsed Outlook Message.

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java?rev=951034&r1=951033&r2=951034&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java Thu Jun  3 15:33:54 2010
@@ -35,6 +35,7 @@ public final class Chunks implements Chu
    // 0x0050 -> 0x006F seem to be routing info or similar
    public static final int CONVERSATION_TOPIC  = 0x0070;
    public static final int SENT_BY_SERVER_TYPE = 0x0075;
+   public static final int MESSAGE_HEADERS     = 0x007D;
    // RECEIVEDEMAIL = 76
    public static final int DISPLAY_TO          = 0x0E04;
    public static final int DISPLAY_FROM        = 0x0C1A;
@@ -66,6 +67,8 @@ public final class Chunks implements Chu
    public StringChunk conversationTopic;
    /** Type of server that the message originated from (SMTP, etc). */
    public StringChunk sentByServerType;
+   /** The email headers */
+   public StringChunk messageHeaders;
    /** TODO */
    public MessageSubmissionChunk submissionChunk; 
    /** TODO */
@@ -104,6 +107,9 @@ public final class Chunks implements Chu
       case SENT_BY_SERVER_TYPE:
          sentByServerType = (StringChunk)chunk;
          break;
+      case MESSAGE_HEADERS:
+         messageHeaders = (StringChunk)chunk;
+         break;
       case DISPLAY_TO:
          displayToChunk = (StringChunk)chunk;
          break;

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java?rev=951034&r1=951033&r2=951034&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java Thu Jun  3 15:33:54 2010
@@ -87,10 +87,30 @@ public class OutlookTextExtactor extends
          handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
       } catch(ChunkNotFoundException e) {}
       
+      // Date - try two ways to find it
       try {
+         // First try via the proper chunk
          SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
          s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
-      } catch(ChunkNotFoundException e) {}
+      } catch(ChunkNotFoundException e) {
+         try {
+            // Failing that try via the raw headers 
+            String[] headers = msg.getHeaders();
+            for(String header: headers) {
+               if(header.toLowerCase().startsWith("date:")) {
+                  s.append(
+                        "Date:" + 
+                        header.substring(header.indexOf(':')+1) +
+                        "\n"
+                  );
+                  break;
+               }
+            }
+         } catch(ChunkNotFoundException he) {
+            // We can't find the date, sorry...
+         }
+      }
+      
       try {
          s.append("Subject: " + msg.getSubject() + "\n");
       } catch(ChunkNotFoundException e) {}

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java?rev=951034&r1=951033&r2=951034&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java Thu Jun  3 15:33:54 2010
@@ -76,6 +76,35 @@ public final class TestBasics extends Te
 	}
 	
 	/**
+	 * Test message headers
+	 */
+	public void testHeaders() throws Exception {
+	   // Simple email first
+	   assertEquals(26, simple.getHeaders().length);
+	   assertTrue(simple.getHeaders()[0].startsWith("Return-path:"));
+      assertTrue(simple.getHeaders()[1].equals("Envelope-to: travis@overwrittenstack.com"));
+      assertTrue(simple.getHeaders()[25].startsWith("X-Antivirus-Scanner: Clean"));
+      
+      // Quick doesn't have them
+      try {
+         quick.getHeaders();
+         fail();
+      } catch(ChunkNotFoundException e) {}
+      
+      // Attachments doesn't have them
+      try {
+         attachments.getHeaders();
+         fail();
+      } catch(ChunkNotFoundException e) {}
+      
+      // Outlook30 has some
+      assertEquals(33, outlook30.getHeaders().length);
+      assertTrue(outlook30.getHeaders()[0].startsWith("Microsoft Mail Internet Headers"));
+      assertTrue(outlook30.getHeaders()[1].startsWith("x-mimeole:"));
+      assertTrue(outlook30.getHeaders()[32].startsWith("\t\"Williams")); // May need better parsing in future
+	}
+	
+	/**
 	 * Test attachments
 	 */
 	public void testAttachments() throws Exception {

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java?rev=951034&r1=951033&r2=951034&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java Thu Jun  3 15:33:54 2010
@@ -84,7 +84,7 @@ public final class TestOutlookTextExtrac
       assertEquals(-1, text.indexOf("CC:"));
       assertEquals(-1, text.indexOf("BCC:"));
       assertContains(text, "Subject: test message\n");
-      assertEquals(-1, text.indexOf("Date:"));
+      assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n");
       assertContains(text, "This is a test message.");
    }
 
@@ -171,7 +171,7 @@ public final class TestOutlookTextExtrac
                "nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
          assertEquals(-1, text.indexOf("BCC:"));
          assertContains(text, "Subject: This is a test message please ignore\n");
-         assertEquals(-1, text.indexOf("Date:"));
+         assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n");
          assertContains(text, "The quick brown fox jumps over the lazy dog");
       }
    }



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message