manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kam...@apache.org
Subject svn commit: r1791374 - in /manifoldcf/trunk: ./ connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/ connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/ connectors/email/c...
Date Fri, 14 Apr 2017 14:50:38 GMT
Author: kamaci
Date: Fri Apr 14 14:50:38 2017
New Revision: 1791374

URL: http://svn.apache.org/viewvc?rev=1791374&view=rev
Log:
Fix for CONNECTORS-1407.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
    manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
    manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties
    manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties
    manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties
    manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties
    manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html
    manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Apr 14 14:50:38 2017
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.7-dev =====================
 
+CONNECTORS-1407: Extract email addresses from email metadata fields.
+(Furkan KAMACI)
+
 CONNECTORS-1406: Fix multiple To and From field bug at e-mail.
 (Furkan KAMACI)
 

Modified: manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java Fri Apr 14 14:50:38 2017
@@ -78,12 +78,14 @@ public class EmailConfig {
   public static final String PORT_DEFAULT_VALUE = "";
  public static final String[] BASIC_METADATA = {"To","From","Subject","Body","Date","Encoding of Attachment",
       "MIME Type of attachment", "File Name of Attachment"};
+  public static final String BASIC_EXTRACT_EMAIL = "Use E-Mail Extractor";
   public static final String[] BASIC_SEARCHABLE_ATTRIBUTES = {"To","From","Subject","Body","Start Date", "End Date"};
 
   // Specification nodes
   
   public static final String NODE_PROPERTIES = "properties";
   public static final String NODE_METADATA = "metadata";
+  public static final String NODE_EXTRACT_EMAIL = "extractemail";
   public static final String NODE_FILTER = "filter";
   public static final String NODE_FOLDER = "folder";
   

Modified: manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java Fri Apr 14 14:50:38 2017
@@ -31,6 +31,8 @@ import java.io.*;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import javax.mail.*;
 import javax.mail.internet.MimeBodyPart;
 import javax.mail.internet.MimeMessage;
@@ -482,12 +484,16 @@ public class EmailConnector extends org.
     throws ManifoldCFException, ServiceInterruption {
 
     List<String> requiredMetadata = new ArrayList<String>();
+    boolean useEmailExtractor = false;
     for (int i = 0; i < spec.getChildCount(); i++) {
       SpecificationNode sn = spec.getChild(i);
       if (sn.getType().equals(EmailConfig.NODE_METADATA)) {
         String metadataAttribute = sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
         requiredMetadata.add(metadataAttribute);
       }
+      if (sn.getType().equals(EmailConfig.NODE_EXTRACT_EMAIL)) {
+        useEmailExtractor = true;
+      }
     }
     
     // Keep a cached set of open folders
@@ -590,7 +596,7 @@ public class EmailConnector extends org.
                   String[] toStr = new String[to.length];
                   int j = 0;
                   for (Address address : to) {
-                    toStr[j] = address.toString();
+                    toStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                     j++;
                   }
                   rd.addField(EmailConfig.EMAIL_TO, toStr);
@@ -599,11 +605,10 @@ public class EmailConnector extends org.
                   String[] fromStr = new String[from.length];
                   int j = 0;
                   for (Address address : from) {
-                    fromStr[j] = address.toString();
+                    fromStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                     j++;
                   }
                   rd.addField(EmailConfig.EMAIL_FROM, fromStr);
-
                 } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_SUBJECT)) {
                   String subject = msg.getSubject();
                   rd.addField(EmailConfig.EMAIL_SUBJECT, subject);
@@ -850,7 +855,7 @@ public class EmailConnector extends org.
                   String[] toStr = new String[to.length];
                   int j = 0;
                   for (Address address : to) {
-                    toStr[j] = address.toString();
+                    toStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                     j++;
                   }
                   rd.addField(EmailConfig.EMAIL_TO, toStr);
@@ -859,7 +864,7 @@ public class EmailConnector extends org.
                   String[] fromStr = new String[from.length];
                   int j = 0;
                   for (Address address : from) {
-                    fromStr[j] = address.toString();
+                    fromStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                     j++;
                   }
                   rd.addField(EmailConfig.EMAIL_FROM, fromStr);
@@ -930,6 +935,20 @@ public class EmailConnector extends org.
 
   }
 
+  /**
+   * Extracts e-mail address within < and > characters if any.
+   * If not, returns passed raw mail address.
+   *
+   * @param rawEmailAddress e-mail address to be extracted
+   * @return Extracted e-mail address
+   */
+  private String extractEmailAddress(String rawEmailAddress) {
+    Pattern pattern = Pattern.compile("<(.+?@.+?)>");
+    Matcher matcher = pattern.matcher(rawEmailAddress);
+
+    return matcher.find() ? matcher.group(1) : rawEmailAddress;
+  }
+
   //////////////////////////////End of Repository Connector Methods///////////////////////////////////
 
 
@@ -1215,15 +1234,19 @@ public class EmailConnector extends org.
   protected static void fillInMetadataTab(Map<String, Object> paramMap,
     Specification ds) {
     Set<String> metadataSelections = new HashSet<String>();
+    String extractEmailSelection = null;
     int i = 0;
     while (i < ds.getChildCount()) {
       SpecificationNode sn = ds.getChild(i++);
       if (sn.getType().equals(EmailConfig.NODE_METADATA)) {
         String metadataName = sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
         metadataSelections.add(metadataName);
+      } else if (sn.getType().equals(EmailConfig.NODE_EXTRACT_EMAIL)) {
+        extractEmailSelection = sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
       }
     }
     paramMap.put("METADATASELECTIONS", metadataSelections);
+    paramMap.put("EXTRACTEMAILSELECTION", extractEmailSelection);
   }
 
   /**
@@ -1232,6 +1255,9 @@ public class EmailConnector extends org.
   protected void fillInMetadataAttributes(Map<String, Object> paramMap) {
     String[] matchNames = EmailConfig.BASIC_METADATA;
     paramMap.put("METADATAATTRIBUTES", matchNames);
+
+    String extractEmailAttribute = EmailConfig.BASIC_EXTRACT_EMAIL;
+    paramMap.put("EXTRACTEMAILATTRIBUTE", extractEmailAttribute);
   }
 
   protected void outputFilterTab(IHTTPOutput out, Locale locale,
@@ -1364,6 +1390,18 @@ public class EmailConnector extends org.
 
 
   protected String processMetadataTab(IPostParameters variableContext, Specification ds,
+                                   int connectionSequenceNumber)
+          throws ManifoldCFException {
+    String result = processMetadataAttributes(variableContext, ds, connectionSequenceNumber);
+    if (result != null)
+      return result;
+
+    result = processExtractEmail(variableContext, ds, connectionSequenceNumber);
+    return result;
+
+  }
+
+  protected String processMetadataAttributes(IPostParameters variableContext, Specification ds,
     int connectionSequenceNumber)
     throws ManifoldCFException {
       
@@ -1385,6 +1423,30 @@ public class EmailConnector extends org.
 
     return null;
   }
+
+  protected String processExtractEmail(IPostParameters variableContext, Specification ds,
+    int connectionSequenceNumber)
+    throws ManifoldCFException {
+
+    String seqPrefix = "s"+connectionSequenceNumber+"_";
+
+    // Remove old included extract email nodes
+    removeNodes(ds, EmailConfig.NODE_EXTRACT_EMAIL);
+
+    // Get the posted extract email value
+    String extractEmail = variableContext.getParameter(seqPrefix + "extractemail");
+    if (extractEmail == null) {
+      return null;
+    }
+
+    // Gather the extract email parameter to be the last one
+    SpecificationNode sn = new SpecificationNode(EmailConfig.NODE_EXTRACT_EMAIL);
+    sn.setAttribute(EmailConfig.ATTRIBUTE_NAME, extractEmail);
+    // Add the new extract email parameter
+    ds.addChild(ds.getChildCount(), sn);
+
+    return null;
+  }
 
   /** View specification.
   * This method is called in the body section of a job's view page.  Its purpose is to present the document

Modified: manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=Metadata nam
 EmailConnector.NoMetadataSpecified=No metadata specified
 EmailConnector.SelectMetadataName=--Select metadata name --
 EmailConnector.IncludedMetadataColon=Included metadata:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
 EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no attachments desired):
 
 

Modified: manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=nombre de me
 EmailConnector.NoMetadataSpecified=Sin metadatos especificada
 EmailConnector.SelectMetadataName=--Seleccione el nombre de metadatos --
 EmailConnector.IncludedMetadataColon=metadatos Incluido:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
 EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no attachments desired):
 
 

Modified: manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=Metadata nam
 EmailConnector.NoMetadataSpecified=No metadata specified
 EmailConnector.SelectMetadataName=--Select metadata name --
 EmailConnector.IncludedMetadataColon=Included metadata:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
 EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no attachments desired):
 
 

Modified: manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=元数
 EmailConnector.NoMetadataSpecified=元数据未指定
 EmailConnector.SelectMetadataName=-- 选择元数据名 --
 EmailConnector.IncludedMetadataColon=被包含的元数据:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
 EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no attachments desired):
 
 

Modified: manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html Fri Apr 14 14:50:38 2017
@@ -70,4 +70,13 @@ limitations under the License.
 
     </td>
   </tr>
+
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('EmailConnector.ExtractEmailColon'))</nobr></td>
+    <td class="value">
+        #if($EXTRACTEMAILSELECTION)
+      <nobr>$Encoder.bodyEscape($EXTRACTEMAILSELECTION)</nobr>
+        #end
+    </td>
+  </tr>
 </table>

Modified: manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html (original)
+++ manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html Fri Apr 14 14:50:38 2017
@@ -31,7 +31,18 @@ limitations under the License.
       <nobr>$Encoder.bodyEscape($metadataattribute)</nobr><br/>
   #end
     </td>
-
+  </tr>
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('EmailConnector.ExtractEmailColon'))</nobr></td>
+    <td class="value">
+    #if($EXTRACTEMAILSELECTION)
+      <input type="checkbox" name="s${SeqNum}_extractemail" value="$Encoder.attributeEscape($EXTRACTEMAILATTRIBUTE)" checked="true"/>
+    #else
+      <input type="checkbox" name="s${SeqNum}_extractemail" value="$Encoder.attributeEscape($EXTRACTEMAILATTRIBUTE)"/>
+    #end
+      <nobr>$Encoder.bodyEscape($EXTRACTEMAILATTRIBUTE)</nobr><br/>
+    </td>
   </tr>
 </table>
 
@@ -40,5 +51,7 @@ limitations under the License.
   #foreach($metadataselection in $METADATASELECTIONS)
 <input type="hidden" name="s${SeqNum}_metadata" value="$Encoder.attributeEscape($metadataselection)"/>
   #end
-  
+  #if($EXTRACTEMAILSELECTION)
+<input type="hidden" name="s${SeqNum}_extractemail" value="$Encoder.attributeEscape($EXTRACTEMAILSELECTION)"/>
+  #end
 #end



Mime
View raw message