manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1435014 - in /manifoldcf/trunk: ./ connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/ connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/ connectors/j...
Date Fri, 18 Jan 2013 03:50:22 GMT
Author: kwright
Date: Fri Jan 18 03:50:21 2013
New Revision: 1435014

URL: http://svn.apache.org/viewvc?rev=1435014&view=rev
Log:
Finish CONNECTORS-613, by finding content type in all connectors where that is possible.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java
    manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
    manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
    manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
    manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java
    manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
    manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Jan 18 03:50:21 2013
@@ -5,6 +5,11 @@ $Id$
 
 ======================= Release 1.1 =====================
 
+CONNECTORS-613: Add a way of getting a document's mime type
+to Solr, since Tika needs mime type in order to extract content
+since Solr 4.0.0.
+(Shinichiro Abe, Karl Wright)
+
 CONNECTORS-614: Solr connection release not working right.
 (Karl Wright)
 

Modified: manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java (original)
+++ manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java Fri Jan 18 03:50:21 2013
@@ -1573,6 +1573,8 @@ public class DCTM extends org.apache.man
             
             String objName = object.getObjectName();
 
+            String contentType = object.getContentType();
+            
             // This particular way of getting content failed, because DFC loaded the
             // whole object into memory (very very bad DFC!)
             // InputStream is = objIDfSysObject.getContent();
@@ -1609,6 +1611,9 @@ public class DCTM extends org.apache.man
 
             rval = new RepositoryDocument();
 
+            if (contentType != null)
+              rval.setMimeType(contentType);
+            
             // Handle the metadata.
             // The start of the version string contains the names of the metadata.  We parse it out of the
             // version string, because we don't want the chance of somebody changing something after we got

Modified: manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java (original)
+++ manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java Fri Jan 18 03:50:21 2013
@@ -313,13 +313,13 @@ public class FileConnector extends org.a
   static {
     mimeMap = new HashMap<String,String>();
     mimeMap.put("txt","text/plain");
-    mimeMap.put(".pdf","application/pdf");
-    mimeMap.put(".doc","application/msword");
-    mimeMap.put(".docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
-    mimeMap.put(".ppt","application/vnd.ms-powerpoint");
-    mimeMap.put(".pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
-    mimeMap.put(".xls","application/vnd.ms-excel");
-    mimeMap.put(".xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    mimeMap.put("pdf","application/pdf");
+    mimeMap.put("doc","application/msword");
+    mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+    mimeMap.put("ppt","application/vnd.ms-powerpoint");
+    mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
+    mimeMap.put("xls","application/vnd.ms-excel");
+    mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
   }
   
   /** Map an extension to a mime type */

Modified: manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Fri Jan 18 03:50:21 2013
@@ -986,13 +986,13 @@ public class SharedDriveConnector extend
   static {
     mimeMap = new HashMap<String,String>();
     mimeMap.put("txt","text/plain");
-    mimeMap.put(".pdf","application/pdf");
-    mimeMap.put(".doc","application/msword");
-    mimeMap.put(".docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
-    mimeMap.put(".ppt","application/vnd.ms-powerpoint");
-    mimeMap.put(".pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
-    mimeMap.put(".xls","application/vnd.ms-excel");
-    mimeMap.put(".xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    mimeMap.put("pdf","application/pdf");
+    mimeMap.put("doc","application/msword");
+    mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+    mimeMap.put("ppt","application/vnd.ms-powerpoint");
+    mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
+    mimeMap.put("xls","application/vnd.ms-excel");
+    mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
   }
   
   /** Map an extension to a mime type */

Modified: manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Fri Jan 18 03:50:21 2013
@@ -442,6 +442,7 @@ public class JDBCConnector extends org.a
     addConstant(vm,JDBCConstants.idReturnVariable,JDBCConstants.idReturnColumnName);
     addConstant(vm,JDBCConstants.urlReturnVariable,JDBCConstants.urlReturnColumnName);
     addConstant(vm,JDBCConstants.dataReturnVariable,JDBCConstants.dataReturnColumnName);
+    addConstant(vm,JDBCConstants.contentTypeReturnVariable,JDBCConstants.contentTypeReturnColumnName);
     if (!addIDList(vm,JDBCConstants.idListVariable,documentIdentifiers,scanOnly))
       return;
 
@@ -529,11 +530,24 @@ public class JDBCConnector extends org.a
                 // We will ingest something, so remove this id from the map in order that we know what we still
                 // need to delete when all done.
                 map.remove(id);
+                String contentType;
+                o = row.getValue(JDBCConstants.contentTypeReturnColumnName);
+                if (o != null)
+                  contentType = readAsString(o);
+                else
+                  contentType = null;
+                
                 if (contents instanceof BinaryInput)
                 {
                   // An ingestion will take place for this document.
                   RepositoryDocument rd = new RepositoryDocument();
 
+                  // Default content type is application/octet-stream for binary data
+                  if (contentType == null)
+                    rd.setMimeType("application/octet-stream");
+                  else
+                    rd.setMimeType(contentType);
+                  
                   applyAccessTokens(rd,version,spec);
                   applyMetadata(rd,row);
 
@@ -578,6 +592,12 @@ public class JDBCConnector extends org.a
                     byte[] bytes = value.getBytes("utf-8");
                     RepositoryDocument rd = new RepositoryDocument();
 
+                    // Default content type is text/plain for character data
+                    if (contentType == null)
+                      rd.setMimeType("text/plain");
+                    else
+                      rd.setMimeType(contentType);
+                    
                     applyAccessTokens(rd,version,spec);
                     applyMetadata(rd,row);
 
@@ -1382,6 +1402,7 @@ public class JDBCConnector extends org.a
     documentKnownColumns.put(JDBCConstants.idReturnColumnName,"");
     documentKnownColumns.put(JDBCConstants.urlReturnColumnName,"");
     documentKnownColumns.put(JDBCConstants.dataReturnColumnName,"");
+    documentKnownColumns.put(JDBCConstants.contentTypeReturnColumnName,"");
   }
   
   /** Apply metadata to a repository document.

Modified: manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java (original)
+++ manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConstants.java Fri Jan 18 03:50:21 2013
@@ -52,7 +52,9 @@ public class JDBCConstants
   public static String urlReturnColumnName = "lcf__url";
   /** The name of the data return column */
   public static String dataReturnColumnName = "lcf__data";
-
+  /** The name of the content type return column */
+  public static String contentTypeReturnColumnName = "lcf__contenttype";
+  
   /** The name of the id return variable */
   public static String idReturnVariable = "IDCOLUMN";
   /** The name of the version return variable */
@@ -61,6 +63,8 @@ public class JDBCConstants
   public static String urlReturnVariable = "URLCOLUMN";
   /** The name of the data return variable */
   public static String dataReturnVariable = "DATACOLUMN";
+  /** The name of the content type return variable */
+  public static String contentTypeReturnVariable = "CONTENTTYPE";
   /** The name of the start time variable */
   public static String startTimeVariable = "STARTTIME";
   /** The name of the end time variable */

Modified: manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java (original)
+++ manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java Fri Jan 18 03:50:21 2013
@@ -1522,6 +1522,8 @@ public class SharePointRepository extend
                       RepositoryDocument data = new RepositoryDocument();
                       data.setBinary( is, documentLength );
 
+		      data.setMimeType(mapExtensionToMimeType(documentIdentifier));
+		      
                       setDataACLs(data,acls,denyAcl);
 
                       setPathAttribute(data,sDesc,documentIdentifier);
@@ -1708,6 +1710,31 @@ public class SharePointRepository extend
     }
   }
 
+  protected final static Map<String,String> mimeMap;
+  static {
+    mimeMap = new HashMap<String,String>();
+    mimeMap.put("txt","text/plain");
+    mimeMap.put("pdf","application/pdf");
+    mimeMap.put("doc","application/msword");
+    mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+    mimeMap.put("ppt","application/vnd.ms-powerpoint");
+    mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
+    mimeMap.put("xls","application/vnd.ms-excel");
+    mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+  }
+  
+  /** Map an extension to a mime type */
+  protected static String mapExtensionToMimeType(String fileName)
+  {
+    int slashIndex = fileName.lastIndexOf("/");
+    if (slashIndex != -1)
+      fileName = fileName.substring(slashIndex+1);
+    int dotIndex = fileName.lastIndexOf(".");
+    if (dotIndex == -1)
+      return null;
+    return mimeMap.get(fileName.substring(dotIndex+1).toLowerCase(java.util.Locale.ROOT));
+  }
+
   protected static void setDataACLs(RepositoryDocument data, ArrayList acls, String denyAcl)
   {
     if (acls != null)

Modified: manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1435014&r1=1435013&r2=1435014&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Fri Jan 18 03:50:21 2013
@@ -3528,6 +3528,10 @@ public class WikiConnector extends org.a
               String lastModified = t.getLastModified();
               
               RepositoryDocument rd = new RepositoryDocument();
+              
+              // For wiki, type is always text/plain
+              rd.setMimeType("text/plain");
+              
               dataSize = contentFile.length();
               InputStream is = new FileInputStream(contentFile);
               try



Mime
View raw message