manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1490734 - in /manifoldcf/trunk: CHANGES.txt connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
Date Fri, 07 Jun 2013 17:11:50 GMT
Author: kwright
Date: Fri Jun  7 17:11:50 2013
New Revision: 1490734

URL: http://svn.apache.org/r1490734
Log:
Fix for CONNECTORS-708.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1490734&r1=1490733&r2=1490734&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Jun  7 17:11:50 2013
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 1.3-dev =====================
 
+CONNECTORS-708: Make JDBC connector check mime type for indexability,
+if it is present.
+(Richard Nichols, Karl Wright)
+
 CONNECTORS-709: Escape \r, \n, \f, and \b in ElasticSearch connector.
 (Richard Nichols, Karl Wright)
 

Modified: manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1490734&r1=1490733&r2=1490734&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Fri Jun  7 17:11:50 2013
@@ -543,90 +543,95 @@ public class JDBCConnector extends org.a
                 else
                   contentType = null;
                 
-                if (contents instanceof BinaryInput)
+                if (contentType == null || activities.checkMimeTypeIndexable(contentType))
                 {
-                  // An ingestion will take place for this document.
-                  RepositoryDocument rd = new RepositoryDocument();
-
-                  // Default content type is application/octet-stream for binary data
-                  if (contentType == null)
-                    rd.setMimeType("application/octet-stream");
-                  else
-                    rd.setMimeType(contentType);
-                  
-                  applyAccessTokens(rd,version,spec);
-                  applyMetadata(rd,row);
-
-                  BinaryInput bi = (BinaryInput)contents;
-                  try
-                  {
-                    // Read the stream
-                    InputStream is = bi.getStream();
-                    try
-                    {
-                      rd.setBinary(is,bi.getLength());
-                      activities.ingestDocument(id, version, url, rd);
-                    }
-                    finally
-                    {
-                      is.close();
-                    }
-                  }
-                  catch (java.net.SocketTimeoutException e)
-                  {
-                    throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
-                  }
-                  catch (InterruptedIOException e)
-                  {
-                    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-                  }
-                  catch (IOException e)
-                  {
-                    throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
-                  }
-                  finally
-                  {
-                    bi.discard();
-                  }
-                }
-                else
-                {
-                  // Turn it into a string, and then into a stream
-                  String value = contents.toString();
-                  try
+                  if (contents instanceof BinaryInput)
                   {
-                    byte[] bytes = value.getBytes("utf-8");
+                    // An ingestion will take place for this document.
                     RepositoryDocument rd = new RepositoryDocument();
 
-                    // Default content type is text/plain for character data
+                    // Default content type is application/octet-stream for binary data
                     if (contentType == null)
-                      rd.setMimeType("text/plain");
+                      rd.setMimeType("application/octet-stream");
                     else
                       rd.setMimeType(contentType);
                     
                     applyAccessTokens(rd,version,spec);
                     applyMetadata(rd,row);
 
-                    InputStream is = new ByteArrayInputStream(bytes);
+                    BinaryInput bi = (BinaryInput)contents;
                     try
                     {
-                      rd.setBinary(is,bytes.length);
-                      activities.ingestDocument(id, version, url, rd);
+                      // Read the stream
+                      InputStream is = bi.getStream();
+                      try
+                      {
+                        rd.setBinary(is,bi.getLength());
+                        activities.ingestDocument(id, version, url, rd);
+                      }
+                      finally
+                      {
+                        is.close();
+                      }
+                    }
+                    catch (java.net.SocketTimeoutException e)
+                    {
+                      throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+                    }
+                    catch (InterruptedIOException e)
+                    {
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
+                    {
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
                     }
                     finally
                     {
-                      is.close();
+                      bi.discard();
                     }
                   }
-                  catch (InterruptedIOException e)
-                  {
-                    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-                  }
-                  catch (IOException e)
+                  else
                   {
-                    throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                    // Turn it into a string, and then into a stream
+                    String value = contents.toString();
+                    try
+                    {
+                      byte[] bytes = value.getBytes("utf-8");
+                      RepositoryDocument rd = new RepositoryDocument();
+
+                      // Default content type is text/plain for character data
+                      if (contentType == null)
+                        rd.setMimeType("text/plain");
+                      else
+                        rd.setMimeType(contentType);
+                      
+                      applyAccessTokens(rd,version,spec);
+                      applyMetadata(rd,row);
+
+                      InputStream is = new ByteArrayInputStream(bytes);
+                      try
+                      {
+                        rd.setBinary(is,bytes.length);
+                        activities.ingestDocument(id, version, url, rd);
+                      }
+                      finally
+                      {
+                        is.close();
+                      }
+                    }
+                    catch (InterruptedIOException e)
+                    {
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
+                    {
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                    }
                   }
                 }
+                else
+                  Logging.connectors.warn("JDBC: Document '"+id+"' excluded because of mime type - skipping");
               }
               else
                 Logging.connectors.warn("JDBC: Document '"+id+"' seems to have null data - skipping");



Mime
View raw message