incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1139390 - in /incubator/lcf/trunk/connectors: rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/ webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Date Fri, 24 Jun 2011 17:53:23 GMT
Author: kwright
Date: Fri Jun 24 17:53:22 2011
New Revision: 1139390

URL: http://svn.apache.org/viewvc?rev=1139390&view=rev
Log:
Hook up length and URL exclusions for web and RSS connectors.  Part of CONNECTORS-214.

Modified:
    incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/DataCache.java
    incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DataCache.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/DataCache.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/DataCache.java?rev=1139390&r1=1139389&r2=1139390&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/DataCache.java
(original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/DataCache.java
Fri Jun 24 17:53:22 2011
@@ -35,7 +35,7 @@ public class DataCache
   public static final String _rcsid = "@(#)$Id: DataCache.java 988245 2010-08-23 18:39:35Z kwright $";
 
   // Hashmap containing the cache
-  protected HashMap cacheData = new HashMap();
+  protected Map<String,DocumentData> cacheData = new HashMap<String,DocumentData>();
 
   /** Constructor.
   */
@@ -46,10 +46,11 @@ public class DataCache
 
   /** Add binary data entry into the cache.  Does NOT close the input stream when done!
   *@param documentIdentifier is the document identifier (url).
+  *@param contentType is the content type for the data.
   *@param dataStream is the data stream.
   *@return the checksum value.
   */
-  public long addData(IVersionActivity activities, String documentIdentifier, InputStream dataStream)
+  public long addData(IVersionActivity activities, String documentIdentifier, String contentType, InputStream dataStream)
     throws ManifoldCFException, ServiceInterruption
   {
     // Create a temporary file; that's what we will cache
@@ -126,7 +127,7 @@ public class DataCache
         synchronized(this)
         {
           deleteData(documentIdentifier);
-          cacheData.put(documentIdentifier,tempFile);
+          cacheData.put(documentIdentifier,new DocumentData(tempFile,contentType));
           return checkSum;
         }
 
@@ -172,10 +173,10 @@ public class DataCache
   public synchronized long getDataLength(String documentIdentifier)
     throws ManifoldCFException
   {
-    File f = (File)cacheData.get(documentIdentifier);
+    DocumentData f = cacheData.get(documentIdentifier);
     if (f == null)
       return 0L;
-    return f.length();
+    return f.getData().length();
   }
 
   /** Fetch binary data entry from the cache.
@@ -185,12 +186,12 @@ public class DataCache
   public synchronized InputStream getData(String documentIdentifier)
     throws ManifoldCFException
   {
-    File f = (File)cacheData.get(documentIdentifier);
+    DocumentData f = cacheData.get(documentIdentifier);
     if (f == null)
       return null;
     try
     {
-      return new FileInputStream(f);
+      return new FileInputStream(f.getData());
     }
     catch (IOException e)
     {
@@ -198,17 +199,62 @@ public class DataCache
     }
   }
 
+  /** Get the content type.
+  *@param documentIdentifier is the document identifier.
+  *@return the content type, or null if there is none.
+  */
+  public synchronized String getContentType(String documentIdentifier)
+  {
+    DocumentData dd = cacheData.get(documentIdentifier);
+    if (dd == null)
+      return null;
+    return dd.getContentType();
+  }
+
   /** Delete specified item of data.
   *@param documentIdentifier is the document identifier (url).
   */
   public synchronized void deleteData(String documentIdentifier)
   {
-    File f = (File)cacheData.get(documentIdentifier);
+    DocumentData f = cacheData.get(documentIdentifier);
     cacheData.remove(documentIdentifier);
     if (f != null)
     {
-      ManifoldCF.deleteFile(f);
+      ManifoldCF.deleteFile(f.getData());
     }
   }
 
+  // Protected classes
+
+  /** This class represents everything we need to know about a document that's getting passed from the
+  * getDocumentVersions() phase to the processDocuments() phase.
+  */
+  protected static class DocumentData
+  {
+    /** The cache file for the data */
+    protected File data;
+    /** The content-type header value */
+    protected String contentType;
+
+    /** Constructor. */
+    public DocumentData(File data, String contentType)
+    {
+      this.data = data;
+      this.contentType = contentType;
+    }
+
+    /** Get the data */
+    public File getData()
+    {
+      return data;
+    }
+
+    /** Get the contentType */
+    public String getContentType()
+    {
+      return contentType;
+    }
+
+  }
+
 }

Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1139390&r1=1139389&r2=1139390&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
(original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Fri Jun 24 17:53:22 2011
@@ -837,7 +837,7 @@ public class RSSConnector extends org.ap
               try
               {
                 StringBuilder sb = new StringBuilder();
-                long checkSum = cache.addData(activities,urlValue,is);
+                long checkSum = cache.addData(activities,urlValue,"text/html",is);
                 // Grab what we need from the passed-down data for the document.  These will all become part
                 // of the version string.
                 String[] pubDates = activities.retrieveParentData(urlValue,"pubdate");
@@ -1064,7 +1064,7 @@ public class RSSConnector extends org.ap
                       InputStream is = connection.getResponseBodyStream();
                       try
                       {
-                        long checkSum = cache.addData(activities,urlValue,is);
+                        long checkSum = cache.addData(activities,urlValue,contentType,is);
                         StringBuilder sb = new StringBuilder();
                         if (ingestURL != null)
                         {
@@ -1310,169 +1310,181 @@ public class RSSConnector extends org.ap
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("RSS: Interpreting '"+urlValue+"' as a document");
 
-        // Treat it as an ingestable document.
-        // Version *should* start with a "+".
-        ArrayList acls = new ArrayList();
-        StringBuilder denyAclBuffer = new StringBuilder();
-        int startPos = unpackList(acls,version,1,'+');
-        if (startPos < version.length() && version.charAt(startPos++) == '+')
-        {
-          startPos = unpack(denyAclBuffer,version,startPos,'+');
-        }
-        ArrayList metadata = new ArrayList();
-        startPos = unpackList(metadata,version,startPos,'+');
-        StringBuilder ingestUrlBuffer = new StringBuilder();
-        startPos = unpack(ingestUrlBuffer,version,startPos,'+');
-        String ingestURL = ingestUrlBuffer.toString();
-        ArrayList pubDates = new ArrayList();
-        startPos = unpackList(pubDates,version,startPos,'+');
-        ArrayList titles = new ArrayList();
-        startPos = unpackList(titles,version,startPos,'+');
-        ArrayList sources = new ArrayList();
-        startPos = unpackList(sources,version,startPos,'+');
-        ArrayList categories = new ArrayList();
-        startPos = unpackList(categories,version,startPos,'+');
-
-        if (ingestURL.length() > 0)
-        {
-          long dataSize = cache.getDataLength(urlValue);
-          RepositoryDocument rd = new RepositoryDocument();
-
-          // Turn into acls and add into description
-          String[] aclArray = new String[acls.size()];
-          int j = 0;
-          while (j < aclArray.length)
-          {
-            aclArray[j] = (String)acls.get(j);
-            j++;
-          }
-          rd.setACL(aclArray);
-
-          // Deny acl too
-          if (denyAclBuffer.length() > 0)
-          {
-            String[] denyAclArray = new String[]{denyAclBuffer.toString()};
-            rd.setDenyACL(denyAclArray);
-          }
-
-          // Grab metadata
-          HashMap metaHash = new HashMap();
-          int k = 0;
-          while (k < metadata.size())
-          {
-            String metadataItem = (String)metadata.get(k++);
-            unpackFixedList(fixedList,metadataItem,0,'=');
-            HashMap hashValue = (HashMap)metaHash.get(fixedList[0]);
-            if (hashValue == null)
-            {
-              hashValue = new HashMap();
-              metaHash.put(fixedList[0],hashValue);
-            }
-            hashValue.put(fixedList[1],fixedList[1]);
-          }
-          Iterator metaIter = metaHash.keySet().iterator();
-          while (metaIter.hasNext())
-          {
-            String key = (String)metaIter.next();
-            HashMap metaList = (HashMap)metaHash.get(key);
-            String[] values = new String[metaList.size()];
-            Iterator iter = metaList.keySet().iterator();
-            k = 0;
-            while (iter.hasNext())
+        if (isDataIngestable(activities,urlValue))
+        {
+          // Treat it as an ingestable document.
+          // Version *should* start with a "+".
+          ArrayList acls = new ArrayList();
+          StringBuilder denyAclBuffer = new StringBuilder();
+          int startPos = unpackList(acls,version,1,'+');
+          if (startPos < version.length() && version.charAt(startPos++) == '+')
+          {
+            startPos = unpack(denyAclBuffer,version,startPos,'+');
+          }
+          ArrayList metadata = new ArrayList();
+          startPos = unpackList(metadata,version,startPos,'+');
+          StringBuilder ingestUrlBuffer = new StringBuilder();
+          startPos = unpack(ingestUrlBuffer,version,startPos,'+');
+          String ingestURL = ingestUrlBuffer.toString();
+          ArrayList pubDates = new ArrayList();
+          startPos = unpackList(pubDates,version,startPos,'+');
+          ArrayList titles = new ArrayList();
+          startPos = unpackList(titles,version,startPos,'+');
+          ArrayList sources = new ArrayList();
+          startPos = unpackList(sources,version,startPos,'+');
+          ArrayList categories = new ArrayList();
+          startPos = unpackList(categories,version,startPos,'+');
+
+          if (ingestURL.length() > 0)
+          {
+            long dataSize = cache.getDataLength(urlValue);
+            RepositoryDocument rd = new RepositoryDocument();
+
+            // Turn into acls and add into description
+            String[] aclArray = new String[acls.size()];
+            int j = 0;
+            while (j < aclArray.length)
+            {
+              aclArray[j] = (String)acls.get(j);
+              j++;
+            }
+            rd.setACL(aclArray);
+
+            // Deny acl too
+            if (denyAclBuffer.length() > 0)
+            {
+              String[] denyAclArray = new String[]{denyAclBuffer.toString()};
+              rd.setDenyACL(denyAclArray);
+            }
+
+            // Grab metadata
+            HashMap metaHash = new HashMap();
+            int k = 0;
+            while (k < metadata.size())
+            {
+              String metadataItem = (String)metadata.get(k++);
+              unpackFixedList(fixedList,metadataItem,0,'=');
+              HashMap hashValue = (HashMap)metaHash.get(fixedList[0]);
+              if (hashValue == null)
+              {
+                hashValue = new HashMap();
+                metaHash.put(fixedList[0],hashValue);
+              }
+              hashValue.put(fixedList[1],fixedList[1]);
+            }
+            Iterator metaIter = metaHash.keySet().iterator();
+            while (metaIter.hasNext())
             {
-              values[k] = (String)iter.next();
-              k++;
+              String key = (String)metaIter.next();
+              HashMap metaList = (HashMap)metaHash.get(key);
+              String[] values = new String[metaList.size()];
+              Iterator iter = metaList.keySet().iterator();
+              k = 0;
+              while (iter.hasNext())
+              {
+                values[k] = (String)iter.next();
+                k++;
+              }
+              rd.addField(key,values);
             }
-            rd.addField(key,values);
-          }
-
-          // Loop through the titles to add those to the metadata
-          String[] titleValues = new String[titles.size()];
-          k = 0;
-          while (k < titleValues.length)
-          {
-            titleValues[k] = (String)titles.get(k);
-            k++;
-          }
-          if (k > 0)
-            rd.addField("title",titleValues);
-
-          // Loop through the sources to add those to the metadata
-          String[] sourceValues = new String[sources.size()];
-          k = 0;
-          while (k < sourceValues.length)
-          {
-            sourceValues[k] = (String)sources.get(k);
-            k++;
-          }
-          if (k > 0)
-            rd.addField("source",sourceValues);
 
-          // Add the categories now
-          String[] categoryValues = new String[categories.size()];
-          k = 0;
-          while (k < categoryValues.length)
-          {
-            categoryValues[k] = (String)categories.get(k);
-            k++;
-          }
-          if (k > 0)
-            rd.addField("category",categoryValues);
-
-          // The pubdates are a ms since epoch value; we want the minimum one for the origination time.
-          Long minimumOrigTime = null;
-          String[] pubDateValues = new String[pubDates.size()];
-          k = 0;
-          while (k < pubDates.size())
-          {
-            String pubDate = (String)pubDates.get(k);
-            pubDateValues[k++] = pubDate;
-            try
+            // Loop through the titles to add those to the metadata
+            String[] titleValues = new String[titles.size()];
+            k = 0;
+            while (k < titleValues.length)
             {
-              Long pubDateLong = new Long(pubDate);
-              if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue())
-                minimumOrigTime = pubDateLong;
+              titleValues[k] = (String)titles.get(k);
+              k++;
             }
-            catch (NumberFormatException e)
+            if (k > 0)
+              rd.addField("title",titleValues);
+
+            // Loop through the sources to add those to the metadata
+            String[] sourceValues = new String[sources.size()];
+            k = 0;
+            while (k < sourceValues.length)
             {
-              // Do nothing; the version string seems to not mean anything
+              sourceValues[k] = (String)sources.get(k);
+              k++;
             }
-          }
-          if (k > 0)
-            rd.addField("pubdate",pubDateValues);
-
-          if (minimumOrigTime != null)
-            activities.setDocumentOriginationTime(urlValue,minimumOrigTime);
+            if (k > 0)
+              rd.addField("source",sourceValues);
 
-          InputStream is = cache.getData(urlValue);
-          if (is != null)
-          {
-            try
+            // Add the categories now
+            String[] categoryValues = new String[categories.size()];
+            k = 0;
+            while (k < categoryValues.length)
             {
-              rd.setBinary(is,dataSize);
-              activities.ingestDocument(urlValue,version,ingestURL,rd);
+              categoryValues[k] = (String)categories.get(k);
+              k++;
             }
-            finally
+            if (k > 0)
+              rd.addField("category",categoryValues);
+
+            // The pubdates are a ms since epoch value; we want the minimum one for the origination time.
+            Long minimumOrigTime = null;
+            String[] pubDateValues = new String[pubDates.size()];
+            k = 0;
+            while (k < pubDates.size())
             {
+              String pubDate = (String)pubDates.get(k);
+              pubDateValues[k++] = pubDate;
               try
               {
-                is.close();
+                Long pubDateLong = new Long(pubDate);
+                if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue())
+                  minimumOrigTime = pubDateLong;
               }
-              catch (java.net.SocketTimeoutException e)
+              catch (NumberFormatException e)
               {
-                throw new ManifoldCFException("IO error closing stream: "+e.getMessage(),e);
+                // Do nothing; the version string seems to not mean anything
               }
-              catch (InterruptedIOException e)
+            }
+            if (k > 0)
+              rd.addField("pubdate",pubDateValues);
+
+            if (minimumOrigTime != null)
+              activities.setDocumentOriginationTime(urlValue,minimumOrigTime);
+
+            InputStream is = cache.getData(urlValue);
+            if (is != null)
+            {
+              try
               {
-                throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                rd.setBinary(is,dataSize);
+                activities.ingestDocument(urlValue,version,ingestURL,rd);
               }
-              catch (IOException e)
+              finally
               {
-                throw new ManifoldCFException("IO error closing stream: "+e.getMessage(),e);
+                try
+                {
+                  is.close();
+                }
+                catch (java.net.SocketTimeoutException e)
+                {
+                  throw new ManifoldCFException("IO error closing stream: "+e.getMessage(),e);
+                }
+                catch (InterruptedIOException e)
+                {
+                  throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                }
+                catch (IOException e)
+                {
+                  throw new ManifoldCFException("IO error closing stream: "+e.getMessage(),e);
+                }
               }
             }
           }
         }
+        else
+        {
+          // This is NOT quite the same as deleteDocument().  The deleteDocument() method removes the record, and
+          // thus the version string.  So, when that is used, we cannot tell if the document has changed; we simply have to try again.
+          activities.ingestDocument(urlValue,version,null,null);
+
+          if (Logging.connectors.isDebugEnabled())
+            Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because it cannot be indexed");
+        }
       }
       else
       {
@@ -5049,6 +5061,41 @@ public class RSSConnector extends org.ap
     return activities.checkMimeTypeIndexable(contentType);
   }
 
+  /** Code to check if an already-fetched document should be ingested.
+  */
+  protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier)
+    throws ServiceInterruption, ManifoldCFException
+  {
+    if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false)
+      return false;
+
+    if (activities.checkURLIndexable(documentIdentifier) == false)
+      return false;
+
+    // Check if it's a recognized content type
+    String contentType = cache.getContentType(documentIdentifier);
+
+    // Some sites have multiple content types.  We just look at the LAST one in that case.
+    if (contentType != null)
+    {
+      String[] contentTypes = contentType.split(",");
+      if (contentTypes.length > 0)
+        contentType = contentTypes[contentTypes.length-1].trim();
+      else
+        contentType = null;
+    }
+
+    if (contentType == null)
+      return false;
+
+    int pos = contentType.indexOf(";");
+    if (pos != -1)
+      contentType = contentType.substring(0,pos);
+    contentType = contentType.trim();
+
+    return activities.checkMimeTypeIndexable(contentType);
+  }
+
   /** Given the current parameters, find the correct throttled fetcher object
   * (or create one if not there).
   */

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DataCache.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DataCache.java?rev=1139390&r1=1139389&r2=1139390&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DataCache.java
(original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DataCache.java
Fri Jun 24 17:53:22 2011
@@ -36,7 +36,7 @@ public class DataCache
 
   // Hashmap containing the cache of files.
   // This is keyed by document identifier, and contains DocumentData objects.
-  protected HashMap cacheData = new HashMap();
+  protected Map<String,DocumentData> cacheData = new HashMap<String,DocumentData>();
 
   /** Constructor.
   */
@@ -220,7 +220,7 @@ public class DataCache
   */
   public synchronized int getResponseCode(String documentIdentifier)
   {
-    DocumentData dd = (DocumentData)cacheData.get(documentIdentifier);
+    DocumentData dd = cacheData.get(documentIdentifier);
     if (dd == null)
       return IThrottledConnection.FETCH_NOT_TRIED;
     return dd.getResponseCode();
@@ -232,7 +232,7 @@ public class DataCache
   */
   public synchronized String getContentType(String documentIdentifier)
   {
-    DocumentData dd = (DocumentData)cacheData.get(documentIdentifier);
+    DocumentData dd = cacheData.get(documentIdentifier);
     if (dd == null)
       return null;
     return dd.getContentType();
@@ -244,7 +244,7 @@ public class DataCache
   */
   public synchronized String getReferralURI(String documentIdentifier)
   {
-    DocumentData dd = (DocumentData)cacheData.get(documentIdentifier);
+    DocumentData dd = cacheData.get(documentIdentifier);
     if (dd == null)
       return null;
     return dd.getReferralURI();
@@ -256,7 +256,7 @@ public class DataCache
   */
   public synchronized long getDataLength(String documentIdentifier)
   {
-    DocumentData dd = (DocumentData)cacheData.get(documentIdentifier);
+    DocumentData dd = cacheData.get(documentIdentifier);
     if (dd == null)
       return 0L;
     return dd.getData().length();
@@ -269,7 +269,7 @@ public class DataCache
   public synchronized InputStream getData(String documentIdentifier)
     throws ManifoldCFException
   {
-    DocumentData dd = (DocumentData)cacheData.get(documentIdentifier);
+    DocumentData dd = cacheData.get(documentIdentifier);
     if (dd == null)
       return null;
     try
@@ -287,7 +287,7 @@ public class DataCache
   */
   public synchronized void deleteData(String documentIdentifier)
   {
-    DocumentData dd = (DocumentData)cacheData.remove(documentIdentifier);
+    DocumentData dd = cacheData.remove(documentIdentifier);
     if (dd != null)
     {
       ManifoldCF.deleteFile(dd.getData());

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1139390&r1=1139389&r2=1139390&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
(original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Fri Jun 24 17:53:22 2011
@@ -5114,7 +5114,7 @@ public class WebcrawlerConnector extends
     
     return activities.checkMimeTypeIndexable(contentType);
   }
-
+  
   /** Code to check if an already-fetched document should be ingested.
   */
   protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier)
@@ -5123,6 +5123,12 @@ public class WebcrawlerConnector extends
     if (cache.getResponseCode(documentIdentifier) != 200)
       return false;
 
+    if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false)
+      return false;
+
+    if (activities.checkURLIndexable(documentIdentifier) == false)
+      return false;
+
     // Check if it's a recognized content type
     String contentType = cache.getContentType(documentIdentifier);
 



Mime
View raw message