cocoon-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Colin Britton" <cbrit...@metatomix.com>
Subject Patches for search indexing
Date Sun, 20 Jan 2002 01:04:56 GMT
Attached are three changes for the indexing capability provided by Cocoon
and Lucene. Details below:

1) The current code breaks any URLs that already have parameters, as it always
appends "?cocoon-view=" to the URL, which invalidates any existing parameters.
The patch provided checks for a "?" in the URL and only adds one if it does not
already exist. This applies to SimpleCocoonCrawlerImpl.java and in a similar way to
SimpleLuceneXMLIndexerImpl.java

2) Changed a System.out call to a logger entry in SimpleCocoonCrawlerImpl.java

3) Added a logger entry for when Lucene optimizes the index to
SimpleLuceneCocoonIndexerImpl.java

rgds
CB



Index: crawler/SimpleCocoonCrawlerImpl.java
===================================================================
RCS file:
/home/cvspublic/xml-cocoon2/src/java/org/apache/cocoon/components/crawler/Si
mpleCocoonCrawlerImpl.java,v
retrieving revision 1.1
diff -u -r1.1 SimpleCocoonCrawlerImpl.java
--- crawler/SimpleCocoonCrawlerImpl.java 3 Jan 2002 12:31:09 -0000 1.1
+++ crawler/SimpleCocoonCrawlerImpl.java 20 Jan 2002 00:49:41 -0000
@@ -58,7 +58,7 @@
* @since
*/
public final String LINK_CONTENT_TYPE_DEFAULT =
"application/x-cocoon-links";
-
+
/**
* Config element name specifying query-string appendend for requesting links
* of an URL.
@@ -77,7 +77,7 @@
*
* @since
*/
- public final static String LINK_VIEW_QUERY_DEFAULT = "?cocoon-view=links";
+ public final static String LINK_VIEW_QUERY_DEFAULT = "&cocoon-view=links";
/**
* Config element name specifying excluding regular expression pattern.
@@ -199,7 +199,7 @@
this.includeCrawlingURL.add(new RE(tokenized_pattern));
}
} catch (RESyntaxException rese) {
- getLogger().error("Cannot create includeing regular-expression for " +
+ getLogger().error("Cannot create includeing regular-expression for " +
pattern, rese);
}
}
@@ -217,7 +217,7 @@
this.excludeCrawlingURL.add(new RE(tokenized_pattern));
}
} catch (RESyntaxException rese) {
- getLogger().error("Cannot create excluding regular-expression for " +
+ getLogger().error("Cannot create excluding regular-expression for " +
pattern, rese);
}
}
@@ -416,6 +416,9 @@
// get links of url
try {
+ if (url.toString().indexOf("?")==-1){
+ linkViewQuery = "?" + linkViewQuery;
+ }
URL links_url = new URL(url, url.getPath() + linkViewQuery);
URLConnection links_url_connection = links_url.openConnection();
InputStream is = links_url_connection.getInputStream();

Index: search/SimpleLuceneCocoonIndexerImpl.java
===================================================================
RCS file:
/home/cvspublic/xml-cocoon2/src/java/org/apache/cocoon/components/search/Sim
pleLuceneCocoonIndexerImpl.java,v
retrieving revision 1.1
diff -u -r1.1 SimpleLuceneCocoonIndexerImpl.java
--- search/SimpleLuceneCocoonIndexerImpl.java 3 Jan 2002 12:31:13 -0000 1.1
+++ search/SimpleLuceneCocoonIndexerImpl.java 20 Jan 2002 00:49:42 -0000
@@ -198,8 +198,10 @@
// skip urls using different host, or port than host,
// or port of base url
- System.out.println("Skipping carwling URL " + crawl_url.toString() +
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("Skipping crawling URL " + crawl_url.toString() +
" as base_url is " + base_url.toString());
+ }
continue;
}
@@ -212,9 +214,15 @@
Document document = (Document) i.next();
writer.addDocument(document);
}
- }
- // optimize it
+
+ }
+ // optimize it
writer.optimize();
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("Optimizing index" );
+ }
+
+
} catch (IOException ioe) {
throw new ProcessingException("IOException in index()", ioe);
} catch (ComponentException ce) {

Index: search/SimpleLuceneXMLIndexerImpl.java
===================================================================
RCS file:
/home/cvspublic/xml-cocoon2/src/java/org/apache/cocoon/components/search/Sim
pleLuceneXMLIndexerImpl.java,v
retrieving revision 1.1
diff -u -r1.1 SimpleLuceneXMLIndexerImpl.java
--- search/SimpleLuceneXMLIndexerImpl.java 3 Jan 2002 12:31:13 -0000 1.1
+++ search/SimpleLuceneXMLIndexerImpl.java 20 Jan 2002 00:49:42 -0000
@@ -29,7 +29,6 @@
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLoggable;
-import org.apache.avalon.framework.logger.AbstractLoggable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.thread.ThreadSafe;
import org.apache.cocoon.ProcessingException;
@@ -80,7 +79,7 @@
*
* @since
*/
- final String CONTENT_QUERY = "?cocoon-view=content";
+ final String CONTENT_QUERY = "&cocoon-view=content";
/**
* set of allowed content types
@@ -163,7 +162,14 @@
throws ProcessingException {
try {
- URL contentURL = new URL(url, url.getPath() + CONTENT_QUERY);
+
+ String contentQuery = CONTENT_QUERY;
+
+ if (url.toString().indexOf("?")==-1){
+ contentQuery = "?" + contentQuery;
+ }
+
+ URL contentURL = new URL(url, url.getPath() + contentQuery);
URLConnection contentURLConnection = contentURL.openConnection();
String contentType = contentURLConnection.getContentType();
if (contentType != null &&


---------------------------------------------------------------------
To unsubscribe, e-mail: cocoon-dev-unsubscribe@xml.apache.org
For additional commands, email: cocoon-dev-help@xml.apache.org


Mime
View raw message