lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From k...@apache.org
Subject svn commit: r1225120 - in /lucene/dev/trunk/solr/contrib/extraction: CHANGES.txt src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
Date Wed, 28 Dec 2011 07:17:55 GMT
Author: koji
Date: Wed Dec 28 07:17:55 2011
New Revision: 1225120

URL: http://svn.apache.org/viewvc?rev=1225120&view=rev
Log:
SOLR-2346: Add a chance to set content encoding explicitly via content type of stream.

Modified:
    lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java

Modified: lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt?rev=1225120&r1=1225119&r2=1225120&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt Wed Dec 28 07:17:55 2011
@@ -30,7 +30,9 @@ $Id$
 
 ================== Release 3.6.0 ==================
 
-(No Changes)
+* SOLR-2346: Add a chance to set content encoding explicitly via content type of stream.
+  This is convenient when Tika's auto detector cannot detect the encoding, especially
+  when the text file is too short for the encoding to be detected. (koji)
 
 ================== Release 3.5.0 ==================
 

Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1225120&r1=1225119&r2=1225120&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Dec 28 07:17:55 2011
@@ -26,6 +26,7 @@ import org.apache.solr.common.SolrExcept
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.solr.request.SolrQueryRequest;
@@ -158,6 +159,12 @@ public class ExtractingDocumentLoader ex
         metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
         metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
         metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
+        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
+        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
+        if(charset != null){
+          metadata.add(Metadata.CONTENT_ENCODING, charset);
+        }
+
         String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
         boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
         SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, schema);



Mime
View raw message