incubator-blur-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From amccu...@apache.org
Subject [27/28] git commit: Fixes to highlighting code. Now the highlighting code uses the standard document fetch code.
Date Wed, 15 Jan 2014 22:03:50 GMT
Fixes to highlighting code.  Now the highlighting code uses the standard document fetch code.


Project: http://git-wip-us.apache.org/repos/asf/incubator-blur/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-blur/commit/2be7ad3e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-blur/tree/2be7ad3e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-blur/diff/2be7ad3e

Branch: refs/heads/master
Commit: 2be7ad3e7a58dc7c1f6fad4df663ef1837bcccd9
Parents: 1919f18
Author: Aaron McCurry <amccurry@gmail.com>
Authored: Wed Jan 15 16:44:04 2014 -0500
Committer: Aaron McCurry <amccurry@gmail.com>
Committed: Wed Jan 15 17:02:59 2014 -0500

----------------------------------------------------------------------
 .../apache/blur/manager/BlurHighlighter.java    | 94 ++++++++++++++++++++
 .../org/apache/blur/manager/IndexManager.java   | 24 ++---
 .../blur/manager/writer/MutatableAction.java    |  8 +-
 .../java/org/apache/blur/utils/BlurUtil.java    | 17 +++-
 .../org/apache/blur/utils/HighlightHelper.java  | 52 -----------
 .../org/apache/blur/utils/BlurUtilsTest.java    |  8 +-
 .../org/apache/blur/mapreduce/BlurReducer.java  |  2 +-
 7 files changed, 123 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/2be7ad3e/blur-core/src/main/java/org/apache/blur/manager/BlurHighlighter.java
----------------------------------------------------------------------
diff --git a/blur-core/src/main/java/org/apache/blur/manager/BlurHighlighter.java b/blur-core/src/main/java/org/apache/blur/manager/BlurHighlighter.java
new file mode 100644
index 0000000..bc5d8ba
--- /dev/null
+++ b/blur-core/src/main/java/org/apache/blur/manager/BlurHighlighter.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.blur.manager;
+
+import java.io.IOException;
+
+import org.apache.blur.analysis.FieldManager;
+import org.apache.blur.thrift.generated.HighlightOptions;
+import org.apache.blur.thrift.generated.Selector;
+import org.apache.blur.utils.HighlightHelper;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.SegmentReader;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
+
+public class BlurHighlighter {
+
+  private final String _preTag;
+  private final String _postTag;
+  private final Query _highlightQuery;
+  private final FieldManager _fieldManager;
+  private final boolean _shouldHighlight;
+
+  public BlurHighlighter(Query highlightQuery, FieldManager fieldManager, Selector selector) {
+    HighlightOptions highlightOptions = selector.getHighlightOptions();
+    if (highlightOptions != null) {
+      _preTag = highlightOptions.getPreTag();
+      _postTag = highlightOptions.getPostTag();
+      _highlightQuery = highlightQuery;
+      _fieldManager = fieldManager;
+      _shouldHighlight = true;
+    } else {
+      _preTag = null;
+      _postTag = null;
+      _highlightQuery = null;
+      _fieldManager = null;
+      _shouldHighlight = false;
+    }
+  }
+
+  public BlurHighlighter() {
+    _preTag = null;
+    _postTag = null;
+    _highlightQuery = null;
+    _fieldManager = null;
+    _shouldHighlight = false;
+  }
+
+  public boolean shouldHighlight() {
+    return _shouldHighlight;
+  }
+
+  public String getPreTag() {
+    return _preTag;
+  }
+
+  public String getPostTag() {
+    return _postTag;
+  }
+
+  public Query getHighlightQuery() {
+    return _highlightQuery;
+  }
+
+  public FieldManager getFieldManager() {
+    return _fieldManager;
+  }
+
+  public Document highlight(int docID, Document document, SegmentReader segmentReader) throws IOException {
+    Document highlight;
+    try {
+      highlight = HighlightHelper.highlight(docID, document, _highlightQuery, _fieldManager, segmentReader, _preTag,
+          _postTag);
+    } catch (InvalidTokenOffsetsException e) {
+      throw new IOException(e);
+    }
+    return highlight;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/2be7ad3e/blur-core/src/main/java/org/apache/blur/manager/IndexManager.java
----------------------------------------------------------------------
diff --git a/blur-core/src/main/java/org/apache/blur/manager/IndexManager.java b/blur-core/src/main/java/org/apache/blur/manager/IndexManager.java
index 66689df..f8a23cf 100644
--- a/blur-core/src/main/java/org/apache/blur/manager/IndexManager.java
+++ b/blur-core/src/main/java/org/apache/blur/manager/IndexManager.java
@@ -690,25 +690,11 @@ public class IndexManager {
             List<Document> docs;
             AtomicBoolean moreDocsToFetch = new AtomicBoolean(false);
             AtomicInteger totalRecords = new AtomicInteger();
-            if (highlightQuery != null && fieldManager != null) {
-              String rowId = selector.getRowId();
-              if (rowId == null) {
-                rowId = getRowId(reader, docId);
-              }
-              Term term = new Term(ROW_ID, rowId);
-              HighlightOptions highlightOptions = selector.getHighlightOptions();
-              String preTag = highlightOptions.getPreTag();
-              String postTag = highlightOptions.getPostTag();
-              Tracer docTrace = Trace.trace("fetchRow - Document w/Highlight read");
-              docs = HighlightHelper.highlightDocuments(reader, term, fieldVisitor, selector, highlightQuery,
-                  fieldManager, preTag, postTag, filter);
-              docTrace.done();
-            } else {
-              Tracer docTrace = Trace.trace("fetchRow - Document read");
-              docs = BlurUtil.fetchDocuments(reader, fieldVisitor, selector, maxHeap, table + "/" + shard,
-                  tableContext.getDefaultPrimeDocTerm(), filter, moreDocsToFetch, totalRecords);
-              docTrace.done();
-            }
+            BlurHighlighter highlighter = new BlurHighlighter(highlightQuery, fieldManager, selector);
+            Tracer docTrace = Trace.trace("fetchRow - Document read");
+            docs = BlurUtil.fetchDocuments(reader, fieldVisitor, selector, maxHeap, table + "/" + shard,
+                tableContext.getDefaultPrimeDocTerm(), filter, moreDocsToFetch, totalRecords, highlighter);
+            docTrace.done();
             Tracer rowTrace = Trace.trace("fetchRow - Row create");
             Row row = getRow(docs);
             if (row == null) {

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/2be7ad3e/blur-core/src/main/java/org/apache/blur/manager/writer/MutatableAction.java
----------------------------------------------------------------------
diff --git a/blur-core/src/main/java/org/apache/blur/manager/writer/MutatableAction.java b/blur-core/src/main/java/org/apache/blur/manager/writer/MutatableAction.java
index 3e09823..70cb377 100644
--- a/blur-core/src/main/java/org/apache/blur/manager/writer/MutatableAction.java
+++ b/blur-core/src/main/java/org/apache/blur/manager/writer/MutatableAction.java
@@ -116,7 +116,7 @@ public class MutatableAction {
           AtomicBoolean moreDocsToFetch = new AtomicBoolean(false);
           AtomicInteger totalRecords = new AtomicInteger();
           List<Document> docs = new ArrayList<Document>(BlurUtil.fetchDocuments(reader,
fieldVisitor, selector,
-              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords));
+              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords, null));
           if (moreDocsToFetch.get()) {
             throw new IOException("Row too large to update.");
           }
@@ -170,7 +170,7 @@ public class MutatableAction {
           AtomicBoolean moreDocsToFetch = new AtomicBoolean(false);
           AtomicInteger totalRecords = new AtomicInteger();
           List<Document> docs = new ArrayList<Document>(BlurUtil.fetchDocuments(reader,
fieldVisitor, selector,
-              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords));
+              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords, null));
           if (moreDocsToFetch.get()) {
             throw new IOException("Row too large to update.");
           }
@@ -229,7 +229,7 @@ public class MutatableAction {
           AtomicBoolean moreDocsToFetch = new AtomicBoolean(false);
           AtomicInteger totalRecords = new AtomicInteger();
           List<Document> docs = new ArrayList<Document>(BlurUtil.fetchDocuments(reader,
fieldVisitor, selector,
-              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords));
+              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords, null));
           if (moreDocsToFetch.get()) {
             throw new IOException("Row too large to update.");
           }
@@ -300,7 +300,7 @@ public class MutatableAction {
           AtomicBoolean moreDocsToFetch = new AtomicBoolean(false);
           AtomicInteger totalRecords = new AtomicInteger();
           List<Document> docs = new ArrayList<Document>(BlurUtil.fetchDocuments(reader,
fieldVisitor, selector,
-              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords));
+              _maxHeap, _table + "/" + _shard, _primeDocTerm, null, moreDocsToFetch, totalRecords, null));
           if (moreDocsToFetch.get()) {
             throw new IOException("Row too large to update.");
           }

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/2be7ad3e/blur-core/src/main/java/org/apache/blur/utils/BlurUtil.java
----------------------------------------------------------------------
diff --git a/blur-core/src/main/java/org/apache/blur/utils/BlurUtil.java b/blur-core/src/main/java/org/apache/blur/utils/BlurUtil.java
index 50634fa..aecf229 100644
--- a/blur-core/src/main/java/org/apache/blur/utils/BlurUtil.java
+++ b/blur-core/src/main/java/org/apache/blur/utils/BlurUtil.java
@@ -66,6 +66,7 @@ import org.apache.blur.index.ExitableReader.ExitableFilterAtomicReader;
 import org.apache.blur.log.Log;
 import org.apache.blur.log.LogFactory;
 import org.apache.blur.lucene.search.PrimeDocCache;
+import org.apache.blur.manager.BlurHighlighter;
 import org.apache.blur.manager.clusterstatus.ZookeeperPathConstants;
 import org.apache.blur.manager.results.BlurResultComparator;
 import org.apache.blur.manager.results.BlurResultIterable;
@@ -800,13 +801,17 @@ public class BlurUtil {
    * @param primeDocTerm
    * @param filter
    * @param totalRecords
+   * @param highlighter
    * 
    * @throws IOException
    */
   @SuppressWarnings("unchecked")
   public static List<Document> fetchDocuments(IndexReader reader, ResetableDocumentStoredFieldVisitor fieldSelector,
       Selector selector, int maxHeap, String context, Term primeDocTerm, Filter filter, AtomicBoolean moreToFetch,
-      AtomicInteger totalRecords) throws IOException {
+      AtomicInteger totalRecords, BlurHighlighter highlighter) throws IOException {
+    if (highlighter == null) {
+      highlighter = new BlurHighlighter();
+    }
     if (reader instanceof BaseCompositeReader) {
       BaseCompositeReader<IndexReader> indexReader = (BaseCompositeReader<IndexReader>) reader;
       List<? extends IndexReader> sequentialSubReaders = BaseCompositeReaderUtil.getSequentialSubReaders(indexReader);
@@ -858,8 +863,14 @@ public class BlurUtil {
             }
             if (docsInRowSpanToFetch.fastGet(cursor)) {
               maxDocsToFetch--;
-              segmentReader.document(primeDocId + cursor, fieldSelector);
-              docs.add(fieldSelector.getDocument());
+              int docID = primeDocId + cursor;
+              segmentReader.document(docID, fieldSelector);
+              Document document = fieldSelector.getDocument();
+              if (highlighter.shouldHighlight()) {
+                docs.add(highlighter.highlight(docID, document, segmentReader));
+              } else {
+                docs.add(document);
+              }
               totalHeap += fieldSelector.getSize();
               fieldSelector.reset();
             }

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/2be7ad3e/blur-core/src/main/java/org/apache/blur/utils/HighlightHelper.java
----------------------------------------------------------------------
diff --git a/blur-core/src/main/java/org/apache/blur/utils/HighlightHelper.java b/blur-core/src/main/java/org/apache/blur/utils/HighlightHelper.java
index 5ae9baa..7ed52c1 100644
--- a/blur-core/src/main/java/org/apache/blur/utils/HighlightHelper.java
+++ b/blur-core/src/main/java/org/apache/blur/utils/HighlightHelper.java
@@ -17,16 +17,12 @@ package org.apache.blur.utils;
  * limitations under the License.
  */
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
 
 import org.apache.blur.analysis.FieldManager;
-import org.apache.blur.log.Log;
-import org.apache.blur.log.LogFactory;
 import org.apache.blur.lucene.search.SuperQuery;
-import org.apache.blur.thrift.generated.Selector;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
@@ -37,8 +33,6 @@ import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Filter;
-import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.NumericRangeQuery;
 import org.apache.lucene.search.PhraseQuery;
@@ -46,7 +40,6 @@ import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
-import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.highlight.Highlighter;
 import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
@@ -58,8 +51,6 @@ import org.apache.lucene.util.BytesRef;
 
 public class HighlightHelper {
 
-  private static final Log LOG = LogFactory.getLog(HighlightHelper.class);
-
   private static final Collection<String> FIELDS_NOT_TO_HIGHLIGHT = new HashSet<String>() {
     private static final long serialVersionUID = 1L;
     {
@@ -70,49 +61,6 @@ public class HighlightHelper {
     }
   };
 
-  public static List<Document> highlightDocuments(IndexReader reader, Term term,
-      ResetableDocumentStoredFieldVisitor fieldSelector, Selector selector, Query highlightQuery,
-      FieldManager fieldManager, String preTag, String postTag, Filter filter) throws IOException {
-    IndexSearcher indexSearcher = new IndexSearcher(reader);
-    int docFreq = reader.docFreq(term);
-    BooleanQuery booleanQueryForFamily = null;
-    BooleanQuery booleanQuery = null;
-    if (selector.getColumnFamiliesToFetchSize() > 0) {
-      booleanQueryForFamily = new BooleanQuery();
-      for (String familyName : selector.getColumnFamiliesToFetch()) {
-        booleanQueryForFamily
-            .add(new TermQuery(new Term(BlurConstants.FAMILY, familyName)), BooleanClause.Occur.SHOULD);
-      }
-      booleanQuery = new BooleanQuery();
-      booleanQuery.add(new TermQuery(term), BooleanClause.Occur.MUST);
-      booleanQuery.add(booleanQueryForFamily, BooleanClause.Occur.MUST);
-    }
-    Query query = booleanQuery == null ? new TermQuery(term) : booleanQuery;
-    TopDocs topDocs = indexSearcher.search(query, filter, docFreq);
-    int totalHits = topDocs.totalHits;
-    List<Document> docs = new ArrayList<Document>();
-
-    int start = selector.getStartRecord();
-    int end = selector.getMaxRecordsToFetch() + start;
-
-    for (int i = start; i < end; i++) {
-      if (i >= totalHits) {
-        break;
-      }
-      int doc = topDocs.scoreDocs[i].doc;
-      indexSearcher.doc(doc, fieldSelector);
-      Document document = fieldSelector.getDocument();
-      try {
-        document = highlight(doc, document, highlightQuery, fieldManager, reader, preTag, postTag);
-      } catch (InvalidTokenOffsetsException e) {
-        LOG.error("Unknown error while tring to highlight", e);
-      }
-      docs.add(document);
-      fieldSelector.reset();
-    }
-    return docs;
-  }
-
   /**
    * NOTE: This method will not preserve the correct field types.
    * 

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/2be7ad3e/blur-core/src/test/java/org/apache/blur/utils/BlurUtilsTest.java
----------------------------------------------------------------------
diff --git a/blur-core/src/test/java/org/apache/blur/utils/BlurUtilsTest.java b/blur-core/src/test/java/org/apache/blur/utils/BlurUtilsTest.java
index 4e2b697..6e6d505 100644
--- a/blur-core/src/test/java/org/apache/blur/utils/BlurUtilsTest.java
+++ b/blur-core/src/test/java/org/apache/blur/utils/BlurUtilsTest.java
@@ -181,7 +181,7 @@ public class BlurUtilsTest {
     AtomicInteger totalRecords = new AtomicInteger();
     List<Document> docs = BlurUtil.fetchDocuments(getReader(), resetableDocumentStoredFieldVisitor, selector, 10000000,
         "test-context", new Term(BlurConstants.PRIME_DOC, BlurConstants.PRIME_DOC_VALUE), null, moreDocsToFetch,
-        totalRecords);
+        totalRecords, null);
     assertEquals(docs.size(), 1);
     assertFalse(moreDocsToFetch.get());
     assertEquals(1, totalRecords.get());
@@ -196,13 +196,15 @@ public class BlurUtilsTest {
     columnFamiliesToFetch.add("f1");
     columnFamiliesToFetch.add("f2");
     selector.setColumnFamiliesToFetch(columnFamiliesToFetch);
+    selector.addToOrderOfFamiliesToFetch("f1");
+    selector.addToOrderOfFamiliesToFetch("f2");
 
     ResetableDocumentStoredFieldVisitor resetableDocumentStoredFieldVisitor = new ResetableDocumentStoredFieldVisitor();
     AtomicBoolean moreDocsToFetch = new AtomicBoolean(false);
     AtomicInteger totalRecords = new AtomicInteger();
     List<Document> docs = BlurUtil.fetchDocuments(getReaderWithDocsHavingFamily(), resetableDocumentStoredFieldVisitor,
         selector, 10000000, "test-context", new Term(BlurConstants.PRIME_DOC, BlurConstants.PRIME_DOC_VALUE), null,
-        moreDocsToFetch, totalRecords);
+        moreDocsToFetch, totalRecords, null);
     assertEquals(docs.size(), 2);
     assertEquals(docs.get(0).getField("family").stringValue(), "f1");
     assertEquals(docs.get(1).getField("family").stringValue(), "f2");
@@ -219,7 +221,7 @@ public class BlurUtilsTest {
     AtomicInteger totalRecords = new AtomicInteger();
     List<Document> docs = BlurUtil.fetchDocuments(getReader(), resetableDocumentStoredFieldVisitor, selector, 10000000,
         "test-context", new Term(BlurConstants.PRIME_DOC, BlurConstants.PRIME_DOC_VALUE), null, moreDocsToFetch,
-        totalRecords);
+        totalRecords, null);
     assertEquals(docs.size(), 2);
     assertFalse(moreDocsToFetch.get());
     assertEquals(2, totalRecords.get());

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/2be7ad3e/blur-mapred/src/main/java/org/apache/blur/mapreduce/BlurReducer.java
----------------------------------------------------------------------
diff --git a/blur-mapred/src/main/java/org/apache/blur/mapreduce/BlurReducer.java b/blur-mapred/src/main/java/org/apache/blur/mapreduce/BlurReducer.java
index d31c7fb..d0e04b0 100644
--- a/blur-mapred/src/main/java/org/apache/blur/mapreduce/BlurReducer.java
+++ b/blur-mapred/src/main/java/org/apache/blur/mapreduce/BlurReducer.java
@@ -255,7 +255,7 @@ public class BlurReducer extends Reducer<Text, BlurMutate, Text, BlurMutate> {
     AtomicInteger totalRecords = new AtomicInteger();
     List<Document> docs = BlurUtil.fetchDocuments(_reader, new ResetableDocumentStoredFieldVisitor(), new Selector()
         .setRowId(_rowIdTerm.text()), Integer.MAX_VALUE, "reducer-context", new Term(BlurConstants.PRIME_DOC,
-        BlurConstants.PRIME_DOC_VALUE), null, moreDocsToFetch, totalRecords);
+        BlurConstants.PRIME_DOC_VALUE), null, moreDocsToFetch, totalRecords, null);
     if (moreDocsToFetch.get()) {
       throw new IOException("Row too large to update.");
     }


Mime
View raw message