Return-Path: Delivered-To: apmail-jackrabbit-commits-archive@www.apache.org Received: (qmail 4548 invoked from network); 11 Aug 2008 12:09:17 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 11 Aug 2008 12:09:17 -0000 Received: (qmail 14376 invoked by uid 500); 11 Aug 2008 12:09:16 -0000 Delivered-To: apmail-jackrabbit-commits-archive@jackrabbit.apache.org Received: (qmail 14348 invoked by uid 500); 11 Aug 2008 12:09:16 -0000 Mailing-List: contact commits-help@jackrabbit.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@jackrabbit.apache.org Delivered-To: mailing list commits@jackrabbit.apache.org Received: (qmail 14339 invoked by uid 99); 11 Aug 2008 12:09:16 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 11 Aug 2008 05:09:16 -0700 X-ASF-Spam-Status: No, hits=-1998.9 required=10.0 tests=ALL_TRUSTED,FB_GET_MEDS X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 11 Aug 2008 12:08:20 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 876142388986; Mon, 11 Aug 2008 05:08:48 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r684732 - in /jackrabbit/trunk/jackrabbit-core/src: main/java/org/apache/jackrabbit/core/query/lucene/ main/resources/org/apache/jackrabbit/core/query/lucene/ test/java/org/apache/jackrabbit/core/query/ test/repository/workspaces/indexing-t... 
Date: Mon, 11 Aug 2008 12:08:41 -0000 To: commits@jackrabbit.apache.org From: mreutegg@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20080811120848.876142388986@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: mreutegg Date: Mon Aug 11 05:08:35 2008 New Revision: 684732 URL: http://svn.apache.org/viewvc?rev=684732&view=rev Log: JCR-1717: Configure occurrence of property value in excerpt Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.1.dtd jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/IndexingRuleTest.java jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/indexing-test/indexing-configuration.xml jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/indexing-test/workspace.xml Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java?rev=684732&r1=684731&r2=684732&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java (original) +++ 
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java Mon Aug 11 05:08:35 2008 @@ -21,6 +21,8 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.Set; +import java.util.Arrays; +import java.util.Collections; import org.apache.lucene.document.Field; import org.apache.lucene.index.TermPositionVector; @@ -150,15 +152,13 @@ int[] tvecindexes = tvec.indexesOf(terms, 0, terms.length); for (int i = 0; i < tvecindexes.length; i++) { TermVectorOffsetInfo[] termoffsets = tvec.getOffsets(tvecindexes[i]); - for (int ii = 0; ii < termoffsets.length; ii++) { - list.add(termoffsets[ii]); - } + list.addAll(Arrays.asList(termoffsets)); } TermVectorOffsetInfo[] offsets = (TermVectorOffsetInfo[]) list.toArray(new TermVectorOffsetInfo[list.size()]); // sort offsets if (terms.length > 1) { - java.util.Arrays.sort(offsets, new TermVectorOffsetInfoSorter()); + Arrays.sort(offsets, new TermVectorOffsetInfoSorter()); } return mergeFragments(offsets, text, excerptStart, @@ -175,43 +175,37 @@ String hlStart, String hlEnd, int maxFragments, - int surround) - throws IOException { - StringReader reader = new StringReader(text); + int surround) throws IOException { if (offsets == null || offsets.length == 0) { // nothing to highlight - StringBuffer excerpt = new StringBuffer(excerptStart); - excerpt.append(fragmentStart); - int min = excerpt.length(); - char[] buf = new char[surround * 2]; - int len = reader.read(buf); - excerpt.append(buf, 0, len); - if (len == buf.length) { - for (int i = excerpt.length() - 1; i > min; i--) { - if (Character.isWhitespace(excerpt.charAt(i))) { - excerpt.delete(i, excerpt.length()); - excerpt.append(" ..."); - break; - } - } - } - excerpt.append(fragmentEnd).append(excerptEnd); - return excerpt.toString(); + return createDefaultExcerpt(text, excerptStart, excerptEnd, + fragmentStart, fragmentEnd, surround * 2); } int lastOffset = offsets.length; // Math.min(10, offsets.length); // 10 
terms is plenty? ArrayList fragmentInfoList = new ArrayList(); - FragmentInfo fi = new FragmentInfo(offsets[0], surround * 2); - for (int i = 1; i < lastOffset; i++) { - if (fi.add(offsets[i])) { - continue; + if (offsets[0].getEndOffset() <= text.length()) { + FragmentInfo fi = new FragmentInfo(offsets[0], surround * 2); + for (int i = 1; i < lastOffset; i++) { + if (offsets[i].getEndOffset() > text.length()) { + break; + } + if (fi.add(offsets[i])) { + continue; + } + fragmentInfoList.add(fi); + fi = new FragmentInfo(offsets[i], surround * 2); } fragmentInfoList.add(fi); - fi = new FragmentInfo(offsets[i], surround * 2); } - fragmentInfoList.add(fi); + + if (fragmentInfoList.isEmpty()) { + // nothing to highlight + return createDefaultExcerpt(text, excerptStart, excerptEnd, + fragmentStart, fragmentEnd, surround * 2); + } // sort with score - java.util.Collections.sort(fragmentInfoList, new FragmentInfoScoreSorter()); + Collections.sort(fragmentInfoList, new FragmentInfoScoreSorter()); // extract best fragments ArrayList bestFragmentsList = new ArrayList(); @@ -220,9 +214,10 @@ } // re-sort with positions - java.util.Collections.sort(bestFragmentsList, new FragmentInfoPositionSorter()); + Collections.sort(bestFragmentsList, new FragmentInfoPositionSorter()); // merge #maxFragments fragments + StringReader reader = new StringReader(text); StringBuffer sb = new StringBuffer(excerptStart); int pos = 0; char[] cbuf; @@ -231,7 +226,7 @@ int skippedChars; int firstWhitespace; for (int i = 0; i < bestFragmentsList.size(); i++) { - fi = (FragmentInfo) bestFragmentsList.get(i); + FragmentInfo fi = (FragmentInfo) bestFragmentsList.get(i); fi.trim(); nextStart = fi.getStartOffset(); skip = nextStart - pos; @@ -360,6 +355,44 @@ return sb.toString(); } + /** + * Creates a default excerpt with the given text. + * + * @param text the text. + * @param excerptStart the excerpt start. + * @param excerptEnd the excerpt end. + * @param fragmentStart the fragment start. 
+ * @param fragmentEnd the fragment end. + * @param maxLength the maximum length of the fragment. + * @return a default excerpt. + * @throws IOException if an error occurs while reading from the text. + */ + protected String createDefaultExcerpt(String text, + String excerptStart, + String excerptEnd, + String fragmentStart, + String fragmentEnd, + int maxLength) throws IOException { + StringReader reader = new StringReader(text); + StringBuffer excerpt = new StringBuffer(excerptStart); + excerpt.append(fragmentStart); + int min = excerpt.length(); + char[] buf = new char[maxLength]; + int len = reader.read(buf); + excerpt.append(buf, 0, len); + if (len == buf.length) { + for (int i = excerpt.length() - 1; i > min; i--) { + if (Character.isWhitespace(excerpt.charAt(i))) { + excerpt.delete(i, excerpt.length()); + excerpt.append(" ..."); + break; + } + } + } + excerpt.append(fragmentEnd).append(excerptEnd); + return excerpt.toString(); + } + private static class FragmentInfo { ArrayList offsetInfosList; int startOffset; Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java?rev=684732&r1=684731&r2=684732&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java (original) +++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfiguration.java Mon Aug 11 05:08:35 2008 @@ -66,8 +66,8 @@ /** * Returns true if the property with the given name should be - * included in the node scope fulltext index. If there is not configuration - * entry for that propery false is returned. + * included in the node scope fulltext index. 
If there is no configuration + * entry for that property false is returned. * * @param state the node state. * @param propertyName the name of a property. @@ -77,6 +77,18 @@ boolean isIncludedInNodeScopeIndex(NodeState state, Name propertyName); /** + * Returns true if the content of the property with the given + * name should show up in an excerpt. If there is no configuration entry for + * that property true is returned. + * + * @param state the node state. + * @param propertyName the name of a property. + * @return true if the content of the property should be + * included in an excerpt; false otherwise. + */ + boolean useInExcerpt(NodeState state, Name propertyName); + + /** * Returns the boost value for the given property name. If there is no * configuration entry for the property name the {@link #DEFAULT_BOOST} is * returned. Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java?rev=684732&r1=684731&r2=684732&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java (original) +++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/IndexingConfigurationImpl.java Mon Aug 11 05:08:35 2008 @@ -270,6 +270,24 @@ return true; } + /** + * Returns true if the content of the property with the given + * name should show up in an excerpt. If there is no configuration entry for + * that property true is returned. + * + * @param state the node state. + * @param propertyName the name of a property. + * @return true if the content of the property should be + * included in an excerpt; false otherwise. 
+ */ + public boolean useInExcerpt(NodeState state, Name propertyName) { + IndexingRule rule = getApplicableIndexingRule(state); + if (rule != null) { + return rule.useInExcerpt(propertyName); + } + // none of the config elements matched -> default is to include + return true; + } /** * Returns the analyzer configured for the property with this fieldName @@ -398,7 +416,16 @@ regexp.getNodeValue()).booleanValue(); } - PropertyConfig pc = new PropertyConfig(boost, nodeScopeIndex); + // get useInExcerpt flag + boolean useInExcerpt = true; + Node excerpt = attributes.getNamedItem("useInExcerpt"); + if (excerpt != null) { + useInExcerpt = Boolean.valueOf( + excerpt.getNodeValue()).booleanValue(); + } + + PropertyConfig pc = new PropertyConfig( + boost, nodeScopeIndex, useInExcerpt); if (isRegexp) { namePatterns.add(new NamePattern( @@ -693,6 +720,24 @@ } /** + * Returns true if the content of the property with the + * given name should show up in an excerpt. If there is no configuration + * entry for that property true is returned. + * + * @param propertyName the name of a property. + * @return true if the content of the property should be + * included in an excerpt; false otherwise. + */ + public boolean useInExcerpt(Name propertyName) { + PropertyConfig config = getConfig(propertyName); + if (config != null) { + return config.useInExcerpt; + } else { + return true; + } + } + + /** * Returns true if this rule applies to the given node * state. * @@ -788,9 +833,18 @@ */ final boolean nodeScopeIndex; - PropertyConfig(float boost, boolean nodeScopeIndex) { + /** + * Flag that indicates whether the content of a property should be used + * to create an excerpt. 
+ */ + final boolean useInExcerpt; + + PropertyConfig(float boost, + boolean nodeScopeIndex, + boolean useInExcerpt) { this.boost = boost; this.nodeScopeIndex = nodeScopeIndex; + this.useInExcerpt = useInExcerpt; } } Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?rev=684732&r1=684731&r2=684732&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (original) +++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java Mon Aug 11 05:08:35 2008 @@ -48,6 +48,8 @@ import java.util.Calendar; import java.util.Iterator; import java.util.Set; +import java.util.List; +import java.util.ArrayList; /** * Creates a lucene Document object from a {@link javax.jcr.Node}. @@ -107,6 +109,12 @@ protected IndexFormatVersion indexFormatVersion = IndexFormatVersion.V1; /** + * List of {@link FieldNames#FULLTEXT} fields which should not be used in + * an excerpt. + */ + protected List doNotUseInExcerpt = new ArrayList(); + + /** * Creates a new node indexer. * * @param node the node state to index. @@ -169,6 +177,7 @@ * values from the ItemStateProvider. 
*/ protected Document createDoc() throws RepositoryException { + doNotUseInExcerpt.clear(); Document doc = new Document(); doc.setBoost(getNodeBoost()); @@ -237,6 +246,11 @@ throwRepositoryException(e); } } + + // now add fields that are not used in excerpt (must go at the end) + for (Iterator it = doNotUseInExcerpt.iterator(); it.hasNext(); ) { + doc.add((Field) it.next()); + } return doc; } @@ -328,7 +342,7 @@ } else { addStringValue(doc, fieldName, value.getString(), true, isIncludedInNodeIndex(name), - getPropertyBoost(name)); + getPropertyBoost(name), useInExcerpt(name)); } } break; @@ -612,10 +626,36 @@ * tokenized and added to the node scope fulltext * index. * @param boost the boost value for this string field. + * @deprecated use {@link #addStringValue(Document, String, Object, boolean, boolean, float, boolean)} instead. */ protected void addStringValue(Document doc, String fieldName, Object internalValue, boolean tokenized, boolean includeInNodeIndex, float boost) { + addStringValue(doc, fieldName, internalValue, tokenized, includeInNodeIndex, boost, true); + } + + /** + * Adds the string value to the document both as the named field and + * optionally for full text indexing if tokenized is + * true. + * + * @param doc The document to which to add the field + * @param fieldName The name of the field to add + * @param internalValue The value for the field to add to the + * document. + * @param tokenized If true the string is also + * tokenized and fulltext indexed. + * @param includeInNodeIndex If true the string is also + * tokenized and added to the node scope fulltext + * index. + * @param boost the boost value for this string field. + * @param useInExcerpt If true the string may show up in + * an excerpt. 
+ */ + protected void addStringValue(Document doc, String fieldName, + Object internalValue, boolean tokenized, + boolean includeInNodeIndex, float boost, + boolean useInExcerpt) { // simple String String stringValue = (String) internalValue; @@ -638,7 +678,13 @@ if (includeInNodeIndex) { // also create fulltext index of this value - doc.add(createFulltextField(stringValue)); + boolean store = supportHighlighting && useInExcerpt; + f = createFulltextField(stringValue, store, supportHighlighting); + if (useInExcerpt) { + doc.add(f); + } else { + doNotUseInExcerpt.add(f); + } } } } @@ -670,9 +716,30 @@ * * @param value the string value. * @return a lucene field. + * @deprecated use {@link #createFulltextField(String, boolean, boolean)} instead. */ protected Field createFulltextField(String value) { - if (supportHighlighting) { + return createFulltextField(value, supportHighlighting, supportHighlighting); + } + + /** + * Creates a fulltext field for the string value. + * + * @param value the string value. + * @param store if the value of the field should be stored. + * @param withOffsets if a term vector with offsets should be stored. + * @return a lucene field. 
+ */ + protected Field createFulltextField(String value, + boolean store, + boolean withOffsets) { + Field.TermVector tv; + if (withOffsets) { + tv = Field.TermVector.WITH_OFFSETS; + } else { + tv = Field.TermVector.NO; + } + if (store) { // store field compressed if greater than 16k Field.Store stored; if (value.length() > 0x4000) { @@ -681,10 +748,10 @@ stored = Field.Store.YES; } return new Field(FieldNames.FULLTEXT, value, stored, - Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS); + Field.Index.TOKENIZED, tv); } else { return new Field(FieldNames.FULLTEXT, value, - Field.Store.NO, Field.Index.TOKENIZED); + Field.Store.NO, Field.Index.TOKENIZED, tv); } } @@ -711,7 +778,7 @@ } finally { IOUtils.closeQuietly(value); } - return createFulltextField(textExtract.toString()); + return createFulltextField(textExtract.toString(), true, true); } else { return new Field(FieldNames.FULLTEXT, value); } @@ -750,6 +817,22 @@ } /** + * Returns true if the content of the property with the given + * name should be used to create an excerpt. + * + * @param propertyName the name of a property. + * @return true if it should be used to create an excerpt; + * false otherwise. + */ + protected boolean useInExcerpt(Name propertyName) { + if (indexingConfig == null) { + return true; + } else { + return indexingConfig.useInExcerpt(node, propertyName); + } + } + + /** * Returns the boost value for the given property name. * * @param propertyName the name of a property. 
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java?rev=684732&r1=684731&r2=684732&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java (original) +++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/WeightedHighlighter.java Mon Aug 11 05:08:35 2008 @@ -120,36 +120,32 @@ String hlStart, String hlEnd, int maxFragments, - int surround) { - + int surround) throws IOException { if (offsets == null || offsets.length == 0) { // nothing to highlight - StringBuffer excerpt = new StringBuffer(excerptStart); - excerpt.append(fragmentStart); - int min = excerpt.length(); - excerpt.append(text.substring(0, Math.min(text.length(), surround * 2))); - if (text.length() > excerpt.length()) { - for (int i = excerpt.length() - 1; i > min; i--) { - if (Character.isWhitespace(excerpt.charAt(i))) { - excerpt.delete(i, excerpt.length()); - excerpt.append(" ..."); - break; - } - } - } - excerpt.append(fragmentEnd).append(excerptEnd); - return excerpt.toString(); + return createDefaultExcerpt(text, excerptStart, excerptEnd, + fragmentStart, fragmentEnd, surround * 2); } PriorityQueue bestFragments = new FragmentInfoPriorityQueue(maxFragments); for (int i = 0; i < offsets.length; i++) { - FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2); - for (int j = i + 1; j < offsets.length; j++) { - if (!fi.add(offsets[j], text)) { - break; + if (offsets[i].getEndOffset() <= text.length()) { + FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2); + for (int j = i + 1; j < offsets.length; j++) { + if (offsets[j].getEndOffset() > text.length()) { + break; + } + if (!fi.add(offsets[j], 
text)) { + break; + } } + bestFragments.insert(fi); } - bestFragments.insert(fi); + } + + if (bestFragments.size() == 0) { + return createDefaultExcerpt(text, excerptStart, excerptEnd, + fragmentStart, fragmentEnd, surround * 2); } // retrieve fragment infos from queue and fill into list, least Modified: jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.1.dtd URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.1.dtd?rev=684732&r1=684731&r2=684732&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.1.dtd (original) +++ jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/indexing-configuration-1.1.dtd Mon Aug 11 05:08:35 2008 @@ -62,12 +62,15 @@ is not of type string. If isRegexp is set to true the name of the property is interpreted as a regular expression to match properties on a node. Please note that you may only use a regular expression for the local part of a - property name. + property name. The attribute useInExcerpt controls whether the contents + of the property is used to construct an excerpt. The default value for this + attribute is true. 
--> + isRegexp CDATA "false" + useInExcerpt CDATA "true"> Modified: jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/indexing-test/workspace.xml URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/indexing-test/workspace.xml?rev=684732&r1=684731&r2=684732&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/indexing-test/workspace.xml (original) +++ jackrabbit/trunk/jackrabbit-core/src/test/repository/workspaces/indexing-test/workspace.xml Mon Aug 11 05:08:35 2008 @@ -37,6 +37,8 @@ + +