lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Aman Tandon <amantandon...@gmail.com>
Subject Re: Help: Problem in customized token filter
Date Thu, 18 Jun 2015 15:33:33 GMT
Please help. What am I doing wrong here? Please guide me.

With Regards
Aman Tandon

On Thu, Jun 18, 2015 at 4:51 PM, Aman Tandon <amantandon.10@gmail.com>
wrote:

> Hi,
>
> I created a *token concat filter* to concatenate all the tokens from the
> token stream. It creates the concatenated token as expected.
>
> But when I post an XML file containing more than 30,000 documents,
> only the first document has data in that field.
>
> *Schema:*
>
> *<field name="titlex" type="text" indexed="true" stored="false"
>> required="false" omitNorms="false" multiValued="false" />*
>
>
>
>
>
>
>> *<fieldType name="text" class="solr.TextField"
>> positionIncrementGap="100">*
>> *      <analyzer type="index">*
>> *        <charFilter class="solr.HTMLStripCharFilterFactory"/>*
>> *        <tokenizer class="solr.StandardTokenizerFactory"/>*
>> *        <filter class="solr.WordDelimiterFilterFactory"
>> generateWordParts="1" generateNumberParts="1" catenateWords="0"
>> catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>*
>> *        <filter class="solr.LowerCaseFilterFactory"/>*
>> *        <filter class="solr.ShingleFilterFactory" maxShingleSize="3"
>> outputUnigrams="true" tokenSeparator=""/>*
>> *        <filter class="solr.SnowballPorterFilterFactory"
>> language="English" protected="protwords.txt"/>*
>> *        <filter
>> class="com.xyz.analysis.concat.ConcatenateWordsFilterFactory"/>*
>> *        <filter class="solr.SynonymFilterFactory"
>> synonyms="stemmed_synonyms_text_prime_ex_index.txt" ignoreCase="true"
>> expand="true"/>*
>> *      </analyzer>*
>> *      <analyzer type="query">*
>> *        <tokenizer class="solr.StandardTokenizerFactory"/>*
>> *        <filter class="solr.SynonymFilterFactory"
>> synonyms="synonyms.txt" ignoreCase="true" expand="true"/>*
>> *        <filter class="solr.StopFilterFactory" ignoreCase="true"
>> words="stopwords_text_prime_search.txt" enablePositionIncrements="true" />*
>> *        <filter class="solr.WordDelimiterFilterFactory"
>> generateWordParts="1" generateNumberParts="1" catenateWords="0"
>> catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>*
>> *        <filter class="solr.LowerCaseFilterFactory"/>*
>> *        <filter class="solr.SnowballPorterFilterFactory"
>> language="English" protected="protwords.txt"/>*
>> *        <filter
>> class="com.xyz.analysis.concat.ConcatenateWordsFilterFactory"/>*
>> *      </analyzer>**    </fieldType>*
>
>
> Please help me. The code for the filter is below; please take a look.
>
> Here is a picture of what the filter is doing:
> <http://i.imgur.com/THCsYtG.png?1>
>
> The code of the concat filter is:
>
> *package com.xyz.analysis.concat;*
>>
>> *import java.io.IOException;*
>>
>>
>>> *import org.apache.lucene.analysis.TokenFilter;*
>>
>> *import org.apache.lucene.analysis.TokenStream;*
>>
>> *import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;*
>>
>> *import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;*
>>
>> *import
>>> org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;*
>>
>> *import org.apache.lucene.analysis.tokenattributes.TypeAttribute;*
>>
>>
>>> *public class ConcatenateWordsFilter extends TokenFilter {*
>>
>>
>>> *  private CharTermAttribute charTermAttribute =
>>> addAttribute(CharTermAttribute.class);*
>>
>> *  private OffsetAttribute offsetAttribute =
>>> addAttribute(OffsetAttribute.class);*
>>
>> *  PositionIncrementAttribute posIncr =
>>> addAttribute(PositionIncrementAttribute.class);*
>>
>> *  TypeAttribute typeAtrr = addAttribute(TypeAttribute.class);*
>>
>>
>>> *  private StringBuilder stringBuilder = new StringBuilder();*
>>
>> *  private boolean exhausted = false;*
>>
>>
>>> *  /***
>>
>> *   * Creates a new ConcatenateWordsFilter*
>>
>> *   * @param input TokenStream that will be filtered*
>>
>> *   */*
>>
>> *  public ConcatenateWordsFilter(TokenStream input) {*
>>
>> *    super(input);*
>>
>> *  }*
>>
>>
>>> *  /***
>>
>> *   * {@inheritDoc}*
>>
>> *   */*
>>
>> *  @Override*
>>
>> *  public final boolean incrementToken() throws IOException {*
>>
>> *    while (!exhausted && input.incrementToken()) {*
>>
>> *      char terms[] = charTermAttribute.buffer();*
>>
>> *      int termLength = charTermAttribute.length();*
>>
>> *      if(typeAtrr.type().equals("<ALPHANUM>")){*
>>
>> *     stringBuilder.append(terms, 0, termLength);*
>>
>> *      }*
>>
>> *      charTermAttribute.copyBuffer(terms, 0, termLength);*
>>
>> *      return true;*
>>
>> *    }*
>>
>>
>>> *    if (!exhausted) {*
>>
>> *      exhausted = true;*
>>
>> *      String sb = stringBuilder.toString();*
>>
>> *      System.err.println("The Data got is "+sb);*
>>
>> *      int sbLength = sb.length();*
>>
>> *      //posIncr.setPositionIncrement(0);*
>>
>> *      charTermAttribute.copyBuffer(sb.toCharArray(), 0, sbLength);*
>>
>> *      offsetAttribute.setOffset(offsetAttribute.startOffset(),
>>> offsetAttribute.startOffset()+sbLength);*
>>
>> *      stringBuilder.setLength(0);*
>>
>> *      //typeAtrr.setType("CONCATENATED");*
>>
>> *      return true;*
>>
>> *    }*
>>
>> *    return false;*
>>
>> *  }*
>>
>> *}*
>>
>>
>
> With Regards
> Aman Tandon
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message