lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Markus Jelsma <markus.jel...@openindex.io>
Subject RE: Question on "other language" than english stemmers and using both
Date Tue, 27 Feb 2018 10:11:50 GMT
Hello,

Mixing language-specific filters in the same analyzer is not going to give predictable or
desirable results. Instead, create separate text_en and text_de fieldTypes and fields. See
Solr's default schema.xml; it has many examples for various languages.

Depending on what query parser you use, you need to make sure you search on both fields now.

Regards,
Markus
 
-----Original message-----
> From:TG Servers <srvrs@prvtmail.net>
> Sent: Tuesday 27th February 2018 8:26
> To: solr-user@lucene.apache.org
> Subject: Question on "other language" than english stemmers and using both
> 
> Hi,
> 
> I currently adapted this schema.xml for dovecot and Solr 7.2.1.
> Now this is stemming only english words.
> What do I have to do to use it for english AND german?
> Can I just append the corresponding German filter factories to it, or
> does that not work?
> E.g.
> ...
> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
> <filter class="solr.EnglishMinimalStemFilterFactory"/>
> <filter class="solr.GermanMinimalStemFilterFactory"/>
> ...
> 
> Thanks,
> Thomas
> 
> Original schema :
> 
> <schema name="dovecot" version="1.5">
> <types>
> <!-- IMAP has 32bit unsigned ints but java ints are signed, so use longs -->
> <fieldType name="string" class="solr.StrField" />
> <fieldType name="long" class="solr.LongPointField" />
> <fieldType name="boolean" class="solr.BoolField" />
> 
> <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
> <analyzer type="index">
> <tokenizer class="solr.StandardTokenizerFactory"/>
> <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_en.txt"/>
> <filter class="solr.WordDelimiterGraphFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="1"
> catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
> <filter class="solr.FlattenGraphFilterFactory"/>
> <filter class="solr.LowerCaseFilterFactory"/>
> <filter class="solr.EnglishPossessiveFilterFactory"/>
> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
> <filter class="solr.EnglishMinimalStemFilterFactory"/>
> </analyzer>
> <analyzer type="query">
> <tokenizer class="solr.StandardTokenizerFactory"/>
> <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
> <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_en.txt"/>
> <filter class="solr.WordDelimiterGraphFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="0"
> catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
> <filter class="solr.LowerCaseFilterFactory"/>
> <filter class="solr.EnglishPossessiveFilterFactory"/>
> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
> <filter class="solr.EnglishMinimalStemFilterFactory"/>
> </analyzer>
> </fieldType>
> </types>
> <fields>
> <field name="id" type="string" indexed="true" stored="true"
> required="true" />
> <field name="uid" type="long" indexed="true" stored="true"
> required="true" />
> <field name="box" type="string" indexed="true" stored="true"
> required="true" />
> <field name="user" type="string" indexed="true" stored="true"
> required="true" />
> 
> <field name="hdr" type="text" indexed="true" stored="false" />
> <field name="body" type="text" indexed="true" stored="false" />
> 
> <field name="from" type="text" indexed="true" stored="false" />
> <field name="to" type="text" indexed="true" stored="false" />
> <field name="cc" type="text" indexed="true" stored="false" />
> <field name="bcc" type="text" indexed="true" stored="false" />
> <field name="subject" type="text" indexed="true" stored="false" />
> 
> <!-- Used by Solr internally: -->
> <field name="_version_" type="long" indexed="true" stored="true"/>
> </fields>
> 
> <uniqueKey>id</uniqueKey>
> </schema>
> 

Mime
View raw message