lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Michael Sokolov <msoko...@safaribooksonline.com>
Subject Re: Multi Language Suggester Solr Issue
Date Sun, 28 Dec 2014 14:13:56 GMT
I noticed that your suggester analyzers include

<filter class="solr.PatternReplaceFilterFactory" pattern="([^\w\d\*æøåÆØÅ ])"
         replacement=" " replace="all" />

which seems like a bad idea -- in Java regexes \w and \d match only ASCII
by default, so this will strip all those Arabic, Russian and Japanese
characters entirely, leaving you with probably only whitespace in your
tokens.  Try just removing that?

-Mike

On 12/24/14 6:09 PM, alaa.abuzaghleh wrote:
> I am trying to create a suggester handler using Solr 4.8. Everything works fine, but
> when I try to get suggestions in a different language, Arabic or Japanese
> for example, I get results in mixed languages: I am searching only
> in Japanese, but I get Arabic results too. The following is my schema.xml:
>
> <?xml version="1.0" encoding="UTF-8" ?>
> <schema name="people_schema" version="1.5">
>          <fields>
>                  <field name="_version_" type="long" indexed="true"
> stored="true" />
>                  <field name="id" type="string" indexed="true" stored="true"
>                          required="true" />
>                  <field name="first_name" type="txt_general" indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="last_name" type="txt_general" indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="about" type="text_general_edge_ngram"
> indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="year_birth" type="tint" indexed="true"
> stored="true"
>                          multiValued="false" />
>                  <field name="month_birth" type="tint" indexed="true"
> stored="true"
>                          multiValued="false" />
>                  <field name="day_birth" type="tint" indexed="true"
> stored="true"
>                          multiValued="false" />
>                  <field name="country" type="string" indexed="true"
> stored="true"
>                          required="false" multiValued="false" />
>                  <field name="country_tree" type="placetree" indexed="true"
>                          stored="false" multiValued="false" />
>                  <field name="state" type="string" indexed="true"
> stored="true"
>                          required="false" multiValued="false" />
>                  <field name="state_tree" type="placetree" indexed="true"
> stored="false"
>                          multiValued="false" />
>                  <field name="city" type="string" indexed="true"
> stored="true"
>                          required="false" multiValued="false" />
>                  <field name="city_tree" type="placetree" indexed="true"
> stored="false"
>                          multiValued="false" />
>                  <field name="job" type="string" indexed="true" stored="true"
>                          required="false" multiValued="false" />
>                  <field name="job_tree" type="txt_general" indexed="true"
> stored="true"
>                          multiValued="false" />
>                  <field name="company" type="string" indexed="true"
> stored="true"
>                          required="false" multiValued="false" />
>                  <field name="company_tree" type="companytree" indexed="true"
>                          stored="false" multiValued="false" />
>
>                  <field name="full_name" type="txt_general" indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="full_name_suggest" type="text_suggest"
> indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="full_name_edge" type="text_suggest_edge"
> indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="full_name_ngram" type="text_suggest_ngram"
> indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="full_name_sort" type="alphaNumericSort"
> indexed="true"
>                          stored="true" multiValued="false" />
>                         
>                 
>              <field name="job_suggest" type="text_suggest" indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="job_edge" type="text_suggest_edge"
> indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="job_ngram" type="text_suggest_ngram"
> indexed="true"
>                          stored="true" multiValued="false" />
>                  <field name="job_sort" type="alphaNumericSort"
> indexed="true"
>                          stored="true" multiValued="false" />
>                 
>                 
>                  <copyField source="full_name" dest="full_name_suggest" />
>                  <copyField source="full_name" dest="full_name_edge" />
>                  <copyField source="full_name" dest="full_name_ngram" />
>                  <copyField source="full_name" dest="full_name_sort" />
>                 
>                  <copyField source="job_tree" dest="job_suggest" />
>                  <copyField source="job_tree" dest="job_edge" />
>                  <copyField source="job_tree" dest="job_ngram" />
>                  <copyField source="job_tree" dest="job_sort" />
>                 
>          </fields>
>          <uniqueKey>id</uniqueKey>
>          <types>
>                 
>                  <fieldType name="string" class="solr.StrField"
>                          sortMissingLast="true" />
>                 
>                  <fieldType name="boolean" class="solr.BoolField"
>                          sortMissingLast="true" />
>                  <fieldType name="int" class="solr.TrieIntField"
>                          precisionStep="0" positionIncrementGap="0" />
>                  <fieldType name="float" class="solr.TrieFloatField"
>                          precisionStep="0" positionIncrementGap="0" />
>                  <fieldType name="long" class="solr.TrieLongField"
>                          precisionStep="0" positionIncrementGap="0" />
>                  <fieldType name="double" class="solr.TrieDoubleField"
>                          precisionStep="0" positionIncrementGap="0" />
>                  <fieldType name="tint" class="solr.TrieIntField"
>                          precisionStep="8" positionIncrementGap="0" />
>                  <fieldType name="tfloat" class="solr.TrieFloatField"
>                          precisionStep="8" positionIncrementGap="0" />
>                  <fieldType name="tlong" class="solr.TrieLongField"
>                          precisionStep="8" positionIncrementGap="0" />
>                  <fieldType name="tdouble" class="solr.TrieDoubleField"
>                          precisionStep="8" positionIncrementGap="0" />
>                  <fieldType name="date" class="solr.TrieDateField"
>                          precisionStep="0" positionIncrementGap="0" />
>                  <fieldType name="tdate" class="solr.TrieDateField"
>                          precisionStep="6" positionIncrementGap="0" />
>                  <fieldtype name="binary" class="solr.BinaryField" />
>
>                  <fieldType name="text_general_edge_ngram"
> class="solr.TextField"
>                          positionIncrementGap="100">
>                          <analyzer type="index">
>                                  <tokenizer
> class="solr.LowerCaseTokenizerFactory" />
>                                  <filter class="solr.EdgeNGramFilterFactory"
> minGramSize="2"
>                                          maxGramSize="15" side="front" />
>                          </analyzer>
>                          <analyzer type="query">
>                                  <tokenizer
> class="solr.LowerCaseTokenizerFactory" />
>                          </analyzer>
>                  </fieldType>
>
>
>
>                  <fieldType name="txt_general" class="solr.TextField"
>                          positionIncrementGap="100">
>                          <analyzer type="index">
>                                  <tokenizer
> class="solr.StandardTokenizerFactory" />
>                                  <filter class="solr.LowerCaseFilterFactory"
> />
>                          </analyzer>
>                          <analyzer type="query">
>                                  <tokenizer
> class="solr.StandardTokenizerFactory" />
>                                  <filter class="solr.SynonymFilterFactory"
> synonyms="synonyms.txt"
>                                          ignoreCase="true" expand="true" />
>                                  <filter class="solr.LowerCaseFilterFactory"
> />
>                          </analyzer>
>                  </fieldType>
>                  <fieldtype name="name_phonetic" stored="false"
> indexed="true"
>                          class="solr.TextField">
>                          <analyzer>
>                                  <tokenizer
> class="solr.StandardTokenizerFactory" />
>                                  <filter
> class="solr.DoubleMetaphoneFilterFactory" inject="false" />
>                          </analyzer>
>                  </fieldtype>
>                 
>                  <fieldType name="placetree" class="solr.TextField"
>                          positionIncrementGap="100">
>                          <analyzer>
>                                  <tokenizer
> class="solr.StandardTokenizerFactory" />
>                                  <filter class="solr.LowerCaseFilterFactory"
> />
>                          </analyzer>
>                  </fieldType>
>                 
>                 
>                 
>                 
>                 
>                  <fieldType name="jobtree" class="solr.TextField"
>                          positionIncrementGap="100">
>                          <analyzer>
>                                  <tokenizer
> class="solr.StandardTokenizerFactory" />
>                                  <filter class="solr.LowerCaseFilterFactory"
> />
>                          </analyzer>
>                  </fieldType>
>                  <fieldType name="companytree" class="solr.TextField"
>                          positionIncrementGap="100">
>                          <analyzer>
>                                  <tokenizer
> class="solr.StandardTokenizerFactory" />
>                                  <filter class="solr.LowerCaseFilterFactory"
> />
>                          </analyzer>
>                  </fieldType>
>
>
>                 
>                 
>                 
>                 
>                 
>                 
>                 
>                 
>
>                 
>                 
>                 
>                 
>                 
>                 
>                 
>                 
>
>          <fieldType name="text_suggest_ngram" class="solr.TextField">
>                  <analyzer type="index">
>                          <charFilter class="solr.MappingCharFilterFactory"
> mapping="mapping-ISOLatin1Accent.txt" />
>                          <tokenizer class="solr.StandardTokenizerFactory" />
>                          <filter class="solr.WordDelimiterFilterFactory"
>                                  generateWordParts="1"
> generateNumberParts="1" catenateWords="0"
>                                  catenateNumbers="0" catenateAll="0"
> splitOnCaseChange="1" />
>                          <filter class="solr.LowerCaseFilterFactory" />
>                          <filter class="solr.EdgeNGramFilterFactory"
> maxGramSize="20"
>                                  minGramSize="1" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([^\w\d\*æøåÆØÅ ])"
>                                  replacement="" replace="all" />
>                  </analyzer>
>                  <analyzer type="query">
>                          <charFilter class="solr.MappingCharFilterFactory"
> mapping="mapping-ISOLatin1Accent.txt" />
>                          <tokenizer class="solr.StandardTokenizerFactory" />
>                          <filter class="solr.WordDelimiterFilterFactory"
>                                  generateWordParts="0"
> generateNumberParts="0" catenateWords="0"
>                                  catenateNumbers="0" catenateAll="0"
> splitOnCaseChange="0" />
>                          <filter class="solr.LowerCaseFilterFactory" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([^\w\d\*æøåÆØÅ ])"
>                                  replacement="" replace="all" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="^(.{20})(.*)?"
>                                  replacement="$1" replace="all" />
>                  </analyzer>
>          </fieldType>
>
>          <fieldType name="alphaNumericSort" class="solr.TextField"
>                  sortMissingLast="true" omitNorms="true">
>                  <analyzer>
>                          <tokenizer class="solr.KeywordTokenizerFactory" />
>                          <filter class="solr.LowerCaseFilterFactory" />
>                          <filter class="solr.TrimFilterFactory" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="^(a |the |les |la |le |l'|de la |du |des )"
>                                  replacement="" replace="all" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([^a-z0-9])"
>                                  replacement="" replace="all" />
>                  </analyzer>
>          </fieldType>
>
>          <fieldType name="text_suggest_edge" class="solr.TextField">
>                  <analyzer type="index">
>                          <charFilter class="solr.MappingCharFilterFactory"
> mapping="mapping-ISOLatin1Accent.txt" />
>                          <tokenizer class="solr.KeywordTokenizerFactory" />
>                          <filter class="solr.LowerCaseFilterFactory" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([\.,;:-_])"
>                                  replacement=" " replace="all" />
>                          <filter class="solr.EdgeNGramFilterFactory"
> maxGramSize="30"
>                                  minGramSize="1" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([^\w\d\*æøåÆØÅ ])"
>                                  replacement="" replace="all" />
>                  </analyzer>
>                  <analyzer type="query">
>                          <charFilter class="solr.MappingCharFilterFactory"
> mapping="mapping-ISOLatin1Accent.txt" />
>                          <tokenizer class="solr.KeywordTokenizerFactory" />
>                          <filter class="solr.LowerCaseFilterFactory" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([\.,;:-_])"
>                                  replacement=" " replace="all" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([^\w\d\*æøåÆØÅ ])"
>                                  replacement="" replace="all" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="^(.{30})(.*)?"
>                                  replacement="$1" replace="all" />
>                  </analyzer>
>          </fieldType>
>
>          <fieldType name="text_suggest" class="solr.TextField"
>                  positionIncrementGap="100">
>                  <analyzer type="index">
>                          <charFilter class="solr.MappingCharFilterFactory"
> mapping="mapping-ISOLatin1Accent.txt" />
>                          <tokenizer class="solr.StandardTokenizerFactory" />
>                          <filter class="solr.WordDelimiterFilterFactory"
>                                  generateWordParts="1"
> generateNumberParts="1" catenateWords="1"
>                                  catenateNumbers="1" catenateAll="1"
> splitOnCaseChange="1"
>                                  splitOnNumerics="1" preserveOriginal="1" />
>                          <filter class="solr.LowerCaseFilterFactory" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([^\w\d\*æøåÆØÅ ])"
>                                  replacement=" " replace="all" />
>                  </analyzer>
>                  <analyzer type="query">
>                          <charFilter class="solr.MappingCharFilterFactory"
> mapping="mapping-ISOLatin1Accent.txt" />
>                          <tokenizer class="solr.StandardTokenizerFactory" />
>                          <filter class="solr.WordDelimiterFilterFactory"
>                                  generateWordParts="0"
> generateNumberParts="0" catenateWords="0"
>                                  catenateNumbers="0" catenateAll="0"
> splitOnCaseChange="0"
>                                  splitOnNumerics="0" />
>                          <filter class="solr.LowerCaseFilterFactory" />
>                          <filter class="solr.PatternReplaceFilterFactory"
> pattern="([^\w\d\*æøåÆØÅ ])"
>                                  replacement=" " replace="all" />
>                  </analyzer>
>          </fieldType>
>
>
>                 
>          </types>
> </schema>
>
> And this is my solrconfig.xml:
>
> <?xml version="1.0" encoding="UTF-8" ?>
>
>
>
> <config>
>          <luceneMatchVersion>4.8</luceneMatchVersion>
>         
>          <directoryFactory name="DirectoryFactory"
>                 
> class="${solr.directoryFactory:solr.StandardDirectoryFactory}" />
>
>          <dataDir>${solr.core0.data.dir:}</dataDir>
>
>         
>          <schemaFactory class="ClassicIndexSchemaFactory" />
>
>          <updateHandler class="solr.DirectUpdateHandler2">
>                  <updateLog>
>                          <str name="dir">${solr.core0.data.dir:}</str>
>                  </updateLog>
>          </updateHandler>
>
>         
>          <requestHandler name="/get" class="solr.RealTimeGetHandler">
>                  <lst name="defaults">
>                          <str name="omitHeader">true</str>
>                  </lst>
>          </requestHandler>
>         
>          <requestHandler name="/select" class="solr.SearchHandler">
>     
>       <lst name="defaults">
>         <str name="echoParams">explicit</str>
>         <int name="rows">10</int>
>         <str name="df">id</str>
>         </lst>
>      </requestHandler>
>
>          <requestHandler name="/suggest" class="solr.SearchHandler">
>                  <lst name="defaults">
>                          <str name="echoParams">explicit</str>
>                          <str name="defType">edismax</str>
>                          <str name="rows">10</str>
>                          <str name="fl">full_name,job_tree, company, city,
> state, country, first_name, last_name, id</str>
>                          <str name="qf">full_name_suggest^60
> full_name_ngram^100.0 job_suggest^30 job_ngram^50.0 </str>
>                          <str name="pf">full_name_edge^100.0
> job_edge^50.0</str>
>                          <str name="group">true</str>
>                          <str name="group.field">full_name</str>
>             
>
>    <str name="sort">full_name asc</str>
>    <str name="group.sort">full_name asc</str>
>                  </lst>
>          </requestHandler>
>         
>         
>
>          <requestHandler name="/replication" class="solr.ReplicationHandler"
>                  startup="lazy" />
>
>          <requestDispatcher handleSelect="true">
>                  <requestParsers enableRemoteStreaming="false"
>                          multipartUploadLimitInKB="2048"
> formdataUploadLimitInKB="2048" />
>          </requestDispatcher>
>
>          <requestHandler name="standard" class="solr.StandardRequestHandler"
>                  default="true" />
>          <requestHandler name="/analysis/field" startup="lazy"
>                  class="solr.FieldAnalysisRequestHandler" />
>          <requestHandler name="/update" class="solr.UpdateRequestHandler" />
>          <requestHandler name="/admin/"
>                  class="org.apache.solr.handler.admin.AdminHandlers" />
>
>          <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
>                  <lst name="invariants">
>                          <str name="q">solrpingquery</str>
>                  </lst>
>                  <lst name="defaults">
>                          <str name="echoParams">all</str>
>                  </lst>
>          </requestHandler>
>
>         
>          <admin>
>                  <defaultQuery>solr</defaultQuery>
>          </admin>
>
> </config>
> The following is the result for
> (http://localhost:9090/solr/people/suggest?q=%E3%82%B7%E3%82%B9%E3%83%86%E3%83%A0%E3%82%A2%E3%83%8A%E3%83%AA%E3%82%B9%E3%83%88&wt=json&indent=true)
>
> {
>    "responseHeader":{
>      "status":0,
>      "QTime":8,
>      "params":{
>        "indent":"true",
>        "q":"システムアナリスト",
>        "wt":"json"}},
>    "grouped":{
>      "full_name":{
>        "matches":2,
>        "groups":[{
>            "groupValue":"مسعود",
>            "doclist":{"numFound":1,"start":0,"docs":[
>                {
>                  "job_tree":"رسام كاريكاتور",
>                  "last_name":"النغش",
>                  "state":"Amman",
>                  "country":"Jordan",
>                  "city":"Amman",
>                  "id":"fa0a5f94-0497-49f6-9060-ec45c27c0d8e",
>                  "company":"شركة الفنون المتطورة",
>                  "full_name":"مسعود  النغش",
>                  "first_name":"مسعود "}]
>            }},
>          {
>            "groupValue":"ね",
>            "doclist":{"numFound":1,"start":0,"docs":[
>                {
>                  "job_tree":"システムアナリスト",
>                  "last_name":"シャン",
>                  "state":"Tokyo",
>                  "country":"Japan",
>                  "city":"Tokyo",
>                  "id":"4fdce27b-3a9b-4045-85f3-2d5087d97b50",
>                  "company":"日立",
>                  "full_name":"すね シャン",
>                  "first_name":"すね"}]
>            }}]}}}
>
> I don't know why it brings back the Arabic text as well; the result is the same if I
> try to search for the Arabic. Any help from you will be highly appreciated.
>
>
>
> --
> View this message in context: http://lucene.472066.n3.nabble.com/Multi-Language-Suggester-Solr-Issue-tp4176075.html
> Sent from the Solr - User mailing list archive at Nabble.com.


Mime
View raw message