lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From waynelam <wayne...@ln.edu.hk>
Subject Searching of Chinese characters and English
Date Tue, 04 Sep 2012 05:03:11 GMT
Hi all,

I tried to modify the schema.xml and solrconfig.xml that come with the Drupal 
"search_api_solr" module. I tried to modify them so that they are suitable 
for a CJK environment. I can see Chinese text being cut up into 2-character 
tokens in "Field Analysis". If I use the following query

my_ip_address:8080/solr/select?indent=on&version=2.2&fq=t_title:"Find"&start=0&rows=10&fl=t_title

I can see it returning results. The problem arises when I change the search 
keywords for one of my fields (e.g. t_title) to Chinese characters. It 
always shows

<result name="response" numFound="0" start="0"/>

in the results. It is strange because if a title contains both Chinese 
and English (e.g. testing ??), when I search for just the English part (e.g. 
fq=t_title:"testing"), I can find the result perfectly. The problem only 
occurs when searching for Chinese characters.


It would be much appreciated if you could show me which part I did wrong.

Thanks

Wayne

*My Settings:*
Java : 1.6.0_24
Solr : version 3.6.1
tomcat: version 6.0.35

*My schema.xml* (I highlighted the places I changed from the default)

*<fieldType name="text" class="solr.TextField" indexed="true" 
stored="true" multiValued="true">**
**      <analyzer type="index" 
class="org.apache.lucene.analysis.cjk.CJKAnalyzer">**
**        <tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer"/>**
**        <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1" catenateWords="1" 
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>**
**        <filter class="solr.LowerCaseFilterFactory"/>**
**        <filter class="solr.SnowballPorterFilterFactory" 
language="English" protected="protwords.txt"/>**
**        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>**
**        <filter class="schema.UnicodeNormalizationFilterFactory" 
version="icu4j" composed="false" remove_diacritics="true" 
remove_modifiers="true" fold="true"/>**
**        <filter class="solr.ISOLatin1AccentFilterFactory"/>**
**      </analyzer>**
**      <analyzer type="query" 
class="org.apache.lucene.analysis.cjk.CJKAnalyzer">**
**        <tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer"/>**
**        <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1" catenateWords="0" 
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>**
**        <filter class="solr.LowerCaseFilterFactory"/>**
**        <filter class="solr.SnowballPorterFilterFactory" 
language="English" protected="protwords.txt"/>**
**        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>**
**        <filter class="schema.UnicodeNormalizationFilterFactory" 
version="icu4j" composed="false" remove_diacritics="true" 
remove_modifiers="true" fold="true"/>**
**        <filter class="solr.ISOLatin1AccentFilterFactory"/>**
**      </analyzer>**
**    </fieldType>*

     <fieldType name="sortString" class="solr.TextField" indexed="true" 
stored="true" sortMissingLast="true" omitNorms="true">
       <analyzer>

         <tokenizer class="solr.KeywordTokenizerFactory"/>

         <filter class="solr.LowerCaseFilterFactory" />
         <filter class="solr.TrimFilterFactory" />
       </analyzer>
     </fieldType>

     <fieldType name="rand" class="solr.RandomSortField" indexed="true" />

     <fieldtype name="ignored" stored="true" indexed="false" 
class="solr.StrField" />
  </types>
  <fields>

    <field name="id"       type="string" indexed="true" stored="true" 
required="true" />
    <field name="item_id"  type="string" indexed="true" stored="true" 
required="true" />
    <field name="index_id" type="string" indexed="true" stored="true" 
required="true" />

    <copyField source="item_id" dest="ss_search_api_id" />
    <field name="spell" type="textSpell" indexed="true" stored="true" 
multiValued="true"/>
    <copyField source="t_*" dest="spell"/>

*<field name="t_title" type="text" indexed="true" stored="true" 
autoGeneratePhraseQueries="false"/>*
    <dynamicField name="t_*" type="text" termVectors="true" />
    <dynamicField name="ss_*" type="sortString" multiValued="false" 
termVectors="true" />
    <dynamicField name="sm_*" type="sortString" multiValued="true" 
termVectors="true" />
    <dynamicField name="is_*" type="tlong" multiValued="false" 
termVectors="true" />
    <dynamicField name="im_*" type="long" multiValued="true" 
termVectors="true" />
    <dynamicField name="fs_*" type="tdouble" multiValued="false" 
termVectors="true" />
    <dynamicField name="fm_*" type="tdouble" multiValued="true" 
termVectors="true" />
    <dynamicField name="ds_*" type="tdate" multiValued="false" 
termVectors="true" />
    <dynamicField name="dm_*" type="tdate" multiValued="true" 
termVectors="true" />
    <dynamicField name="bs_*" type="boolean" multiValued="false" 
termVectors="true" />
    <dynamicField name="bm_*" type="boolean" multiValued="true" 
termVectors="true" />
    <dynamicField name="f_ss_*" type="string" multiValued="false" 
termVectors="true" />
    <dynamicField name="f_sm_*" type="string" multiValued="true" 
termVectors="true" />
    <copyField source="ss_*" dest="f_ss_*" />
    <copyField source="sm_*" dest="f_sm_*" />
    <dynamicField name="*" type="ignored" multiValued="true" />
  </fields>

  <uniqueKey>id</uniqueKey>
  <solrQueryParser defaultOperator="AND"/>

</schema>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message