lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Mike O'Leary" <tmole...@uw.edu>
Subject Getting started with indexing a database
Date Tue, 10 Jan 2012 01:39:25 GMT
I am trying to index the contents of a database for the first time, and I am only getting the
primary key of the table represented by the top level entity in my data-config.xml file to
be indexed. The database I am starting with has three tables:

The table called docs has columns called doc_id, type and last_modified. The primary key is
doc_id.
The table called codes has columns called id, doc_id, origin, type, code and last_modified.
The primary key is id. doc_id is a foreign key to the doc_id column in the docs table.
The table called texts has columns called id, doc_id, origin, type, text and last_modified.
The primary key is id. doc_id is a foreign key to the doc_id column in the docs table.

My data-config.xml file looks like this:

<dataConfig>
  <dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://localhost:3306/bioscope" user="db_user" password=""/>
  <document name="bioscope">
    <entity name="docs" pk="doc_id" query="SELECT doc_id, type FROM bioscope.docs"
            deltaQuery="SELECT doc_id FROM bioscope.docs where last_modified > '${dataimporter.last_index_time}'">
      <field column="doc_id" name="DOC_ID"/>
      <field column="type" name="DOC_TYPE"/>
      <entity name="codes" pk="id" query="SELECT id, origin, type, code FROM bioscope.codes
WHERE doc_id='${docs.doc_id}'"
              deltaQuery="SELECT doc_id FROM bioscope.codes WHERE last_modified > '${dataimporter.last_index_time}'"
              parentDeltaQuery="SELECT doc_id from bioscope.docs WHERE doc_id='${codes.doc_id}'">
        <field column="id" name="CODE_ID"/>
        <field column="doc_id" name="DOC_ID"/>
        <field column="origin" name="CODE_ORIGIN"/>
        <field column="type" name="CODE_TYPE"/>
        <field column="code" name="CODE_VALUE"/>
      </entity>
      <entity name="notes" pk="id" query="SELECT id, origin, type, text FROM bioscope.texts
WHERE doc_id='${docs.doc_id}'"
              deltaQuery="SELECT doc_id FROM bioscope.texts WHERE last_modified > '${dataimporter.last_index_time}'"
              parentDeltaQuery="SELECT doc_id from bioscope.docs WHERE doc_id='${texts.doc_id}'">
        <field column="id" name="NOTE_ID"/>
        <field column="doc_id" name="DOC_ID"/>
        <field column="origin" name="NOTE_ORIGIN"/>
        <field column="type" name="NOTE_TYPE"/>
        <field column="text" name="NOTE_TEXT"/>
      </entity>
    </entity>
  </document>
</dataConfig>

I added these lines to the schema.xml file:

<field name="DOC_ID" type="string" indexed="true" omitNorms="true" stored="true"/>
<field name="DOC_TYPE" type="string" indexed="true" omitNorms="true" stored="true"/>

<field name="CODE_ID" type="string" indexed="true" omitNorms="true" stored="true"/>
<field name="CODE_ORIGIN" type="string" indexed="true" omitNorms="true" stored="true"/>
<field name="CODE_TYPE" type="string" indexed="true" omitNorms="true" stored="true"/>
<field name="CODE_VALUE" type="string" indexed="true" omitNorms="true" stored="true"/>

<field name="NOTE_ID" type="string" indexed="true" omitNorms="true" stored="true"/>
<field name="NOTE_ORIGIN" type="string" indexed="true" omitNorms="true" stored="true"/>
<field name="NOTE_TYPE" type="string" indexed="true" omitNorms="true" stored="true"/>
<field name="NOTE_TEXT" type="text_ws" indexed="true" omitNorms="true" stored="true"/>

...

<uniqueKey>DOC_ID</uniqueKey>
<defaultSearchField>NOTE_TEXT</defaultSearchField>

When I run the full-import operation, only the DOC_ID values are written to the index. When
I run a program that dumps the index contents as an xml string, the output looks like this:

<?xml version="1.0" ?>
<documents>
  <document>
    <field name="DOC_ID" value="97634811">
    </field>
  </document>
  <document>
    <field name="DOC_ID" value="97634910">
    </field>
  </document>
...
</documents>

Since this is new to me, I am sure that I have simply left something out or specified something
the wrong way, but I haven't been able to spot what I have been doing wrong when I have gone
over the configuration files that I am using. Can anyone help me figure out why the other
database contents are not being indexed?
Thanks,
Mike


Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message