lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From anarchos78 <rigasathanasio...@hotmail.com>
Subject Is it possible to index pdfs and database into single document?
Date Fri, 11 May 2012 19:08:38 GMT
Hello again,
I can index pdf using:
*data-config.xml*
<?xml version="1.0" encoding="utf-8"?>
<dataConfig>
<dataSource type="BinFileDataSource" name="binary" />
  	<document>
		<entity name="f" dataSource="binary" rootEntity="false"
processor="FileListEntityProcessor" baseDir="../solr/docu/" fileName=".*pdf"
recursive="true">
			<entity name="tika" processor="TikaEntityProcessor" 
url="${f.fileAbsolutePath}" format="text">
				<field column="id" name="id" meta="true" />
				<field column="fake_id" name="fake_id" meta="true" />
				<field column="model" name="model" meta="true" />
				<field column="text" name="biog" />
			</entity>
		</entity>
	</document>  
</dataConfig> 

I can also index a database using:
*data-config.xml*
<?xml version="1.0" encoding="utf-8"?>

<dataConfig>

  <dataSource type="JdbcDataSource" 
              driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://127.0.0.1:3306/rental" 
              user="root" 
              password="1a2b3c4d"
			  name="db" />
			  
  <dataSource type="BinFileDataSource" name="binary" />			  
			  
  <document>
  
    <entity name="members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('m_',id) as fake_id,
id, firstname, lastname, biog, model from members">
		<field column="id" name="id" /> 
		<field column="fake_id" name="fake_id" />
        <field column="firstname" name="firstname" stripHTML="true" />
        <field column="lastname" name="lastname" stripHTML="true" />
		<field column="biog" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true"  />
    </entity>
	
	<entity name="new_members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('nm_',id) as
fake_id, id, firstname, lastname, biog, model from new_members">
		<field column="id" name="id" />
		<field column="fake_id" name="fake_id" /> 
        <field column="firstname" name="firstname" stripHTML="true" />
        <field column="lastname" name="lastname" stripHTML="true" />
		<field column="biog" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
  
  
  <entity name="books" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('b_',id) as fake_id, id, title, description, model from
books">
		<field column="id" name="id" />
		<field column="fake_id" name="fake_id" /> 
        <field column="title" name="title" stripHTML="true" />
        <field column="description" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
  
  
  <entity name="journals" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('j_',id) as fake_id, id, title, description, model from
journals">
		<field column="id" name="id" />
		<field column="fake_id" name="fake_id" /> 
        <field column="title" name="title" stripHTML="true" />
        <field column="description" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
  
  
  <entity name="cds" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('c_',id) as fake_id, id, title, description, model from
cd">
		<field column="id" name="id" /> 
		<field column="fake_id" name="fake_id" />
        <field column="title" name="title" stripHTML="true" />
        <field column="description" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
</document>
</dataConfig>

For the above I have:
*schema.xml(fields)*
<fields>
<field  name="id" type="string" indexed="true" stored="true" /> 
  <field  name="fake_id" type="string" indexed="true" stored="true" /> 
  <field  name="model" type="text_en" indexed="true" stored="true"  />
  <field  name="firstname" type="text_en" indexed="true" stored="true"/>
  <field  name="lastname" type="text_en" indexed="true" stored="true"/>
  <field  name="title" type="text_en" indexed="true" stored="true"/>
  <field  name="biog" type="text_en" indexed="true" stored="true"/>
 </fields>
<uniqueKey>fake_id</uniqueKey>
<defaultSearchField>biog</defaultSearchField>



But when I am using the below data-config.xml indexing fails:

*data-config.xml*

<?xml version="1.0" encoding="utf-8"?>

<dataConfig>

  <dataSource type="JdbcDataSource" 
              driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://127.0.0.1:3306/rental" 
              user="root" 
              password="1a2b3c4d"
			  name="db" />
			  
  <dataSource type="BinFileDataSource" name="binary" />			  
			  
  <document>
  
    <entity name="members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('m_',id) as fake_id,
id, firstname, lastname, biog, model from members">
		<field column="id" name="id" /> 
		<field column="fake_id" name="fake_id" />
        <field column="firstname" name="firstname" stripHTML="true" />
        <field column="lastname" name="lastname" stripHTML="true" />
		<field column="biog" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true"  />
    </entity>
	
	<entity name="new_members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('nm_',id) as
fake_id, id, firstname, lastname, biog, model from new_members">
		<field column="id" name="id" />
		<field column="fake_id" name="fake_id" /> 
        <field column="firstname" name="firstname" stripHTML="true" />
        <field column="lastname" name="lastname" stripHTML="true" />
		<field column="biog" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
  
  
  <entity name="books" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('b_',id) as fake_id, id, title, description, model from
books">
		<field column="id" name="id" />
		<field column="fake_id" name="fake_id" /> 
        <field column="title" name="title" stripHTML="true" />
        <field column="description" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
  
  
  <entity name="journals" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('j_',id) as fake_id, id, title, description, model from
journals">
		<field column="id" name="id" />
		<field column="fake_id" name="fake_id" /> 
        <field column="title" name="title" stripHTML="true" />
        <field column="description" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
  
  
  <entity name="cds" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('c_',id) as fake_id, id, title, description, model from
cd">
		<field column="id" name="id" /> 
		<field column="fake_id" name="fake_id" />
        <field column="title" name="title" stripHTML="true" />
        <field column="description" name="biog" stripHTML="true" />
		<field column="model" name="model" stripHTML="true" />
    </entity>
	
	
	<entity name="f" dataSource="binary" rootEntity="false"
processor="FileListEntityProcessor" baseDir="../solr/docu/" fileName=".*pdf"
recursive="true">
			<entity name="tika" processor="TikaEntityProcessor" 
url="${f.fileAbsolutePath}" format="text">
				<field column="id" name="id" meta="true" />
				<field column="fake_id" name="fake_id" meta="true" />
				<field column="model" name="model" meta="true" />
				<field column="text" name="biog" />
			</entity>
		</entity>
	
  </document>	
</dataConfig>

*The log file is outputting:*

SEVERE: Exception while processing: f document :
null:org.apache.solr.handler.dataimport.DataImportHandlerException: Unable
to execute query: C:\solr\tomcat\..\solr\docu\dinos.pdf Processing Document
# 36
	at
org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow(DataImportHandlerException.java:72)
	at
org.apache.solr.handler.dataimport.JdbcDataSource$ResultSetIterator.<init>(JdbcDataSource.java:253)
	at
org.apache.solr.handler.dataimport.JdbcDataSource.getData(JdbcDataSource.java:210)
	at
org.apache.solr.handler.dataimport.JdbcDataSource.getData(JdbcDataSource.java:39)
	at
org.apache.solr.handler.dataimport.TikaEntityProcessor.nextRow(TikaEntityProcessor.java:103)
	at
org.apache.solr.handler.dataimport.EntityProcessorWrapper.pullRow(EntityProcessorWrapper.java:330)
	at
org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:296)
	at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:683)
	at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:709)
	at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:619)
	at
org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:327)
	at
org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:225)
	at
org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:375)
	at
org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:445)
	at
org.apache.solr.handler.dataimport.DataImporter$1.run(DataImporter.java:426)
Caused by: com.mysql.jdbc.exceptions.jdbc4.MySQLSyntaxErrorException: You
have an error in your SQL syntax; check the manual that corresponds to your
MySQL server version for the right syntax to use near
'C:\solr\tomcat\..\solr\docu\dinos.pdf' at line 1
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(Unknown Source)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(Unknown
Source)
	at java.lang.reflect.Constructor.newInstance(Unknown Source)
	at com.mysql.jdbc.Util.handleNewInstance(Util.java:411)
	at com.mysql.jdbc.Util.getInstance(Util.java:386)
	at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:1052)
	at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:4096)
	at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:4028)
	at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2490)
	at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2651)
	at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2677)
	at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2627)
	at com.mysql.jdbc.StatementImpl.execute(StatementImpl.java:841)
	at com.mysql.jdbc.StatementImpl.execute(StatementImpl.java:681)
	at
org.apache.solr.handler.dataimport.JdbcDataSource$ResultSetIterator.<init>(JdbcDataSource.java:246)
	... 13 more

Is it possible to index pdfs, docs, rtf along with database and having a
single document?

Thank in advance,
Tom




--
View this message in context: http://lucene.472066.n3.nabble.com/Is-it-possible-to-index-pdfs-and-database-into-single-document-tp3980761.html
Sent from the Solr - User mailing list archive at Nabble.com.

Mime
View raw message