lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Andreas Owen ...@conx.ch>
Subject dih HTMLStripTransformer
Date Tue, 24 Sep 2013 18:35:30 GMT
why does stripHTML="false" have no effect in dih? the html is strippedin text and text_nohtml
when i do display the index with select?q=*

i'm trying to get a field without html and one with it so i can also index the links on the
page.

data-config.xml
<entity name="rec" processor="XPathEntityProcessor" url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml"
forEach="/docs/doc" dataSource="main"> <!-- transformer="script:GenerateId"-->
		<field column="title" xpath="//title" />
		<field column="id" xpath="//id" />
		<field column="file" xpath="//file" />
		<field column="url" xpath="//url" />
		<field column="urlParse" xpath="//urlParse" />
		<field column="last_modified" xpath="//last_modified" />
		<field column="Author" xpath="//author" />
		
		<entity name="tika" processor="TikaEntityProcessor" url="${rec.urlParse}" dataSource="dataUrl"
onError="skip" htmlMapper="identity" format="html" transformer="HTMLStripTransformer">
			<field column="text" name="text" stripHTML="false" />
			<field column="text" name="text_nohtml" stripHTML="true" />
			<!--  transformer="RegexTransformer"
			<field column="text_html_b" regex="(?s)^.*&lt;div.*id=.*&gt;(.*)&lt;/div&gt;.*$"
replaceWith="$1" sourceColName="text"  />
			<field column="text_html_b" regex="(?s)^.*&lt;!-body-&gt;(.*)&lt;!-/body-&gt;.*$"
replaceWith="$1" sourceColName="text"  /> -->
		</entity>
	</entity>
Mime
View raw message