Mailing-List: contact pylucene-dev-help@lucene.apache.org; run by ezmlm
Precedence: bulk
Reply-To: pylucene-dev@lucene.apache.org
Received-SPF: pass (nike.apache.org: domain of ihasmax@gmail.com designates
 209.85.221.133 as permitted sender)
DomainKey-Signature: a=rsa-sha1; c=nofws;
        d=gmail.com; s=gamma;
        h=mime-version:in-reply-to:references:date:message-id:subject:from:to
         :content-type;
        b=LDpTMT6mkR4MmxLfF6s4nXSgaVPSune8BNjHVQJ/tX5AZZRD4xp+COwHPL+a4K8Z2i
         o5HGwbm88n3ZbHKHT2ojC7k6uwoQcj+wdr/u1SQwMX7sHEe3LPDFMorvzofpQfrjcWK+
         0ApW+Ak8AIEwVPh2uJJcKNvxq6P1sfIzmubXQ=
MIME-Version: 1.0
In-Reply-To: <alpine.OSX.2.00.0904272316450.40438@yuzu.local>
References: <3836ec640904272228j3fdea1a6t6ec949857929649e@mail.gmail.com>
	 <alpine.OSX.2.00.0904272238020.40388@yuzu.local>
	 <3836ec640904272300i25a82365o3548d92db9ee8b71@mail.gmail.com>
	 <3836ec640904272311j4d380c12y64121909c0d4fe3f@mail.gmail.com>
	 <alpine.OSX.2.00.0904272316450.40438@yuzu.local>
Date: Tue, 28 Apr 2009 02:33:33 -0500
Message-ID: <3836ec640904280033j69b5f2e4la803561d0846f881@mail.gmail.com>
Subject: Re: SpanScorer Not implemented Error
From: Max Lynch <ihasmax@gmail.com>
To: pylucene-dev@lucene.apache.org, Andi Vajda <vajda@apache.org>
Content-Type: multipart/alternative; boundary=0015175d6706eb3fc50468987ab9

--0015175d6706eb3fc50468987ab9
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: 7bit

Here is an example that is failing.  However, just doing a
dir(SpanScorer) with your console commands shows that it has no extra
attributes other than the base Java object ones, and it is using the
spans version of the code.


# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

from unittest import TestCase, main
from lucene import *


class TestFormatter(PythonFormatter):
	"""Formatter that bolds scoring terms and reports each one to a test case."""

	def __init__(self, testCase):
		"""Keep a reference to the test case that receives highlight callbacks."""
		super(TestFormatter, self).__init__()
		self.testCase = testCase

	def highlightTerm(self, originalText, group):
		"""
		Return originalText unchanged when the group's total score is not
		positive; otherwise report the term to the test case through
		countHighlightTerm() and return it wrapped in <b>...</b>.
		"""
		if group.getTotalScore() <= 0:
			marked = originalText
		else:
			self.testCase.countHighlightTerm(originalText)
			marked = "".join(("<b>", originalText, "</b>"))

		return marked


class HighlighterTestCase(TestCase):
	"""
	Unit tests ported from Java Lucene.
	2004 by Yura Smolsky ;)

	setUp() indexes every entry of ``texts`` into a RAMDirectory; the test
	then searches that index and runs a Highlighter driven by a SpanScorer
	over each hit.
	"""

	FIELD_NAME = "contents"

	# One indexed document per list entry.  Adjacent string literals inside
	# a parenthesised group are concatenated by the parser; the commas
	# between groups are what separate the documents.
	texts = [
		("A wicked problem is one for which each attempt to create a "
		"solution changes the understanding of the problem.  Wicked "
		"problems cannot be solved in a traditional linear fashion, "
		"because the problem definition evolves as new possible "
		"solutions are considered and/or implemented."),
		("Wicked problems always occur in a social context -- the "
		"wickedness of the problem reflects the diversity among the "
		"stakeholders in the problem."),
		"From http://cognexus.org/id42.htm",
		("Most projects in organizations -- and virtually all "
		"technology-related projects these days -- are about wicked "
		"problems.  Indeed, it is the social complexity of these "
		"problems, not their technical complexity, that overwhelms most "
		"current problem solving and project management approaches."),
		"This text has a typo in referring to whicked problems",
	]

	def __init__(self, *args, **kwargs):
		"""Create the test case and the query parser shared by its tests."""
		super(HighlighterTestCase, self).__init__(*args, **kwargs)
		self.foundList = []
		self.parser = QueryParser(self.FIELD_NAME, StandardAnalyzer())

	def testSimpleHighlighter(self):
		"""Highlight every hit for "wicked" and print the best fragments."""
		self.doSearching("wicked")
		formatter = TestFormatter(self)
		# This name was used below without ever being defined.
		maxNumFragmentsRequired = 2

		for i in range(self.hits.length()):
			self.foundList = []
			text = self.hits.doc(i).get(self.FIELD_NAME)
			tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
					StringReader(text))

			ctokenStream = CachingTokenFilter(tokenStream)
			highlighter = Highlighter(formatter, SpanScorer(self.query,
					self.FIELD_NAME, ctokenStream))
			# SpanScorer reads the tokens through the caching filter while
			# it is being constructed, so the raw stream is already
			# exhausted.  Rewind the cache and highlight from it instead.
			ctokenStream.reset()
			result = highlighter.getBestFragments(ctokenStream, text,
					maxNumFragmentsRequired,
					"...")
			# Single-argument print parses on Python 2 and 3 and emits the
			# same bytes the original Python 2 print statements did.
			print("\t%s" % (result,))
			print("Found:  %s" % (self.foundList,))

	def doSearching(self, queryString):
		"""
		Parse and rewrite queryString, run it against the index and store
		the outcome in self.query and self.hits.  Resets numHighlights.
		"""
		# Release the searcher left over from an earlier call, if any.
		if self.searcher is not None:
			self.searcher.close()
		self.searcher = IndexSearcher(self.ramDir)

		# for any multi-term queries to work (prefix, wildcard, range,
		# fuzzy etc) you must use a rewritten query!
		self.query = self.parser.parse(queryString).rewrite(self.reader)

		print("Searching for: %s" % (self.query.toString(self.FIELD_NAME),))
		self.hits = self.searcher.search(self.query)
		self.numHighlights = 0

	def countHighlightTerm(self, found):
		"""Record one highlighted term; called back by TestFormatter."""
		self.foundList.append(found)

		self.numHighlights += 1 # update stats used in assertions

	def setUp(self):
		"""Build an in-memory index holding one document per texts entry."""
		self.analyzer = StandardAnalyzer()
		self.ramDir = RAMDirectory()
		self.searcher = None
		writer = IndexWriter(self.ramDir, self.analyzer, True)
		try:
			for text in self.texts:
				self.addDoc(writer, text)
			writer.optimize()
		finally:
			# Close the writer even when indexing fails part-way through.
			writer.close()

		self.reader = IndexReader.open(self.ramDir)
		self.numHighlights = 0

	def tearDown(self):
		"""Close the searcher (when one was opened) and the index reader."""
		if self.searcher is not None:
			self.searcher.close()
			self.searcher = None
		self.reader.close()

	def addDoc(self, writer, text):
		"""Add text to writer as a stored, tokenized field with term vectors."""
		d = Document()
		f = Field(self.FIELD_NAME, text,
				Field.Store.YES, Field.Index.TOKENIZED,
				Field.TermVector.YES)
		d.add(f)
		writer.addDocument(d)


if __name__ == "__main__":
	import sys
	import lucene

	# The Java VM has to be running before any Lucene class is touched.
	lucene.initVM(lucene.CLASSPATH)
	if '-loop' in sys.argv:
		# Re-run the whole suite forever.
		sys.argv.remove('-loop')
		while True:
			try:
				main()
			except KeyboardInterrupt:
				# Let Ctrl-C end the otherwise endless loop.
				raise
			except (SystemExit, Exception):
				# unittest's main() finishes by raising SystemExit; swallow
				# it (and any other failure) so the next iteration runs.
				pass
	else:
		main()

--0015175d6706eb3fc50468987ab9--