Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 38576 invoked from network); 5 Aug 2009 01:39:37 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 5 Aug 2009 01:39:37 -0000 Received: (qmail 21340 invoked by uid 500); 5 Aug 2009 01:39:45 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 21279 invoked by uid 500); 5 Aug 2009 01:39:45 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 21270 invoked by uid 99); 5 Aug 2009 01:39:45 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 05 Aug 2009 01:39:45 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 05 Aug 2009 01:39:42 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id B7100238889D; Wed, 5 Aug 2009 01:39:22 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r801043 - in /lucene/java/trunk/contrib/benchmark: CHANGES.txt conf/highlight-vs-vector-highlight.alg src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java Date: Wed, 05 Aug 2009 01:39:22 -0000 To: java-commits@lucene.apache.org From: markrmiller@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20090805013922.B7100238889D@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: markrmiller Date: Wed Aug 5 01:39:22 2009 New Revision: 801043 URL: http://svn.apache.org/viewvc?rev=801043&view=rev Log: LUCENE-1770: Add EnwikiQueryMaker Added: 
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt lucene/java/trunk/contrib/benchmark/conf/highlight-vs-vector-highlight.alg Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=801043&r1=801042&r2=801043&view=diff ============================================================================== --- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original) +++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Wed Aug 5 01:39:22 2009 @@ -4,6 +4,9 @@ $Id:$ +8/4/2009 + LUCENE-1770: Add EnwikiQueryMaker (Mark Miller) + 8/04/2009 LUCENE-1773: Add FastVectorHighlighter tasks. (Koji Sekiguchi via Mike McCandless) Modified: lucene/java/trunk/contrib/benchmark/conf/highlight-vs-vector-highlight.alg URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/highlight-vs-vector-highlight.alg?rev=801043&r1=801042&r2=801043&view=diff ============================================================================== --- lucene/java/trunk/contrib/benchmark/conf/highlight-vs-vector-highlight.alg (original) +++ lucene/java/trunk/contrib/benchmark/conf/highlight-vs-vector-highlight.alg Wed Aug 5 01:39:22 2009 @@ -34,8 +34,7 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource docs.file=temp/enwiki-20070527-pages-articles.xml -# Use LUCENE-1770 WikipediaQueryMaker -query.maker=org.apache.lucene.benchmark.byTask.feeds.WikipediaQueryMaker +query.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java URL: 
http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java?rev=801043&view=auto ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java Wed Aug 5 01:39:22 2009 @@ -0,0 +1,134 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanFirstQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; + +/** + * A QueryMaker that uses common and uncommon actual Wikipedia queries, plus a few prebuilt span and wildcard queries, for + * searching the English Wikipedia collection. About 90 queries total. + */ +public class EnwikiQueryMaker extends AbstractQueryMaker implements + QueryMaker { + + // common and a few uncommon queries from wikipedia search logs + private static String[] STANDARD_QUERIES = { "Images catbox gif", + "Imunisasi haram", "Favicon ico", "Michael jackson", "Unknown artist", + "Lily Thai", "Neda", "The Last Song", "Metallica", "Nicola Tesla", + "Max B", "Skil Corporation", "\"The 100 Greatest Artists of All Time\"", + "\"Top 100 Global Universities\"", "Pink floyd", "Bolton Sullivan", + "Frank Lucas Jr", "Drake Woods", "Radiohead", "George Freeman", + "Oksana Grigorieva", "The Elder Scrolls V", "Deadpool", "Green day", + "\"Red hot chili peppers\"", "Jennifer Bini Taylor", + "The Paradiso Girls", "Queen", "3Me4Ph", "Paloma Jimenez", "AUDI A4", + "Edith Bouvier Beale: A Life In Pictures", "\"Skylar James Deleon\"", + "Simple Explanation", "Juxtaposition", "The Woody Show", "London WITHER", + "In A Dark Place", "George Freeman", "LuAnn de Lesseps", "Muhammad.", + "U2", "List of countries by GDP", "Dean Martin Discography", "Web 3.0", + "List of American actors", "The Expendables", + "\"100 Greatest Guitarists of All Time\"", "Vince Offer.", + "\"List of ZIP Codes in the United 
States\"", "Blood type diet", + "Jennifer Gimenez", "List of hobbies", "The beatles", "Acdc", + "Nightwish", "Iron maiden", "Murder Was the Case", "Pelvic hernia", + "Naruto Shippuuden", "campaign", "Enthesopathy of hip region", + "operating system", "mouse", + "List of Xbox 360 games without region encoding", "Shakepearian sonnet", + "\"The Monday Night Miracle\"", "India", "Dad's Army", + "Solanum melanocerasum", "\"List of PlayStation Portable Wi-Fi games\"", + "Little Pixie Geldof", "Planes, Trains & Automobiles", "Freddy Ingalls", + "The Return of Chef", "Nehalem", "Turtle", "Calculus", "Superman-Prime", + "\"The Losers\"", "pen-pal", "Audio stream input output", "lifehouse", + "50 greatest gunners", "Polyfecalia", "freeloader", "The Filthy Youth" }; + + private static Query[] getPrebuiltQueries(String field) { + WildcardQuery wcq = new WildcardQuery(new Term(field, "fo*")); + wcq .setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); + // be wary of unanalyzed text + return new Query[] { + new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 5), + new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(field, "night")), + new SpanTermQuery(new Term(field, "trading")) }, 4, false), + new SpanNearQuery(new SpanQuery[] { + new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 10), + new SpanTermQuery(new Term(field, "credit")) }, 10, false), wcq, }; + } + + /** + * Parse the strings containing Lucene queries; elements that are already Query objects are used as-is. 
+ * + * @param qs list whose elements are query expression strings or prebuilt Query objects + * @param a analyzer to use when parsing queries + * @return array of Lucene queries; elements that fail to parse or have an unsupported type are skipped + */ + private static Query[] createQueries(List qs, Analyzer a) { + QueryParser qp = new QueryParser(DocMaker.BODY_FIELD, a); + List queries = new ArrayList(); + for (int i = 0; i < qs.size(); i++) { + try { + + Object query = qs.get(i); + Query q = null; + if (query instanceof String) { + q = qp.parse((String) query); + + } else if (query instanceof Query) { + q = (Query) query; + + } else { + System.err.println("Unsupported Query Type: " + query); + } + + if (q != null) { + queries.add(q); + } + + } catch (Exception e) { + e.printStackTrace(); + } + } + + return (Query[]) queries.toArray(new Query[0]); + } + + protected Query[] prepareQueries() throws Exception { + // analyzer (default is standard analyzer) + Analyzer anlzr = (Analyzer) Class.forName( + config.get("analyzer", StandardAnalyzer.class.getName())).newInstance(); + + List queryList = new ArrayList(20); + queryList.addAll(Arrays.asList(STANDARD_QUERIES)); + queryList.addAll(Arrays.asList(getPrebuiltQueries(DocMaker.BODY_FIELD))); + return createQueries(queryList, anlzr); + } + +}