From commits-return-22957-archive-asf-public=cust-asf.ponee.io@accumulo.apache.org Fri Jun 7 15:22:34 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id 639BF18067E for ; Fri, 7 Jun 2019 17:22:34 +0200 (CEST) Received: (qmail 9076 invoked by uid 500); 7 Jun 2019 15:22:33 -0000 Mailing-List: contact commits-help@accumulo.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@accumulo.apache.org Delivered-To: mailing list commits@accumulo.apache.org Received: (qmail 9067 invoked by uid 99); 7 Jun 2019 15:22:33 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 07 Jun 2019 15:22:33 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 833FD87A69; Fri, 7 Jun 2019 15:22:28 +0000 (UTC) Date: Fri, 07 Jun 2019 15:22:28 +0000 To: "commits@accumulo.apache.org" Subject: [accumulo-wikisearch] branch master updated: Upgrade to lucene 7.1.0 MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <155992094847.5263.12168609249311252942@gitbox.apache.org> From: mmiller@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: accumulo-wikisearch X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: c06e9d32690fe2cd00eb619339463252ffd425bc X-Git-Newrev: 134b78df77f45184f2193e6f5d822f1d99071f59 X-Git-Rev: 134b78df77f45184f2193e6f5d822f1d99071f59 X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. mmiller pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/accumulo-wikisearch.git The following commit(s) were added to refs/heads/master by this push: new 134b78d Upgrade to lucene 7.1.0 134b78d is described below commit 134b78df77f45184f2193e6f5d822f1d99071f59 Author: Mike Miller AuthorDate: Fri Jun 7 11:21:54 2019 -0400 Upgrade to lucene 7.1.0 --- ingest/pom.xml | 6 +- .../wikisearch/ingest/ArticleExtractor.java | 8 +- .../wikisearch/ingest/WikipediaConfiguration.java | 7 -- .../wikisearch/ingest/WikipediaMapper.java | 25 ++---- .../wikisearch/normalizer/NumberNormalizer.java | 42 ---------- .../normalizer/testNumberNormalizer.java | 90 ---------------------- pom.xml | 21 ++--- 7 files changed, 18 insertions(+), 181 deletions(-) diff --git a/ingest/pom.xml b/ingest/pom.xml index 295c8a8..1f6bc99 100644 --- a/ingest/pom.xml +++ b/ingest/pom.xml @@ -57,7 +57,7 @@ org.apache.lucene - lucene-wikipedia + lucene-analyzers-common org.apache.zookeeper @@ -84,8 +84,8 @@ prepare-package lib - - + <!– just grab the non-provided runtime dependencies –> + <!– XXX we include guava at the same version as hadoop 2 provides so that we have it on hadoop 1 –> commons-lang,guava,lucene-core,lucene-analyzers,lucene-wikipedia,protobuf-java,accumulo-core,hadoop-core,libthrift,zookeeper,commons-codec,accumulo-fate,accumulo-trace false diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java index 0699cfa..cda08d8 100644 --- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java +++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java @@ -31,7 +31,6 @@ import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer; -import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -39,7 +38,6 @@ import org.apache.hadoop.io.Writable; public class ArticleExtractor { public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z"); - private static NumberNormalizer nn = new NumberNormalizer(); private static LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer(); public static class Article implements Writable { @@ -91,9 +89,11 @@ public class ArticleExtractor { public Map getNormalizedFieldValues() { Map fields = new HashMap(); - fields.put("ID", nn.normalizeFieldValue("ID", this.id)); + //fields.put("ID", nn.normalizeFieldValue("ID", this.id)); + fields.put("ID", Integer.toString(this.id)); fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title)); - fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp)); + //fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp)); + fields.put("TIMESTAMP", Long.toString(this.timestamp)); fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments)); return fields; } diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java index 44a3fbc..05ce8d8 100644 --- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java +++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java @@ -28,7 +28,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.ReflectionUtils; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.SimpleAnalyzer; public class WikipediaConfiguration { public final static String INSTANCE_NAME = "wikipedia.accumulo.instance_name"; @@ -107,12 +106,6 @@ public class WikipediaConfiguration { return new Path(filename); } - public static Analyzer getAnalyzer(Configuration conf) throws IOException { - Class analyzerClass = - conf.getClass(ANALYZER, SimpleAnalyzer.class, Analyzer.class); - return ReflectionUtils.newInstance(analyzerClass, conf); - } - public static Connector getConnector(Configuration conf) throws AccumuloException, AccumuloSecurityException { return getInstance(conf).getConnector(getUser(conf), getPassword(conf)); diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java index c751637..c2fed03 100644 --- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java +++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java @@ -47,8 +47,8 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.log4j.Logger; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer; +import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; +import org.apache.lucene.util.Attribute; import com.google.common.collect.HashMultimap; import com.google.common.collect.Multimap; @@ -223,31 +223,18 @@ public class WikipediaMapper extends Mapper { /** * Tokenize the wikipedia content */ - static Set getTokens(Article article) throws IOException { + static Set getTokens(Article article) { Set tokenList = new HashSet<>(); - WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText())); - TermAttribute term = tok.addAttribute(TermAttribute.class); - try { + try (WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()))) { + Attribute term = tok.addAttribute(Attribute.class); while (tok.incrementToken()) { - String token = term.term(); + String token = term.toString(); if (!StringUtils.isEmpty(token)) { tokenList.add(token); } } } catch (IOException e) { log.error("Error tokenizing text", e); - } finally { - try { - tok.end(); - } catch (IOException e) { - log.error("Error calling end()", e); - } finally { - try { - tok.close(); - } catch (IOException e) { - log.error("Error closing tokenizer", e); - } - } } return tokenList; } diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java deleted file mode 100644 index e0a5cc8..0000000 --- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.accumulo.examples.wikisearch.normalizer; - -import org.apache.commons.lang.math.NumberUtils; -import org.apache.lucene.util.NumericUtils; - -public class NumberNormalizer implements Normalizer { - - public String normalizeFieldValue(String field, Object value) { - if (NumberUtils.isNumber(value.toString())) { - Number n = NumberUtils.createNumber(value.toString()); - if (n instanceof Integer) - return NumericUtils.intToPrefixCoded((Integer) n); - else if (n instanceof Long) - return NumericUtils.longToPrefixCoded((Long) n); - else if (n instanceof Float) - return NumericUtils.floatToPrefixCoded((Float) n); - else if (n instanceof Double) - return NumericUtils.doubleToPrefixCoded((Double) n); - else - throw new IllegalArgumentException("Unhandled numeric type: " + n.getClass()); - } else { - throw new IllegalArgumentException("Value is not a number: " + value); - } - } - -} diff --git a/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java b/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java deleted file mode 100644 index 470633c..0000000 --- a/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.accumulo.examples.wikisearch.normalizer; - -import static org.junit.Assert.assertTrue; - -import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer; -import org.junit.Test; - -public class testNumberNormalizer { - - @Test - public void test1() throws Exception { - NumberNormalizer nn = new NumberNormalizer(); - - String n1 = nn.normalizeFieldValue(null, "1"); - String n2 = nn.normalizeFieldValue(null, "1.00000000"); - - assertTrue(n1.compareTo(n2) < 0); - - } - - @Test - public void test2() { - NumberNormalizer nn = new NumberNormalizer(); - - String n1 = nn.normalizeFieldValue(null, "-1.0"); - String n2 = nn.normalizeFieldValue(null, "1.0"); - - assertTrue(n1.compareTo(n2) < 0); - - } - - @Test - public void test3() { - NumberNormalizer nn = new NumberNormalizer(); - String n1 = nn.normalizeFieldValue(null, "-0.0001"); - String n2 = nn.normalizeFieldValue(null, "0"); - String n3 = nn.normalizeFieldValue(null, "0.00001"); - - assertTrue((n1.compareTo(n2) < 0) && (n2.compareTo(n3) < 0)); - } - - @Test - public void test4() { - NumberNormalizer nn = new NumberNormalizer(); - String nn1 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE)); - String nn2 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE - 1)); - - assertTrue((nn2.compareTo(nn1) < 0)); - - } - - @Test - public void test5() { - NumberNormalizer nn = new NumberNormalizer(); - String nn1 = nn.normalizeFieldValue(null, "-0.001"); - String nn2 = nn.normalizeFieldValue(null, "-0.0009"); - String nn3 = nn.normalizeFieldValue(null, "-0.00090"); - - assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) > 0)); - - } - - @Test - public void test6() { - NumberNormalizer nn = new NumberNormalizer(); - String nn1 = nn.normalizeFieldValue(null, "00.0"); - String nn2 = nn.normalizeFieldValue(null, "0"); - String nn3 = nn.normalizeFieldValue(null, "0.0"); - - assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) == 0)); - - } - -} diff --git a/pom.xml b/pom.xml index 48c3c46..ba62cf0 100644 --- a/pom.xml +++ b/pom.xml @@ -54,9 +54,8 @@ 1.04 1.2.16 1.0 - 3.6.2 - 3.6.2 - 3.0.3 + 7.1.0 + 4.0.0 1.2 2.5.0 0.12.0 @@ -226,18 +225,8 @@ org.apache.lucene - lucene-wikipedia - ${version.lucene-wikipedia} - - - commons-digester - commons-digester - - - commons-logging - commons-logging - - + lucene-analyzers-common + ${version.lucene-analyzers} org.apache.thrift @@ -386,7 +375,7 @@ prepare-package ../../lib - + <!– just grab the non-provided runtime dependencies –> commons-collections,commons-configuration,commons-io,commons-lang,jline,log4j,libthrift,commons-jci-core,commons-jci-fam,commons-logging,commons-logging-api accumulo true