Return-Path: Delivered-To: apmail-mahout-commits-archive@www.apache.org Received: (qmail 9199 invoked from network); 28 Sep 2010 06:20:42 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 28 Sep 2010 06:20:42 -0000 Received: (qmail 58727 invoked by uid 500); 28 Sep 2010 06:20:41 -0000 Delivered-To: apmail-mahout-commits-archive@mahout.apache.org Received: (qmail 58602 invoked by uid 500); 28 Sep 2010 06:20:38 -0000 Mailing-List: contact commits-help@mahout.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@mahout.apache.org Delivered-To: mailing list commits@mahout.apache.org Received: (qmail 58595 invoked by uid 99); 28 Sep 2010 06:20:37 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 28 Sep 2010 06:20:37 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 28 Sep 2010 06:20:36 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 7B7DE23888E8; Tue, 28 Sep 2010 06:20:16 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1002033 - /mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java Date: Tue, 28 Sep 2010 06:20:16 -0000 To: commits@mahout.apache.org From: tdunning@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100928062016.7B7DE23888E8@eris.apache.org> Author: tdunning Date: Tue Sep 28 06:20:16 2010 New Revision: 1002033 URL: http://svn.apache.org/viewvc?rev=1002033&view=rev Log: got rid of final declarations to avoid style complaints and keep from SHOUTING Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java?rev=1002033&r1=1002032&r2=1002033&view=diff ============================================================================== --- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java (original) +++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java Tue Sep 28 06:20:16 2010 @@ -18,9 +18,11 @@ package org.apache.mahout.classifier.sgd; import com.google.common.collect.ConcurrentHashMultiset; +import com.google.common.collect.HashMultiset; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Multiset; +import com.google.common.collect.Ordering; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -116,16 +118,16 @@ public final class TrainNewsGroups { new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss") }; - private static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_30); - private static final FeatureVectorEncoder ENCODER = new StaticWordValueEncoder("body"); - private static final FeatureVectorEncoder BIAS = new ConstantValueEncoder("Intercept"); - - private TrainNewsGroups() { - } + private static Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); + private static FeatureVectorEncoder encoder = new StaticWordValueEncoder("body"); + private static FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept"); + private static Multiset<String> overallCounts; public static void main(String[] args) throws IOException { File base = new File(args[0]); + overallCounts = HashMultiset.create(); + int leakType = 0; if (args.length > 1) { leakType = Integer.parseInt(args[1]); @@ -133,7 +135,7 @@ public final class 
TrainNewsGroups { Dictionary newsGroups = new Dictionary(); - ENCODER.setProbes(2); + encoder.setProbes(2); AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1()); learningAlgorithm.setInterval(800); learningAlgorithm.setAveragingWindow(500); @@ -215,6 +217,18 @@ public final class TrainNewsGroups { learningAlgorithm.close(); dissect(leakType, newsGroups, learningAlgorithm, files); System.out.println("exiting main"); + + List<Integer> counts = Lists.newArrayList(); + System.out.printf("Word counts\n"); + for (String count : overallCounts.elementSet()) { + counts.add(overallCounts.count(count)); + } + Collections.sort(counts, Ordering.natural().reverse()); + k = 0; + for (Integer count : counts) { + System.out.printf("%d\t%d\n", k, count); + k++; + } } private static void dissect(int leakType, @@ -227,8 +241,8 @@ public final class TrainNewsGroups { Map<String, Set<Integer>> traceDictionary = Maps.newTreeMap(); ModelDissector md = new ModelDissector(); - ENCODER.setTraceDictionary(traceDictionary); - BIAS.setTraceDictionary(traceDictionary); + encoder.setTraceDictionary(traceDictionary); + bias.setTraceDictionary(traceDictionary); for (File file : permute(files, rand).subList(0, 500)) { String ng = file.getParentFile().getName(); @@ -254,7 +268,7 @@ public final class TrainNewsGroups { try { String line = reader.readLine(); Reader dateString = new StringReader(DATE_FORMATS[leakType % 3].format(new Date(date))); - countWords(ANALYZER, words, dateString); + countWords(analyzer, words, dateString); while (line != null && line.length() > 0) { boolean countHeader = ( line.startsWith("From:") || line.startsWith("Subject:") || @@ -262,22 +276,22 @@ public final class TrainNewsGroups { do { Reader in = new StringReader(line); if (countHeader) { - countWords(ANALYZER, words, in); + countWords(analyzer, words, in); } line = reader.readLine(); } while (line.startsWith(" ")); } if (leakType < 3) { - countWords(ANALYZER, words, reader); + countWords(analyzer, 
words, reader); } } finally { reader.close(); } Vector v = new RandomAccessSparseVector(FEATURES); - BIAS.addToVector("", 1, v); + bias.addToVector("", 1, v); for (String word : words.elementSet()) { - ENCODER.addToVector(word, Math.log(1 + words.count(word)), v); + encoder.addToVector(word, Math.log(1 + words.count(word)), v); } return v; @@ -290,6 +304,7 @@ public final class TrainNewsGroups { String s = ts.getAttribute(TermAttribute.class).term(); words.add(s); } + overallCounts.addAll(words); } private static List<File> permute(Iterable<File> files, Random rand) {