mahout-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Thomas Quenolle <tqueno...@multiposting.fr>
Subject Re: Mahout 0.7 API Naive Bayes
Date Wed, 17 Oct 2012 07:31:42 GMT
Hi, I also struggled with this. Here is some code from my classifier; I hope
you will find it helpful.
The key point is to use the dictionary you created while training your model
on your data. I modified mine to also record the total number of docs.


private void loadTermDictionary(InputStream is) throws IOException {
123         /**
124          * Read and load a dictionary file.
125          * Retrieving:
126          *   Number of documents used for tfidf computation.
127          * Retrieving for each word:
128          *   the feature's index used in the vectors.
129          *   it's document frequency (the number of docs it appears in).
130          * Inspired by VectorHelper.class
131          */
132
133         FileLineIterator it = new FileLineIterator(is);
134
135         int numEntries = Integer.parseInt(it.next());
136         dictMap = new HashMap<String, Integer>();
137         docFreqMap = new HashMap<Integer, Integer>();
138
139         while (it.hasNext()) {
140             String line = it.next();
141             if (line.startsWith("#")) {
142                 if (line.startsWith("#numDocs")) {
143                     this.numDocs =
Integer.parseInt(SPACE.split(line)[1]);
144                 }
145                 continue;
146             }
147             String[] tokens = TAB_PATTERN.split(line);
148             // tokens[0] is the word
149             // tokens[1] is the doc freq
150             // tokens[2] is the feature index
151             if (tokens.length < 3) {
152                 continue;
153             }
154             int index = Integer.parseInt(tokens[2]);
155             int docfreq = Integer.parseInt(tokens[1]);
156             // Saving mapping word -> feature index
157             if (!dictMap.containsKey(tokens[0]))
158                 dictMap.put(tokens[0], new Integer(index));
159             // Saving mapping feature index -> doc freq
160             if (!docFreqMap.containsKey(tokens[0]))
161                 docFreqMap.put(new Integer(index), new
Integer(docfreq));
162         }
163     }



230         private String classify(String[] ts) {
231             /**
232              * Return the guessed category's label.
233              * Term Frequency computation.
234              * TFIDF weight computation.
235              * Classification based on a model.
236              * The best score is returned.
237              */
238
239             Map<Integer, Integer> termFreqs = new HashMap<Integer,
Integer>();
240             for (int k = 0; k<ts.length; k++) {
241                 String val = ts[k];
242                 Integer index = dictMap.get(val);
243                 if (index != null) {
244                     if (termFreqs.containsKey(index)) {
245                         termFreqs.put(index, termFreqs.get(index) + new
Integer(1));
246                     } else {
247                         termFreqs.put(index, new Integer(1));
248                     }
249                 }
250             }
251             Vector vec = new RandomAccessSparseVector((int)
termFreqs.size());
252             for (Integer idx: termFreqs.keySet()) {
253                 double termWeight =
weight.calculate((int)termFreqs.get(idx), (int) docFreqMap.get(idx), 0,
numDocs);
254                 vec.setQuick((int) idx, termWeight);
255             }
256             Vector scores = classifier.classifyFull(vec.normalize());
257             int bestIdx = Integer.MIN_VALUE;
258             double bestScore = Long.MIN_VALUE;
259             for (Iterator<Vector.Element> score = scores.iterator();
score.hasNext();) {
260                 Vector.Element element = score.next();
261                 if (element.get() > bestScore) {
262                     bestScore = element.get();
263                     bestIdx = element.index();
264                 }
265             }
266             if (debug)
267                 System.out.println("Classified as: " +
labelMap.get(bestIdx));
268
269             return labelMap.get(bestIdx);
270         }


2012/10/17 Sarath P R <sarath.amrita@gmail.com>

> Hi Jaggu,
>
> I am also working with 0.7 .  I too tried input vector to the classifier .
> Vector nbResult = nbClassifier.classifyFull(getVector());
>
> But don't know how to get the correct label. I got the following piece of
> code from
>
> /mahout-distribution-0.7/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
> for analyzing result.
>
> private static void analyzeResults(Map<Integer, String> labelMap,
>                                      SequenceFileDirIterable<Text,
> VectorWritable> dirIterable,
>                                      ResultAnalyzer analyzer) {
>     for (Pair<Text, VectorWritable> pair : dirIterable) {
>       int bestIdx = Integer.MIN_VALUE;
>       double bestScore = Long.MIN_VALUE;
>       for (Vector.Element element : pair.getSecond().get()) {
>         if (element.get() > bestScore) {
>           bestScore = element.get();
>           bestIdx = element.index();
>         }
>       }
>       if (bestIdx != Integer.MIN_VALUE) {
>         ClassifierResult classifierResult = new
> ClassifierResult(labelMap.get(bestIdx), bestScore);
>         analyzer.addInstance(pair.getFirst().toString(), classifierResult);
>       }
>     }
>   }
>
> But couldn't get the correct label.
>
>
>
> On Thu, Oct 11, 2012 at 3:33 PM, JAGANADH G <jaganadhg@gmail.com> wrote:
>
> > Hi
> >
> > I just created a sample use class of NaiveBayes . Can somebody say
> wheather
> > I am in the right track or not
> >
> > Here is my code
> >
> > public class NaiveBayesClassifierExample {
> >
> > public static void loadClassifier(String strModelPath, Vector v)
> > throws IOException {
> >  Configuration conf = new Configuration();
> >
> > NaiveBayesModel model = NaiveBayesModel.materialize(new Path(
> >  strModelPath), conf);
> > AbstractNaiveBayesClassifier classifier = new
> StandardNaiveBayesClassifier(
> >  model);
> >
> > Vector st = classifier.classifyFull(v);
> >  System.out.println(st.toString());
> >  }
> >
> > public static Vector createVect() throws IOException {
> > FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
> >  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
> > StringReader in = new StringReader(
> >  "The movie sherk was very cool and attractive one. We like the movie"
> > + "because of the theme and directon. All the actores were excellent");
> >
> > TokenStream ts = analyzer.tokenStream("body", in);
> >
> > CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
> >  Vector v1 = new RandomAccessSparseVector(100000);
> >
> > while (ts.incrementToken()) {
> >  char[] termBuffer = termAtt.buffer();
> > int termLen = termAtt.length();
> >  String w = new String(termBuffer, 0, termLen);
> > encoder.addToVector(w, 1.0, v1);
> >  }
> >  v1.normalize();
> >  return v1;
> > }
> >
> > public static void main(String[] args) throws IOException {
> >  Vector v = createVect();
> > String mp =
> > "/home/u179995/Downloads/mahout-distribution-0.7/playg/movie_model";
> >  loadClassifier(mp, v);
> > }
> > }
> >
> > --
> > **********************************
> > JAGANADH G
> > http://jaganadhg.in
> > *ILUGCBE*
> > http://ilugcbe.org.in
> >
>
>
>
> --
> Thank You
> Sarath P R | cell +91 99 95 02 4287 | http://sprism.blogspot.com
>



-- 

*Thomas Quenolle*  |  Chef de Projet
Direct : +33 6 80 84 26 58
www.multiposting.fr
Standard : +33 1 42 72 57 84   |   Fax : +33 1 73 76 93 23


Rejoignez-nous sur Facebook <http://www.facebook.com/multiposting>
Suivez-nous sur Twitter <http://twitter.com/multiposting>

 N’imprimez cet email qu’en cas de nécessité / Please do not print this
email unless necessary

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message