lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Karl Wettin (JIRA)" <j...@apache.org>
Subject [jira] Created: (LUCENE-1039) Bayesian classifiers using Lucene as data store
Date Tue, 30 Oct 2007 23:46:51 GMT
Bayesian classifiers using Lucene as data store
-----------------------------------------------

                 Key: LUCENE-1039
                 URL: https://issues.apache.org/jira/browse/LUCENE-1039
             Project: Lucene - Java
          Issue Type: New Feature
            Reporter: Karl Wettin
            Priority: Minor


Bayesian classifiers using Lucene as the data store, based on the Naive Bayes and Fisher's method
algorithms as described by Toby Segaran in "Programming Collective Intelligence", ISBN 978-0-596-52932-1.


Have fun.

The Javadocs are sparse, but the TestCase shows how to use it:

{code:java}
public class TestClassifier extends TestCase {

  public void test() throws Exception {

    InstanceFactory instanceFactory = new InstanceFactory() {

      public Document factory(String text, String _class) {
        Document doc = new Document();
        doc.add(new Field("class", _class, Field.Store.YES, Field.Index.NO_NORMS));

        doc.add(new Field("text", text, Field.Store.YES, Field.Index.NO, Field.TermVector.NO));

        doc.add(new Field("text/ngrams/start", text, Field.Store.NO, Field.Index.TOKENIZED,
Field.TermVector.YES));
        doc.add(new Field("text/ngrams/inner", text, Field.Store.NO, Field.Index.TOKENIZED,
Field.TermVector.YES));
        doc.add(new Field("text/ngrams/end", text, Field.Store.NO, Field.Index.TOKENIZED,
Field.TermVector.YES));
        return doc;
      }

      Analyzer analyzer = new Analyzer() {
        private int minGram = 2;
        private int maxGram = 3;

        public TokenStream tokenStream(String fieldName, Reader reader) {
          TokenStream ts = new StandardTokenizer(reader);
          ts = new LowerCaseFilter(ts);
          if (fieldName.endsWith("/ngrams/start")) {
            ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, minGram, maxGram);
          } else if (fieldName.endsWith("/ngrams/inner")) {
            ts = new NGramTokenFilter(ts, minGram, maxGram);
          } else if (fieldName.endsWith("/ngrams/end")) {
            ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK, minGram, maxGram);
          }
          return ts;
        }
      };

      public Analyzer getAnalyzer() {
        return analyzer;
      }
    };

    Directory dir = new RAMDirectory();
    new IndexWriter(dir, null, true).close();

    Instances instances = new Instances(dir, instanceFactory, "class");

    instances.addInstance("hello world", "en");
    instances.addInstance("hallå världen", "sv");

    instances.addInstance("this is london calling", "en");
    instances.addInstance("detta är london som ringer", "sv");

    instances.addInstance("john has a long mustache", "en");
    instances.addInstance("john har en lång mustache", "sv");

    instances.addInstance("all work and no play makes jack a dull boy", "en");
    instances.addInstance("att bara arbeta och aldrig leka gör jack en trist gosse", "sv");

    instances.addInstance("shrimp sandwich", "en");
    instances.addInstance("räksmörgås", "sv");

    instances.addInstance("it's now or never", "en");
    instances.addInstance("det är nu eller aldrig", "sv");

    instances.addInstance("to tie up at a landing-stage", "en");
    instances.addInstance("att angöra en brygga", "sv");

    instances.addInstance("it's now time for the children's television shows", "en");
    instances.addInstance("nu är det dags för barnprogram", "sv");

    instances.flush();

    testClassifier(instances, new NaiveBayesClassifier());
    testClassifier(instances, new FishersMethodClassifier());

    instances.close();
  }

  private void testClassifier(Instances instances, BayesianClassifier classifier) throws IOException
{

    assertEquals("sv", classifier.classify(instances, "detta blir ett test")[0].getClassification());
    assertEquals("en", classifier.classify(instances, "this will be a test")[0].getClassification());

    // test training data instances. all ought to match!
    for (int documentNumber = 0; documentNumber < instances.getIndexReader().maxDoc();
documentNumber++) {
      if (!instances.getIndexReader().isDeleted(documentNumber)) {
        Map<Term, Double> features = instances.extractFeatures(instances.getIndexReader(),
documentNumber, classifier.isNormalized());
        Document document = instances.getIndexReader().document(documentNumber);
        assertEquals(document.get("class"), classifier.classify(instances, features)[0].getClassification());
      }
    }
  }

{code}

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Mime
View raw message