lucene-general mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Iraida <tokmola...@yandex.ru>
Subject Re: sorting frequencies
Date Wed, 06 Mar 2013 15:52:20 GMT
I'm not sure whether I'm doing it right.
Here is the code of the program:
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopwordAnalyzerBase.*;
import org.apache.lucene.document.Field.*;
import org.apache.lucene.document.*;
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;
import org.apache.lucene.util.Version;
import java.io.*;
import java.io.FileOutputStream.*;

public class JavaApplication1 {

public static File dataDir = new File("C:/filestoindex");  
public static   File indexDir = new File("C:/fileindex");    


   public static void index(File indexDir,File dataDir) throws IOException
   {
              if (!dataDir.exists() || !dataDir.isDirectory()) 
       {
           throw new IOException(dataDir + " does not exist or is not a
directory");
       }

      
       Analyzer ac=new StandardAnalyzer(Version.LUCENE_30);
       IndexWriter indexWriter = new
IndexWriter(FSDirectory.open(indexDir),ac, true,
IndexWriter.MaxFieldLength.UNLIMITED);
       indexDirectory(indexWriter, dataDir);   
       indexWriter.close();
    }
  
   
    private static void indexDirectory(IndexWriter writer, File dir)      
            throws IOException {
        File[] files = dir.listFiles();
        for (int i = 0; i < files.length; i++)
        {            File f = files[i];
        if (f.isDirectory()) 
        {                indexDirectory(writer, f); 
        } 
              
        indexFile(writer, f);
        
        }
    }
    
    private static void indexFile(IndexWriter writer, File f) throws
IOException
    {  
       System.out.println("Индексация " + f.getName());
       Document doc = new Document();
       doc.add(new Field("contents" , new FileReader(f),
Field.TermVector.YES));
       doc.add(new Field("filename", f.getName(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
       doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
  
        writer.addDocument(doc);
    }
    
   

    public static void main(String[] args) throws Exception 
    {
         
     index(indexDir, dataDir);
    
     IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
     FileOutputStream fr = new FileOutputStream("C:/fileout/f.txt");
     for (int docNum=0; docNum<reader.numDocs(); docNum++) {
        
        TermFreqVector tfv = reader.getTermFreqVector(docNum, "contents");
        if (tfv == null) {
        continue;
        }
        String terms[] = tfv.getTerms();
        int termCount = terms.length;
        int freqs[] = tfv.getTermFrequencies();

        for (int t=0; t < termCount; t++) {
        String
st=reader.document(docNum).getField("filename").stringValue()+" "+ terms[t]
+ " " +freqs[t]+ "\r\n";
       
        fr.write(st.getBytes("UTF-8") );
    
        System.out.println(
reader.document(docNum).getField("filename").stringValue()+" "+ terms[t] + "
" + freqs[t]);
        }
    }fr.close();
  
  

    
    }    
    
    
  }




--
View this message in context: http://lucene.472066.n3.nabble.com/sorting-frequencies-tp4045197p4045249.html
Sent from the Lucene - General mailing list archive at Nabble.com.

Mime
View raw message