lucene-general mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Iraida <tokmola...@yandex.ru>
Subject TermFreqVector for the Russian documents
Date Thu, 28 Mar 2013 14:59:04 GMT
Hello! Please help me!
TermFreqVector is not working in my program for Russian documents,
even though I use RussianAnalyzer.
The program sorts the words in each document by frequency.
public class JavaApplication1 {

public static File dataDir = new File("C:/filestoindex");  
public static   File indexDir = new File("C:/fileindex");    


   public static void index(File indexDir,File dataDir) throws IOException
   {
              if (!dataDir.exists() || !dataDir.isDirectory()) 
       {
           throw new IOException(dataDir + " does not exist or is not a
directory");
       }

      
   
       Analyzer a=new RussianAnalyzer(Version.LUCENE_30);
       IndexWriter indexWriter = new
IndexWriter(FSDirectory.open(indexDir),a, true, 
IndexWriter.MaxFieldLength.UNLIMITED);
       indexDirectory(indexWriter, dataDir);   
       indexWriter.close(); 
    }
  
   
    private static void indexDirectory(IndexWriter writer, File dir)      
            throws IOException {
        File[] files = dir.listFiles();
        for (int i = 0; i < files.length; i++)
        {            File f = files[i];
        if (f.isDirectory()) 
        {                indexDirectory(writer, f); 
        } 
              
        indexFile(writer, f);
        
        }
    }
    
    private static void indexFile(IndexWriter writer, File f) throws
IOException
    {  
       System.out.println("Индексация " + f.getName());
       Document doc = new Document();
       doc.add(new Field("contents" , new FileReader(f), 
Field.TermVector.YES));
       doc.add(new Field("filename", f.getName(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
       doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED));
  
        writer.addDocument(doc);
    }
    
   

    public static void main(String[] args) throws Exception 
    {
         
     index(indexDir, dataDir);
   // File indexDirr = new File("C:/indexMaterial");  
     IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
      
     for (int docNum=0; docNum<reader.numDocs(); docNum++)
     {
        HashMap&lt;String,Integer> totalTfv = new
HashMap<String,Integer>(1024);
        TermFreqVector tfv = reader.getTermFreqVector(docNum, "contents");
        if (tfv == null) 
        {
        continue;
        }
        String terms[] = tfv.getTerms();
        int termCount = terms.length;
        int freqs[] = tfv.getTermFrequencies();
        
        for (int t=0; t < termCount; t++)
        {
        String term = terms[t];
        int freq = freqs[t];
        Integer totalFreq = totalTfv.get(term);
        totalFreq = (totalFreq == null) ? freq : freq + totalFreq;
        totalTfv.put(term, totalFreq);
         }
        
        List<Entry&lt;String, Integer>> entries = new
ArrayList<Entry&lt;String, Integer>>(totalTfv.entrySet());
        Collections.sort(entries, new Comparator<Entry&lt;String,
Integer>>() {
            @Override
            public int compare(Entry<String, Integer> e1, Entry<String,
Integer> e2) {
                int v1 = e1.getValue(); // can be NPE when autounboxing
                int v2 = e2.getValue();
                return (v1 < v2) ? 1 : (v1 == v2) ? 0 : -1;
            }
        });
         
        String adres="C:/fileout/"
+"out"+reader.document(docNum).getField("filename").stringValue()+".txt";
        FileOutputStream fr = new FileOutputStream(adres);
         
        for (Entry<String, Integer> e : entries) 
        {
       
System.out.println(reader.document(docNum).getField("filename").stringValue()+"
"+e.getKey() + "/" + e.getValue());
        String
st=reader.document(docNum).getField("filename").stringValue()+" "+
e.getKey() + "/" + e.getValue()+ "\r\n";
        fr.write(st.getBytes("UTF-8") );
        }
    
       fr.close(); 
 
    }

 
 
     
  }    
    
}



--
View this message in context: http://lucene.472066.n3.nabble.com/TermFreqVector-for-the-Russian-documents-tp4052069.html
Sent from the Lucene - General mailing list archive at Nabble.com.

Mime
View raw message