lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "bbrown" <bbr...@botspiritcompany.com>
Subject Can't code to index documents
Date Wed, 14 Nov 2007 20:23:34 GMT
I am using this code which is pretty basic.  And it won't index the documents.
 I run the index code and print the document to make sure that it gets
indexed, but when I looked at the output "gen" and "segments" file, there are
only like 20bytes of data in the files.  I am indexing about 300k of text
data.  I am using scala but I dont think that is an issue as I have used
similar code before.  When I do a search 0 documents for everything.

I am using Lucene 2.2.0 (just downloaded).

Here is the core of my code:

  def indexData(writer:IndexWriter, file: File) {
    val doc = new LucDocument()
	
	// Read the content from the file
	val contentReader = new ContentReader(file.getAbsolutePath)
    val (title, content) = contentReader.readFile()

	// Extract data from the java File class
	val link = new DocumentLink(file.getAbsolutePath, file.getName,
							content, file.getAbsolutePath)
	
	// Index the document and data.
    doc.add(new Field(LUC_KEY_FULL_PATH, link.fullPath, 
					  Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_FILE_NAME, link.filename, 
					  Field.Store.YES, Field.Index.TOKENIZED))
	doc.add(new Field(LUC_KEY_CONTENT, link.content, 
					  Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_IDENTITY, link.id, 
					  Field.Store.YES, Field.Index.UN_TOKENIZED))
    writer.addDocument(doc)
  }


But if you want the full scala source, here it is.  Think of it as pseudo code:

object BotlistIndexDocuments {
   
  val LUC_KEY_FULL_PATH = "full_path"
  val LUC_KEY_FILE_NAME = "file_name"
  val LUC_KEY_CONTENT = "content"
  val LUC_KEY_IDENTITY = "id"
  
    //
  // Read the content file.  The first line should contain
  // a "#title summary" line and the rest of the document
  // will contain the "wiki" document.
  class ContentReader(filename: String) {
    def readFile(): (String, String) = {
      val file = Source.fromFile(filename)
      var counted = file.getLines.counted
      val fileData = new StringBuilder()
      var title = ""
      counted.foreach { (line: String) =>
		if (counted.count == 0) {
	      //title = line.substring(6).trim()
		  title = line
		} else { 
	      fileData.append(line)
		}
      }
      (title, fileData.toString())
    }
  } // End of Class //

  case class DocumentLink(abs_path: String, file: String, data: String,
						  unique_id:String) {
    val fullPath = abs_path
    val filename = file
	val content = data
    val id = unique_id
  }
  def indexData(writer:IndexWriter, file: File) {
    val doc = new LucDocument()
	
	// Read the content from the file
	val contentReader = new ContentReader(file.getAbsolutePath)
    val (title, content) = contentReader.readFile()

	// Extract data from the java File class
	val link = new DocumentLink(file.getAbsolutePath, file.getName,
							content, file.getAbsolutePath)
	
	// Index the document and data.
    doc.add(new Field(LUC_KEY_FULL_PATH, link.fullPath, 
					  Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_FILE_NAME, link.filename, 
					  Field.Store.YES, Field.Index.TOKENIZED))
	doc.add(new Field(LUC_KEY_CONTENT, link.content, 
					  Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(new Field(LUC_KEY_IDENTITY, link.id, 
					  Field.Store.YES, Field.Index.UN_TOKENIZED))
    writer.addDocument(doc)
  }
  
  //
  // Utility for recursively walking directory tree
  // See:
  // override final def flatMap  [B](f : (A) => Iterable[B]) : List[B]
  class DocWalkFile(file: File) {  
	def children = new Iterable[File] {
    def elements = 
      if (file.isDirectory) file.listFiles.elements else Iterator.empty;
	}
	def andTree : Iterable[File] = (
      Seq.single(file) ++ children.flatMap(child => new
DocWalkFile(child).andTree))
  }
  def listDocuments(dir: File): List[File] =
	(new DocWalkFile(dir)).andTree.toList filter (f =>
(f.getName.endsWith(".java") || f.getName.endsWith(".txt")))
	  
  def indexDocuments(index_dir: File, files: List[File]) {
	Console.println("INFO: number of files to index=" + files.length)
    val writer = new IndexWriter(index_dir, new StandardAnalyzer(), true)
    for (val file <- files) {
      indexData(writer, file)
    }
  } 
  def main(args: Array[String]): Unit = {
    
    if (args.length != 2) {
      Console.println("usage: java BotlistIndexDocuments parent-index-dir
input-doc-dir")
	  Console.println("\n")
	  Console.println("\nRun the BotlistIndexDocuments index tool on the provided
index directory.")
	  Console.println("\nFor bug reporting instructions, please see:")
	  Console.println("<URL:http://code.google.com/p/openbotlist>.")
      return
    }

    Console.println("INFO: Indexing Document Data <standby> ...")
    val index = new File(args(0) + "/index")
	val doc_dir = new File(args(1)) 
    if (!index.exists()) {
      index.mkdir();
      Console.println("Creating index directory.")
    } else {
      Console.println("WARN: Index already exists (remove directory to continue)")
      Console.println("DIR: " + index.getAbsolutePath())
      //return
    }
	
	// Calculate the processing time to run application
    val timeStart = System.currentTimeMillis()
    indexDocuments(index, (listDocuments(doc_dir)))
    val timeEnd = System.currentTimeMillis()
    Console.println("Done...")
    Console.println("Completed processing in " + (timeEnd - timeStart) + " ms.")
  }
}

--
Berlin Brown
[berlin dot brown at gmail dot com]
http://botspiritcompany.com/botlist/?


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message