Return-Path: Delivered-To: apmail-jakarta-lucene-user-archive@www.apache.org Received: (qmail 57875 invoked from network); 24 Aug 2004 17:19:32 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur-2.apache.org with SMTP; 24 Aug 2004 17:19:32 -0000 Received: (qmail 83199 invoked by uid 500); 24 Aug 2004 17:19:02 -0000 Delivered-To: apmail-jakarta-lucene-user-archive@jakarta.apache.org Received: (qmail 83156 invoked by uid 500); 24 Aug 2004 17:19:02 -0000 Mailing-List: contact lucene-user-help@jakarta.apache.org; run by ezmlm Precedence: bulk List-Unsubscribe: List-Subscribe: List-Help: List-Post: List-Id: "Lucene Users List" Reply-To: "Lucene Users List" Delivered-To: mailing list lucene-user@jakarta.apache.org Received: (qmail 83030 invoked by uid 99); 24 Aug 2004 17:19:00 -0000 X-ASF-Spam-Status: No, hits=1.6 required=10.0 tests=DNS_FROM_RFC_ABUSE,HTML_40_50,HTML_MESSAGE,MSGID_FROM_MTA_HEADER,RCVD_IN_NJABL_PROXY,RCVD_IN_SORBS_MISC X-Spam-Check-By: apache.org Received: from [203.199.83.28] (HELO rediffmail.com) (203.199.83.28) by apache.org (qpsmtpd/0.27.1) with SMTP; Tue, 24 Aug 2004 10:18:59 -0700 Received: (qmail 11129 invoked by uid 510); 24 Aug 2004 13:55:09 -0000 Date: 24 Aug 2004 13:55:09 -0000 Message-ID: <20040824135509.11128.qmail@webmail18.rediffmail.com> Received: from unknown (61.11.82.240) by rediffmail.com via HTTP; 24 aug 2004 13:55:09 -0000 MIME-Version: 1.0 From: "sivalingam T" Reply-To: "sivalingam T" To: lucene-user@jakarta.apache.org Cc: viji_anbumani@yahoo.com Subject: PDF indexing Content-type: multipart/alternative; boundary="Next_1093355709---0-203.199.83.28-11108" X-Virus-Checked: Checked X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N --Next_1093355709---0-203.199.83.28-11108 Content-type: text/plain; charset=iso-8859-1 Content-Transfer-Encoding: quoted-printable Content-Disposition: inline =A0Hi=0A=0AI have written a file for PDF indexing. I have written it as follows: 
=0A=0AThis is my IndexPDF file.=0A=0Aimport org.apache.lucen= e.analysis.standard.StandardAnalyzer;=0Aimport org.apache.lucene.document.D= ocument;=0Aimport org.apache.lucene.index.IndexReader;=0Aimport org.apache.= lucene.index.IndexWriter;=0Aimport org.apache.lucene.index.Term;=0Aimport o= rg.apache.lucene.index.TermEnum;=0A=0Aimport org.pdfbox.searchengine.lucene= .LucenePDFDocument;=0A=0Aimport java.io.File;=0Aimport java.util.Date;=0Aim= port java.util.Arrays;=0A=0Aclass IndexPDF {=0A private static boolean del= eting =3D false; // true during deletion pass=0A private static Inde= xReader reader; // existing index=0A private static IndexWriter= writer; // new index being built=0A private static TermEnum ui= dIter; // document id iterator=0A=0A public static void main(St= ring[] argv) {=0A try {=0A String index =3D "index";=0A boolea= n create =3D false;=0A File root =3D null;=0A=0A String usage =3D= "IndexHTML [-create] [-index ] ";=0A=0A if (ar= gv.length =3D=3D 0) {=0A System.err.println("Usage: " + usage);=0A = return;=0A }=0A=0A for (int i =3D 0; i < argv.length; i++) {=0A = if (argv[i].equals("-index")) { // parse -index option=0A = index =3D argv[++i];=0A } else if (argv[i].equals("-create")) { = // parse -create option=0A create =3D true;=0A } else if (i != =3D argv.length-1) {=0A System.err.println("Usage: " + usage);=0A = return;=0A } else=0A root =3D new File(argv[i]);=0A }=0A= =0A Date start =3D new Date();=0A=0A if (!create) { = // delete stale docs=0A deleting =3D true;=0A indexDocs(roo= t, index, create);=0A }=0A=0A writer =3D new IndexWriter(index, n= ew StandardAnalyzer(), create);=0A writer.maxFieldLength =3D 1000000;= =0A=0A indexDocs(root, index, create); // add new docs=0A= =0A System.out.println("Optimizing index...");=0A writer.optimize= ();=0A writer.close();=0A=0A Date end =3D new Date();=0A=0A = System.out.print(end.getTime() - start.getTime());=0A System.out.print= ln(" total milliseconds");=0A=0A } catch (Exception e) {=0A System.= 
out.println(" caught a " + e.getClass() +=0A "\n with messag= e: " + e.getMessage());=0A }=0A }=0A=0A /* Walk directory hierarchy in= uid order, while keeping uid iterator from=0A /* existing index in sync. = Mismatches indicate one of: (a) old documents to=0A /* be deleted; (b) un= changed documents, to be left alone; or (c) new=0A /* documents, to be ind= exed.=0A */=0A=0A private static void indexDocs(File file, String index, = boolean create)=0A throws Exception {=0A if (!create) { = // incrementally update=0A=0A reader =3D IndexReader.open(in= dex); // open existing index=0A uidIter =3D reader.terms(ne= w Term("uid", "")); // init uid iterator=0A=0A indexDocs(file);=0A=0A = if (deleting) { // delete rest of stale docs=0A = while (uidIter.term() !=3D null && uidIter.term().field() =3D=3D "uid") = {=0A System.out.println("deleting " +=0A HTMLDocume= nt.uid2url(uidIter.term().text()));=0A reader.delete(uidIter.term());= =0A uidIter.next();=0A }=0A deleting =3D false;=0A }=0A= =0A uidIter.close(); // close uid iterator=0A = reader.close(); // close existing index=0A=0A } e= lse // don't have exisiting=0A indexDocs(fil= e);=0A }=0A=0A private static void indexDocs(File file) throws Exception= =0A {=0A if (file.isDirectory())=0A { // if a direc= tory=0A String[] files =3D file.list(); // list its files= =0A Arrays.sort(files); // sort the files=0A for = (int i =3D 0; i < files.length; i++)=0A { // recursively index th= em=0A indexDocs(new File(file, files[i]));=0A }=0A=0A } =0A= if ((file.getPath().endsWith(".pdf" )) || (file.getPath().endsWith(".P= DF" )))=0A {=0A System.out.println( "Indexing PDF document: "= + file );=0A try=0A {=0A //Document= doc =3D LucenePDFDocument.getDocument( file );=0A writer.addDoc= ument(LucenePDFDocument.getDocument( file));=0A }=0A = catch(Exception e)=0A {}=0A }=0A =0A }=0A =0A= }=0A=0AWhen I use the following command, the exception shown below is thrown. If anybody knows the cause, please inform me.=0A=0A=0AC:\>java org.apache.lucene.demo.IndexPDF= 
-create -index c:\lucene\pdf c:\pdfs\Words.pdf=0A=0AIndexing PDF document:= c:\pdfs\Words.pdf=0AException in thread "main" java.lang.NoClassDefFoundEr= ror: org/apache/log4j/Cate=0Agory=0A at org.pdfbox.searchengine.luce= ne.LucenePDFDocument.addContent(LucenePDF=0ADocument.java:197)=0A at= org.pdfbox.searchengine.lucene.LucenePDFDocument.getDocument(LucenePD=0AFD= ocument.java:118)=0A at org.apache.lucene.demo.IndexPDF.indexDocs(Un= known Source)=0A at org.apache.lucene.demo.IndexPDF.indexDocs(Unknow= n Source)=0A at org.apache.lucene.demo.IndexPDF.main(Unknown Source)= =0A=0A=0A=0AThanks.=0A=0A=0A=0A=0A=0A=0AWith Warm Regards,=0D=0ASivalingam.= T=0D=0A=0D=0ASai Eswar Innovations (P) Ltd,=0D=0AChennai-92 --Next_1093355709---0-203.199.83.28-11108--