+/**
+ * Design Overview
+ *
+ * deleteDocuments() method works by buffering terms to be deleted. Deletes are
+ * deferred until ram is flushed to disk, either because enough new documents or
+ * delete terms are buffered, or because close() or flush() is called. Using
+ * Java synchronization, care is taken to ensure that an interleaved sequence of
+ * inserts and deletes for the same document are properly serialized.
+ */
+
+public class NewIndexModifier extends IndexWriter {
+ // number of ram segments a delete term applies to
+ private class Num {
+ private int num;
+
+ Num(int num) {
+ this.num = num;
+ }
+
+ int getNum() {
+ return num;
+ }
+
+ void setNum(int num) {
+ this.num = num;
+ }
+ }
+
+ /**
+ * Default value is 10. Change using {@link #setMaxBufferedDeleteTerms(int)}.
+ */
+ public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = 10;
+ // the max number of delete terms that can be buffered before
+ // they must be flushed to disk
+ private int maxBufferedDeleteTerms = DEFAULT_MAX_BUFFERED_DELETE_TERMS;
+
+ // to buffer delete terms in ram before they are applied
+ // key is delete term, value is number of ram segments the term applies to
+ private HashMap bufferedDeleteTerms = new HashMap();
+ private int numBufferedDeleteTerms = 0;
+
+ /**
+ * @see IndexWriter#IndexWriter(String, Analyzer, boolean)
+ */
+ public NewIndexModifier(String path, Analyzer a, boolean create)
+ throws IOException {
+ super(path, a, create);
+ }
+
+ /**
+ * @see IndexWriter#IndexWriter(File, Analyzer, boolean)
+ */
+ public NewIndexModifier(File path, Analyzer a, boolean create)
+ throws IOException {
+ super(path, a, create);
+ }
+
+ /**
+ * @see IndexWriter#IndexWriter(Directory, Analyzer, boolean)
+ */
+ public NewIndexModifier(Directory d, Analyzer a, boolean create)
+ throws IOException {
+ super(d, a, create);
+ }
+
+ /**
+ * @see IndexWriter#IndexWriter(String, Analyzer)
+ */
+ public NewIndexModifier(String path, Analyzer a) throws IOException {
+ super(path, a);
+ }
+
+ /**
+ * @see IndexWriter#IndexWriter(File, Analyzer)
+ */
+ public NewIndexModifier(File path, Analyzer a) throws IOException {
+ super(path, a);
+ }
+
+ /**
+ * @see IndexWriter#IndexWriter(Directory, Analyzer)
+ */
+ public NewIndexModifier(Directory d, Analyzer a) throws IOException {
+ super(d, a);
+ }
+
+ /**
+ * Determines the minimal number of delete terms required before the buffered
+ * in-memory delete terms are applied and flushed. If there are documents
+ * buffered in memory at the time, they are merged and a new Segment is
+ * created. The delete terms are applied appropriately.
+ *
+ * The default value is 10.
+ * @throws IllegalArgumentException if maxBufferedDeleteTerms is less than 1
+ */
+ public void setMaxBufferedDeleteTerms(int maxBufferedDeleteTerms) {
+ if (maxBufferedDeleteTerms < 1)
+ throw new IllegalArgumentException("maxBufferedDeleteTerms must be at least 1");
+ this.maxBufferedDeleteTerms = maxBufferedDeleteTerms;
+ }
+
+ /**
+ * @see #setMaxBufferedDeleteTerms
+ */
+ public int getMaxBufferedDeleteTerms() {
+ return maxBufferedDeleteTerms;
+ }
+
+ // for test purposes
+ final synchronized int getBufferedDeleteTermsSize() {
+ return bufferedDeleteTerms.size();
+ }
+
+ // for test purposes
+ final synchronized int getNumBufferedDeleteTerms() {
+ return numBufferedDeleteTerms;
+ }
+
+ /**
+ * Updates a document by first deleting all documents containing
+ * <code>term</code> and then adding the new document.
+ */
+ public void updateDocument(Term term, Document doc) throws IOException {
+ updateDocument(term, doc, getAnalyzer());
+ }
+
+ /**
+ * Updates a document by first deleting all documents containing
+ * <code>term</code> and then adding the new document.
+ */
+ public void updateDocument(Term term, Document doc, Analyzer analyzer)
+ throws IOException {
+ SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
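+ // buffer the delete term and add the new segment under one lock, so
+ // the delete applies to earlier versions of the document but never to
+ // the replacement added here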
+ synchronized (this) {
+ bufferDeleteTerm(term);
+ ramSegmentInfos.addElement(newSegmentInfo);
+ maybeFlushRamSegments();
+ }
+ }
+
+ /**
+ * Deletes all documents containing <code>term</code>.
+ */
+ public synchronized void deleteDocuments(Term term) throws IOException {
+ bufferDeleteTerm(term);
+ maybeFlushRamSegments();
+ }
+
+ /**
+ * Deletes all documents containing any of the terms. All deletes are flushed
+ * at the same time.
+ */
+ public synchronized void deleteDocuments(Term[] terms) throws IOException {
+ for (int i = 0; i < terms.length; i++) {
+ bufferDeleteTerm(terms[i]);
+ }
+ maybeFlushRamSegments();
+ }
+
+ // buffer a term in bufferedDeleteTerms. bufferedDeleteTerms also records
+ // the current number of documents buffered in ram so that the delete term
+ // will be applied to those ram segments as well as the disk segments
+ private void bufferDeleteTerm(Term term) {
+ Num num = (Num)bufferedDeleteTerms.get(term);
+ if (num == null) {
+ bufferedDeleteTerms.put(term, new Num(getRAMSegmentCount()));
+ } else {
+ num.setNum(getRAMSegmentCount());
+ }
+ numBufferedDeleteTerms++;
+ }
+
+ // a flush is triggered if enough new documents are buffered or
+ // if enough delete terms are buffered
+ protected boolean timeToFlushRam() {
+ return super.timeToFlushRam()
+ || numBufferedDeleteTerms >= maxBufferedDeleteTerms;
+ }
+
+ protected boolean anythingToFlushRam() {
+ return super.anythingToFlushRam() || bufferedDeleteTerms.size() > 0;
+ }
+
+ protected boolean onlyRamDocsToFlush() {
+ return super.onlyRamDocsToFlush() && bufferedDeleteTerms.size() == 0;
+ }
+
+ protected void doAfterFlushRamSegments(boolean flushedRamSegments)
+ throws IOException {
+ if (bufferedDeleteTerms.size() > 0) {
+ if (getInfoStream() != null)
+ getInfoStream().println(
+ "flush " + numBufferedDeleteTerms + " buffered terms on "
+ + segmentInfos.size() + " segments.");
+
+ if (flushedRamSegments) {
+ IndexReader reader = null;
+ try {
+ reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1));
+ reader.setDeleter(getDeleter());
+
+ // apply delete terms to the segment just flushed from ram
+ // apply appropriately so that a delete term is only applied to
+ // the documents buffered before it, not those buffered after it
+ applyDeletesSelectively(bufferedDeleteTerms, reader);
+ } finally {
+ if (reader != null)
+ reader.close();
+ }
+ }
+
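+ // the segment just flushed from ram (the last one, if any) has already
+ // been handled selectively above, so exclude it from this pass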
+ int infosEnd = segmentInfos.size();
+ if (flushedRamSegments) {
+ infosEnd--;
+ }
+
+ for (int i = 0; i < infosEnd; i++) {
+ IndexReader reader = null;
+ try {
+ reader = SegmentReader.get(segmentInfos.info(i));
+ reader.setDeleter(getDeleter());
+
+ // apply delete terms to disk segments
+ // except the one just flushed from ram
+ applyDeletes(bufferedDeleteTerms, reader);
+ } finally {
+ if (reader != null)
+ reader.close();
+ }
+ }
+
+ // clean up bufferedDeleteTerms
+ bufferedDeleteTerms.clear();
+ numBufferedDeleteTerms = 0;
+ }
+ }
+
+ // apply buffered delete terms to the segment just flushed from ram
+ // apply appropriately so that a delete term is only applied to
+ // the documents buffered before it, not those buffered after it
+ private final void applyDeletesSelectively(HashMap deleteTerms,
+ IndexReader reader) throws IOException {
+ Iterator iter = deleteTerms.entrySet().iterator();
+ while (iter.hasNext()) {
+ Entry entry = (Entry)iter.next();
+ Term term = (Term)entry.getKey();
+
+ TermDocs docs = reader.termDocs(term);
+ if (docs != null) {
+ int num = ((Num)entry.getValue()).getNum();
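+ // in this freshly flushed segment, doc ids follow insertion order and
+ // termDocs returns them in increasing order; docs with id >= num were
+ // buffered after the delete term, so stop at the first one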
+ try {
+ while (docs.next()) {
+ int doc = docs.doc();
+ if (doc >= num) {
+ break;
+ }
+ reader.deleteDocument(doc);
+ }
+ } finally {
+ docs.close();
+ }
+ }
+ }
+ }
+
+ // apply buffered delete terms to disk segments
+ // except the one just flushed from ram
+ private final void applyDeletes(HashMap deleteTerms, IndexReader reader)
+ throws IOException {
+ Iterator iter = deleteTerms.entrySet().iterator();
+ while (iter.hasNext()) {
+ Entry entry = (Entry)iter.next();
+ Term term = (Term)entry.getKey();
+ reader.deleteDocuments(term);
+ }
+ }
+}
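
For orientation, here is a minimal usage sketch of the class above. It is
illustrative only; the directory, analyzer, field names, and values are
arbitrary choices for the example, not part of this commit:

    Directory dir = new RAMDirectory();
    NewIndexModifier modifier =
        new NewIndexModifier(dir, new WhitespaceAnalyzer(), true);
    modifier.setMaxBufferedDeleteTerms(10);

    // add a document keyed by its "id" field
    Document doc = new Document();
    doc.add(new Field("id", "42", Field.Store.YES, Field.Index.UN_TOKENIZED));
    doc.add(new Field("contents", "first version", Field.Store.NO,
        Field.Index.TOKENIZED));
    modifier.addDocument(doc);

    // atomically replace every document whose "id" is "42";
    // the buffered delete cannot touch the replacement document
    Document newDoc = new Document();
    newDoc.add(new Field("id", "42", Field.Store.YES, Field.Index.UN_TOKENIZED));
    newDoc.add(new Field("contents", "second version", Field.Store.NO,
        Field.Index.TOKENIZED));
    modifier.updateDocument(new Term("id", "42"), newDoc);

    // buffered deletes are applied when enough terms/docs accumulate,
    // or explicitly on flush()/close()
    modifier.close();
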
Added: lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java?view=auto&rev=502191
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java Thu Feb 1 02:55:12 2007
@@ -0,0 +1,446 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestNewIndexModifierDelete extends TestCase {
+
+ // test the simple case
+ public void testSimpleCase() throws IOException {
+ String[] keywords = { "1", "2" };
+ String[] unindexed = { "Netherlands", "Italy" };
+ String[] unstored = { "Amsterdam has lots of bridges",
+ "Venice has lots of canals" };
+ String[] text = { "Amsterdam", "Venice" };
+
+ Directory dir = new RAMDirectory();
+ NewIndexModifier modifier = new NewIndexModifier(dir,
+ new WhitespaceAnalyzer(), true);
+ modifier.setUseCompoundFile(true);
+ modifier.setMaxBufferedDeleteTerms(1);
+
+ for (int i = 0; i < keywords.length; i++) {
+ Document doc = new Document();
+ doc.add(new Field("id", keywords[i], Field.Store.YES,
+ Field.Index.UN_TOKENIZED));
+ doc.add(new Field("country", unindexed[i], Field.Store.YES,
+ Field.Index.NO));
+ doc.add(new Field("contents", unstored[i], Field.Store.NO,
+ Field.Index.TOKENIZED));
+ doc.add(new Field("city", text[i], Field.Store.YES,
+ Field.Index.TOKENIZED));
+ modifier.addDocument(doc);
+ }
+ modifier.optimize();
+
+ Term term = new Term("city", "Amsterdam");
+ int hitCount = getHitCount(dir, term);
+ assertEquals(1, hitCount);
+ modifier.deleteDocuments(term);
+ hitCount = getHitCount(dir, term);
+ assertEquals(0, hitCount);
+
+ modifier.close();
+ }
+
+ // test when delete terms only apply to disk segments
+ public void testNonRAMDelete() throws IOException {
+ Directory dir = new RAMDirectory();
+ NewIndexModifier modifier = new NewIndexModifier(dir,
+ new WhitespaceAnalyzer(), true);
+ modifier.setMaxBufferedDocs(2);
+ modifier.setMaxBufferedDeleteTerms(2);
+
+ int id = 0;
+ int value = 100;
+
+ for (int i = 0; i < 7; i++) {
+ addDoc(modifier, ++id, value);
+ }
+ modifier.flush();
+
+ assertEquals(0, modifier.getRAMSegmentCount());
+ assertTrue(0 < modifier.getSegmentCount());
+
+ IndexReader reader = IndexReader.open(dir);
+ assertEquals(7, reader.numDocs());
+ reader.close();
+
+ modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+ modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+
+ reader = IndexReader.open(dir);
+ assertEquals(0, reader.numDocs());
+ reader.close();
+
+ modifier.close();
+ }
+
+ // test when delete terms only apply to ram segments
+ public void testRAMDeletes() throws IOException {
+ Directory dir = new RAMDirectory();
+ NewIndexModifier modifier = new NewIndexModifier(dir,
+ new WhitespaceAnalyzer(), true);
+ modifier.setMaxBufferedDocs(4);
+ modifier.setMaxBufferedDeleteTerms(4);
+
+ int id = 0;
+ int value = 100;
+
+ addDoc(modifier, ++id, value);
+ modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+ addDoc(modifier, ++id, value);
+ modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+
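+ // the same term was buffered twice: the running count is 2, but the
+ // buffer map holds a single distinct entry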
+ assertEquals(2, modifier.getNumBufferedDeleteTerms());
+ assertEquals(1, modifier.getBufferedDeleteTermsSize());
+
+ addDoc(modifier, ++id, value);
+ assertEquals(0, modifier.getSegmentCount());
+ modifier.flush();
+
+ IndexReader reader = IndexReader.open(dir);
+ assertEquals(1, reader.numDocs());
+
+ int hitCount = getHitCount(dir, new Term("id", String.valueOf(id)));
+ assertEquals(1, hitCount);
+ reader.close();
+
+ modifier.close();
+ }
+
+ // test when delete terms apply to both disk and ram segments
+ public void testBothDeletes() throws IOException {
+ Directory dir = new RAMDirectory();
+ NewIndexModifier modifier = new NewIndexModifier(dir,
+ new WhitespaceAnalyzer(), true);
+ modifier.setMaxBufferedDocs(100);
+ modifier.setMaxBufferedDeleteTerms(100);
+
+ int id = 0;
+ int value = 100;
+
+ for (int i = 0; i < 5; i++) {
+ addDoc(modifier, ++id, value);
+ }
+
+ value = 200;
+ for (int i = 0; i < 5; i++) {
+ addDoc(modifier, ++id, value);
+ }
+ modifier.flush();
+
+ for (int i = 0; i < 5; i++) {
+ addDoc(modifier, ++id, value);
+ }
+ modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+ modifier.flush();
+
+ IndexReader reader = IndexReader.open(dir);
+ assertEquals(5, reader.numDocs());
+ reader.close();
+
+ modifier.close();
+ }
+
+ // test that batched delete terms are flushed together
+ public void testBatchDeletes() throws IOException {
+ Directory dir = new RAMDirectory();
+ NewIndexModifier modifier = new NewIndexModifier(dir,
+ new WhitespaceAnalyzer(), true);
+ modifier.setMaxBufferedDocs(2);
+ modifier.setMaxBufferedDeleteTerms(2);
+
+ int id = 0;
+ int value = 100;
+
+ for (int i = 0; i < 7; i++) {
+ addDoc(modifier, ++id, value);
+ }
+ modifier.flush();
+
+ IndexReader reader = IndexReader.open(dir);
+ assertEquals(7, reader.numDocs());
+ reader.close();
+
+ id = 0;
+ modifier.deleteDocuments(new Term("id", String.valueOf(++id)));
+ modifier.deleteDocuments(new Term("id", String.valueOf(++id)));
+
+ reader = IndexReader.open(dir);
+ assertEquals(5, reader.numDocs());
+ reader.close();
+
+ Term[] terms = new Term[3];
+ for (int i = 0; i < terms.length; i++) {
+ terms[i] = new Term("id", String.valueOf(++id));
+ }
+ modifier.deleteDocuments(terms);
+
+ reader = IndexReader.open(dir);
+ assertEquals(2, reader.numDocs());
+ reader.close();
+
+ modifier.close();
+ }
+
+ private void addDoc(NewIndexModifier modifier, int id, int value)
+ throws IOException {
+ Document doc = new Document();
+ doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
+ doc.add(new Field("id", String.valueOf(id), Field.Store.YES,
+ Field.Index.UN_TOKENIZED));
+ doc.add(new Field("value", String.valueOf(value), Field.Store.NO,
+ Field.Index.UN_TOKENIZED));
+ modifier.addDocument(doc);
+ }
+
+ private int getHitCount(Directory dir, Term term) throws IOException {
+ IndexSearcher searcher = new IndexSearcher(dir);
+ int hitCount = searcher.search(new TermQuery(term)).length();
+ searcher.close();
+ return hitCount;
+ }
+
+ public void testDeletesOnDiskFull() throws IOException {
+ testOperationsOnDiskFull(false);
+ }
+
+ public void testUpdatesOnDiskFull() throws IOException {
+ testOperationsOnDiskFull(true);
+ }
+
+ /**
+ * Make sure if modifier tries to commit but hits disk full that modifier
+ * remains consistent and usable. Similar to TestIndexReader.testDiskFull().
+ */
+ private void testOperationsOnDiskFull(boolean updates) throws IOException {
+
+ boolean debug = false;
+ Term searchTerm = new Term("content", "aaa");
+ int START_COUNT = 157;
+ int END_COUNT = 144;
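+ // 157 docs to start with; the delete pass below removes the 13 docs
+ // with ids 12, 24, ..., 156, leaving 144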
+
+ // First build up a starting index:
+ RAMDirectory startDir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(startDir, new WhitespaceAnalyzer(),
+ true);
+ for (int i = 0; i < 157; i++) {
+ Document d = new Document();
+ d.add(new Field("id", Integer.toString(i), Field.Store.YES,
+ Field.Index.UN_TOKENIZED));
+ d.add(new Field("content", "aaa " + i, Field.Store.NO,
+ Field.Index.TOKENIZED));
+ writer.addDocument(d);
+ }
+ writer.close();
+
+ long diskUsage = startDir.sizeInBytes();
+ long diskFree = diskUsage + 10;
+
+ IOException err = null;
+
+ boolean done = false;
+
+ // Iterate w/ ever increasing free disk space:
+ while (!done) {
+ MockRAMDirectory dir = new MockRAMDirectory(startDir);
+ NewIndexModifier modifier = new NewIndexModifier(dir,
+ new WhitespaceAnalyzer(), false);
+
+ modifier.setMaxBufferedDocs(1000); // use flush or close
+ modifier.setMaxBufferedDeleteTerms(1000); // use flush or close
+
+ // For each disk size, first try to commit against
+ // dir that will hit random IOExceptions & disk
+ // full; after, give it infinite disk space & turn
+ // off random IOExceptions & retry w/ same reader:
+ boolean success = false;
+
+ for (int x = 0; x < 2; x++) {
+
+ double rate = 0.1;
+ double diskRatio = ((double)diskFree) / diskUsage;
+ long thisDiskFree;
+ String testName;
+
+ if (0 == x) {
+ thisDiskFree = diskFree;
+ if (diskRatio >= 2.0) {
+ rate /= 2;
+ }
+ if (diskRatio >= 4.0) {
+ rate /= 2;
+ }
+ if (diskRatio >= 6.0) {
+ rate = 0.0;
+ }
+ if (debug) {
+ System.out.println("\ncycle: " + diskFree + " bytes");
+ }
+ testName = "disk full during reader.close() @ " + thisDiskFree
+ + " bytes";
+ } else {
+ thisDiskFree = 0;
+ rate = 0.0;
+ if (debug) {
+ System.out.println("\ncycle: same writer: unlimited disk space");
+ }
+ testName = "reader re-use after disk full";
+ }
+
+ dir.setMaxSizeInBytes(thisDiskFree);
+ dir.setRandomIOExceptionRate(rate, diskFree);
+
+ try {
+ if (0 == x) {
+ int docId = 12;
+ for (int i = 0; i < 13; i++) {
+ if (updates) {
+ Document d = new Document();
+ d.add(new Field("id", Integer.toString(i), Field.Store.YES,
+ Field.Index.UN_TOKENIZED));
+ d.add(new Field("content", "bbb " + i, Field.Store.NO,
+ Field.Index.TOKENIZED));
+ modifier.updateDocument(
+ new Term("id", Integer.toString(docId)), d);
+ } else { // deletes
+ modifier.deleteDocuments(new Term("id", Integer.toString(docId)));
+ // modifier.setNorm(docId, "contents", (float)2.0);
+ }
+ docId += 12;
+ }
+ }
+ modifier.close();
+ success = true;
+ if (0 == x) {
+ done = true;
+ }
+ }
+ catch (IOException e) {
+ if (debug) {
+ System.out.println(" hit IOException: " + e);
+ }
+ err = e;
+ if (1 == x) {
+ e.printStackTrace();
+ fail(testName + " hit IOException after disk space was freed up");
+ }
+ }
+
+ // Whether we succeeded or failed, check that all
+ // un-referenced files were in fact deleted (ie,
+ // we did not create garbage). Just create a
+ // new IndexFileDeleter, have it delete
+ // unreferenced files, then verify that in fact
+ // no files were deleted:
+ String[] startFiles = dir.list();
+ SegmentInfos infos = new SegmentInfos();
+ infos.read(dir);
+ IndexFileDeleter d = new IndexFileDeleter(infos, dir);
+ d.findDeletableFiles();
+ d.deleteFiles();
+ String[] endFiles = dir.list();
+
+ Arrays.sort(startFiles);
+ Arrays.sort(endFiles);
+
+ // for (int i = 0; i < startFiles.length; i++) {
+ //   System.out.println("  startFiles: " + i + ": " + startFiles[i]);
+ // }