Subject: svn commit: r671404 [2/10] - /incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/
Date: Wed, 25 Jun 2008 02:52:24 -0000
To: lucene-net-commits@incubator.apache.org
From: aroush@apache.org
Message-Id: <20080625025226.E90E62388A79@eris.apache.org>

Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/DocumentsWriter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/DocumentsWriter.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/DocumentsWriter.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/DocumentsWriter.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,3941 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +using System; + +using Document = Lucene.Net.Documents.Document; +using Fieldable = Lucene.Net.Documents.Fieldable; +using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException; +using Directory = Lucene.Net.Store.Directory; +using IndexInput = Lucene.Net.Store.IndexInput; +using IndexOutput = Lucene.Net.Store.IndexOutput; +using RAMOutputStream = Lucene.Net.Store.RAMOutputStream; +using Analyzer = Lucene.Net.Analysis.Analyzer; +using Token = Lucene.Net.Analysis.Token; +using TokenStream = Lucene.Net.Analysis.TokenStream; +using Similarity = Lucene.Net.Search.Similarity; + +namespace Lucene.Net.Index +{ + + /// This class accepts multiple added documents and directly + /// writes a single segment file. It does this more + /// efficiently than creating a single segment per document + /// (with DocumentWriter) and doing standard merges on those + /// segments. + /// + /// When a document is added, its stored fields (if any) and + /// term vectors (if any) are immediately written to the + /// Directory (ie these do not consume RAM). The freq/prox + /// postings are accumulated into a Postings hash table keyed + /// by term. Each entry in this hash table holds a separate + /// byte stream (allocated as incrementally growing slices + /// into large shared byte[] arrays) for freq and prox, that + /// contains the postings data for multiple documents. If + /// vectors are enabled, each unique term for each document + /// also allocates a PostingVector instance to similarly + /// track the offsets & positions byte stream. + /// + /// Once the Postings hash is full (ie is consuming the + /// allowed RAM) or the number of added docs is large enough + /// (in the case we are flushing by doc count instead of RAM + /// usage), we create a real segment and flush it to disk and + /// reset the Postings hash. + /// + /// In adding a document we first organize all of its fields + /// by field name. We then process field by field, and + /// record the Posting hash per-field. After each field we + /// flush its term vectors. When it's time to flush the full + /// segment we first sort the fields by name, and then go + /// field by field and sorts its postings. + /// + /// + /// Threads: + /// + /// Multiple threads are allowed into addDocument at once. + /// There is an initial synchronized call to getThreadState + /// which allocates a ThreadState for this thread. The same + /// thread will get the same ThreadState over time (thread + /// affinity) so that if there are consistent patterns (for + /// example each thread is indexing a different content + /// source) then we make better use of RAM. Then + /// processDocument is called on that ThreadState without + /// synchronization (most of the "heavy lifting" is in this + /// call). Finally the synchronized "finishDocument" is + /// called to flush changes to the directory. + /// + /// Each ThreadState instance has its own Posting hash. Once + /// we're using too much RAM, we flush all Posting hashes to + /// a segment by merging the docIDs in the posting lists for + /// the same term across multiple thread states (see + /// writeSegment and appendPostings). + /// + /// When flush is called by IndexWriter, or, we flush + /// internally when autoCommit=false, we forcefully idle all + /// threads and flush only once they are all idle. This + /// means you can call flush with a given thread even while + /// other threads are actively adding/deleting documents. 
+ /// + /// + /// Exceptions: + /// + /// Because this class directly updates in-memory posting + /// lists, and flushes stored fields and term vectors + /// directly to files in the directory, there are certain + /// limited times when an exception can corrupt this state. + /// For example, a disk full while flushing stored fields + /// leaves this file in a corrupt state. Or, an OOM + /// exception while appending to the in-memory posting lists + /// can corrupt that posting list. We call such exceptions + /// "aborting exceptions". In these cases we must call + /// abort() to discard all docs added since the last flush. + /// + /// All other exceptions ("non-aborting exceptions") can + /// still partially update the index structures. These + /// updates are consistent, but, they represent only a part + /// of the document seen up until the exception was hit. + /// When this happens, we immediately mark the document as + /// deleted so that the document is always atomically ("all + /// or none") added to the index. + /// + + public sealed class DocumentsWriter + { + private void InitBlock() + { + threadStates = new ThreadState[0]; + waitingThreadStates = new ThreadState[MAX_THREAD_STATE]; + maxBufferedDeleteTerms = IndexWriter.DEFAULT_MAX_BUFFERED_DELETE_TERMS; + ramBufferSize = (long) (IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB * 1024 * 1024); // {{Aroush-2.3.1}} should 'ramBufferSize' + maxBufferedDocs = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; + norms = new BufferedNorms[0]; + } + + private IndexWriter writer; + private Directory directory; + + private FieldInfos fieldInfos = new FieldInfos(); // All fields we've seen + private IndexOutput tvx, tvf, tvd; // To write term vectors + private FieldsWriter fieldsWriter; // To write stored fields + + private System.String segment; // Current segment we are working on + private System.String docStoreSegment; // Current doc-store segment we are writing + private int docStoreOffset; // Current starting doc-store offset of current segment + + private int nextDocID; // Next docID to be added + private int numDocsInRAM; // # docs buffered in RAM + private int numDocsInStore; // # docs written to doc stores + private int nextWriteDocID; // Next docID to be written + + // Max # ThreadState instances; if there are more threads + // than this they share ThreadStates + private const int MAX_THREAD_STATE = 5; + private ThreadState[] threadStates; + private System.Collections.Hashtable threadBindings = new System.Collections.Hashtable(); + private int numWaiting; + private ThreadState[] waitingThreadStates; + private int pauseThreads; // Non-zero when we need all threads to + // pause (eg to flush) + private bool flushPending; // True when a thread has decided to flush + private bool bufferIsFull; // True when it's time to write segment + private int abortCount; // Non-zero while abort is pending or running + + private System.IO.TextWriter infoStream; + + // This Hashmap buffers delete terms in ram before they + // are applied. The key is delete term; the value is + // number of buffered documents the term applies to. + private System.Collections.Hashtable bufferedDeleteTerms = new System.Collections.Hashtable(); + private int numBufferedDeleteTerms = 0; + + // Currently used only for deleting a doc on hitting an non-aborting exception + private System.Collections.IList bufferedDeleteDocIDs = new System.Collections.ArrayList(); + + // The max number of delete terms that can be buffered before + // they must be flushed to disk. 
+ private int maxBufferedDeleteTerms; + + // How much RAM we can use before flushing. This is 0 if + // we are flushing by doc count instead. + private long ramBufferSize; + + // Flush @ this number of docs. If rarmBufferSize is + // non-zero we will flush by RAM usage instead. + private int maxBufferedDocs; + + private bool closed; + + // Coarse estimates used to measure RAM usage of buffered deletes + private static int OBJECT_HEADER_BYTES = 8; + private static int OBJECT_POINTER_BYTES = 4; // TODO: should be 8 on 64-bit platform + private static int BYTES_PER_CHAR = 2; + private static int BYTES_PER_INT = 4; + + private BufferedNorms[] norms; // Holds norms until we flush + + internal DocumentsWriter(Directory directory, IndexWriter writer) + { + InitBlock(); + this.directory = directory; + this.writer = writer; + + postingsFreeList = new Posting[0]; + } + + /// If non-null, various details of indexing are printed + /// here. + /// + internal void SetInfoStream(System.IO.TextWriter infoStream) + { + this.infoStream = infoStream; + } + + /// Set how much RAM we can use before flushing. + internal void SetRAMBufferSizeMB(double mb) + { + if (mb == IndexWriter.DISABLE_AUTO_FLUSH) + { + ramBufferSize = IndexWriter.DISABLE_AUTO_FLUSH; + } + else + { + ramBufferSize = (long) (mb * 1024 * 1024); + } + } + + internal double GetRAMBufferSizeMB() + { + if (ramBufferSize == IndexWriter.DISABLE_AUTO_FLUSH) + { + return ramBufferSize; + } + else + { + return ramBufferSize / 1024.0 / 1024.0; + } + } + + /// Set max buffered docs, which means we will flush by + /// doc count instead of by RAM usage. + /// + internal void SetMaxBufferedDocs(int count) + { + maxBufferedDocs = count; + } + + internal int GetMaxBufferedDocs() + { + return maxBufferedDocs; + } + + /// Get current segment name we are writing. + internal System.String GetSegment() + { + return segment; + } + + /// Returns how many docs are currently buffered in RAM. + internal int GetNumDocsInRAM() + { + return numDocsInRAM; + } + + /// Returns the current doc store segment we are writing + /// to. This will be the same as segment when autoCommit + /// * is true. + /// + internal System.String GetDocStoreSegment() + { + return docStoreSegment; + } + + /// Returns the doc offset into the shared doc store for + /// the current buffered docs. + /// + internal int GetDocStoreOffset() + { + return docStoreOffset; + } + + /// Closes the current open doc stores an returns the doc + /// store segment name. This returns null if there are * + /// no buffered documents. 
+ /// + internal System.String CloseDocStore() + { + + System.Diagnostics.Debug.Assert(AllThreadsIdle()); + + System.Collections.IList flushedFiles = Files(); + + if (infoStream != null) + infoStream.WriteLine("\ncloseDocStore: " + flushedFiles.Count + " files to flush to segment " + docStoreSegment + " numDocs=" + numDocsInStore); + + if (flushedFiles.Count > 0) + { + files = null; + + if (tvx != null) + { + // At least one doc in this run had term vectors enabled + System.Diagnostics.Debug.Assert(docStoreSegment != null); + tvx.Close(); + tvf.Close(); + tvd.Close(); + tvx = null; + } + + if (fieldsWriter != null) + { + System.Diagnostics.Debug.Assert(docStoreSegment != null); + fieldsWriter.Close(); + fieldsWriter = null; + } + + System.String s = docStoreSegment; + docStoreSegment = null; + docStoreOffset = 0; + numDocsInStore = 0; + return s; + } + else + { + return null; + } + } + + private System.Collections.IList files = null; // Cached list of files we've created + private System.Collections.IList abortedFiles = null; // List of files that were written before last abort() + + internal System.Collections.IList AbortedFiles() + { + return abortedFiles; + } + + /* Returns list of files in use by this instance, + * including any flushed segments. */ + internal System.Collections.IList Files() + { + lock (this) + { + + if (files != null) + return files; + + files = new System.Collections.ArrayList(); + + // Stored fields: + if (fieldsWriter != null) + { + System.Diagnostics.Debug.Assert(docStoreSegment != null); + files.Add(docStoreSegment + "." + IndexFileNames.FIELDS_EXTENSION); + files.Add(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); + } + + // Vectors: + if (tvx != null) + { + System.Diagnostics.Debug.Assert(docStoreSegment != null); + files.Add(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); + files.Add(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); + files.Add(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); + } + + return files; + } + } + + internal void SetAborting() + { + lock (this) + { + abortCount++; + } + } + + /// Called if we hit an exception when adding docs, + /// flushing, etc. This resets our state, discarding any + /// docs added since last flush. If ae is non-null, it + /// contains the root cause exception (which we re-throw + /// after we are done aborting). 
+ /// + internal void Abort(AbortException ae) + { + lock (this) + { + + // Anywhere that throws an AbortException must first + // mark aborting to make sure while the exception is + // unwinding the un-synchronized stack, no thread grabs + // the corrupt ThreadState that hit the aborting + // exception: + System.Diagnostics.Debug.Assert(ae == null || abortCount > 0); + + try + { + + if (infoStream != null) + infoStream.WriteLine("docWriter: now abort"); + + // Forcefully remove waiting ThreadStates from line + for (int i = 0; i < numWaiting; i++) + waitingThreadStates[i].isIdle = true; + numWaiting = 0; + + // Wait for all other threads to finish with DocumentsWriter: + PauseAllThreads(); + + System.Diagnostics.Debug.Assert(0 == numWaiting); + + try + { + + bufferedDeleteTerms.Clear(); + bufferedDeleteDocIDs.Clear(); + numBufferedDeleteTerms = 0; + + try + { + abortedFiles = Files(); + } + catch (System.Exception) + { + abortedFiles = null; + } + + docStoreSegment = null; + numDocsInStore = 0; + docStoreOffset = 0; + files = null; + + // Clear vectors & fields from ThreadStates + for (int i = 0; i < threadStates.Length; i++) + { + ThreadState state = threadStates[i]; + state.tvfLocal.Reset(); + state.fdtLocal.Reset(); + if (state.localFieldsWriter != null) + { + try + { + state.localFieldsWriter.Close(); + } + catch (System.Exception) + { + } + state.localFieldsWriter = null; + } + } + + // Reset vectors writer + if (tvx != null) + { + try + { + tvx.Close(); + } + catch (System.Exception) + { + } + tvx = null; + } + if (tvd != null) + { + try + { + tvd.Close(); + } + catch (System.Exception) + { + } + tvd = null; + } + if (tvf != null) + { + try + { + tvf.Close(); + } + catch (System.Exception) + { + } + tvf = null; + } + + // Reset fields writer + if (fieldsWriter != null) + { + try + { + fieldsWriter.Close(); + } + catch (System.Exception) + { + } + fieldsWriter = null; + } + + // Discard pending norms: + int numField = fieldInfos.Size(); + for (int i = 0; i < numField; i++) + { + FieldInfo fi = fieldInfos.FieldInfo(i); + if (fi.isIndexed && !fi.omitNorms) + { + BufferedNorms n = norms[i]; + if (n != null) + try + { + n.Reset(); + } + catch (System.Exception) + { + } + } + } + + // Reset all postings data + ResetPostingsData(); + } + finally + { + ResumeAllThreads(); + } + + // If we have a root cause exception, re-throw it now: + if (ae != null) + { + System.Exception t = ae.InnerException; + if (t is System.IO.IOException) + throw (System.IO.IOException) t; + else if (t is System.SystemException) + throw (System.SystemException) t; + else if (t is System.ApplicationException) + throw (System.ApplicationException) t; + else + // Should not get here + System.Diagnostics.Debug.Assert(false, "unknown exception: " + t); + } + } + finally + { + if (ae != null) + abortCount--; + System.Threading.Monitor.PulseAll(this); + } + } + } + + /// Reset after a flush + private void ResetPostingsData() + { + // All ThreadStates should be idle when we are called + System.Diagnostics.Debug.Assert(AllThreadsIdle()); + threadBindings.Clear(); + segment = null; + numDocsInRAM = 0; + nextDocID = 0; + nextWriteDocID = 0; + files = null; + BalanceRAM(); + bufferIsFull = false; + flushPending = false; + for (int i = 0; i < threadStates.Length; i++) + { + threadStates[i].numThreads = 0; + threadStates[i].ResetPostings(); + } + numBytesUsed = 0; + } + + // Returns true if an abort is in progress + internal bool PauseAllThreads() + { + lock (this) + { + pauseThreads++; + while (!AllThreadsIdle()) + { + try + { + 
System.Threading.Monitor.Wait(this); + } + catch (System.Threading.ThreadInterruptedException) + { + SupportClass.ThreadClass.Current().Interrupt(); + } + } + return abortCount > 0; + } + } + + internal void ResumeAllThreads() + { + lock (this) + { + pauseThreads--; + System.Diagnostics.Debug.Assert(pauseThreads >= 0); + if (0 == pauseThreads) + System.Threading.Monitor.PulseAll(this); + } + } + + private bool AllThreadsIdle() + { + lock (this) + { + for (int i = 0; i < threadStates.Length; i++) + if (!threadStates[i].isIdle) + return false; + return true; + } + } + + private bool hasNorms; // Whether any norms were seen since last flush + + internal System.Collections.IList newFiles; + + /// Flush all pending docs to a new segment + internal int Flush(bool closeDocStore) + { + lock (this) + { + + System.Diagnostics.Debug.Assert(AllThreadsIdle()); + + if (segment == null) + // In case we are asked to flush an empty segment + segment = writer.NewSegmentName(); + + newFiles = new System.Collections.ArrayList(); + + docStoreOffset = numDocsInStore; + + int docCount; + + System.Diagnostics.Debug.Assert(numDocsInRAM > 0); + + if (infoStream != null) + infoStream.WriteLine("\nflush postings as segment " + segment + " numDocs=" + numDocsInRAM); + + bool success = false; + + try + { + System.Collections.IEnumerator e; + + if (closeDocStore) + { + System.Diagnostics.Debug.Assert(docStoreSegment != null); + System.Diagnostics.Debug.Assert(docStoreSegment.Equals(segment)); + e = Files().GetEnumerator(); + while (e.MoveNext()) + newFiles.Add(e.Current); + CloseDocStore(); + } + + fieldInfos.Write(directory, segment + ".fnm"); + + docCount = numDocsInRAM; + + e = WriteSegment().GetEnumerator(); + while (e.MoveNext()) + newFiles.Add(e.Current); + + success = true; + } + finally + { + if (!success) + Abort(null); + } + + return docCount; + } + } + + /// Build compound file for the segment we just flushed + internal void CreateCompoundFile(System.String segment) + { + CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION); + int size = newFiles.Count; + for (int i = 0; i < size; i++) + cfsWriter.AddFile((System.String) newFiles[i]); + + // Perform the merge + cfsWriter.Close(); + } + + /// Set flushPending if it is not already set and returns + /// whether it was set. This is used by IndexWriter to * + /// trigger a single flush even when multiple threads are + /// * trying to do so. + /// + internal bool SetFlushPending() + { + lock (this) + { + if (flushPending) + return false; + else + { + flushPending = true; + return true; + } + } + } + + internal void ClearFlushPending() + { + lock (this) + { + flushPending = false; + } + } + + /// Per-thread state. We keep a separate Posting hash and + /// other state for each thread and then merge postings * + /// hashes from all threads when writing the segment. 
+ /// + sealed internal class ThreadState + { + private void InitBlock(DocumentsWriter enclosingInstance) + { + this.enclosingInstance = enclosingInstance; + allFieldDataArray = new FieldData[10]; + postingsPool = new ByteBlockPool(enclosingInstance); + vectorsPool = new ByteBlockPool(enclosingInstance); + charPool = new CharBlockPool(enclosingInstance); + } + private DocumentsWriter enclosingInstance; + public DocumentsWriter Enclosing_Instance + { + get + { + return enclosingInstance; + } + + } + + internal Posting[] postingsFreeList; // Free Posting instances + internal int postingsFreeCount; + + internal RAMOutputStream tvfLocal = new RAMOutputStream(); // Term vectors for one doc + internal RAMOutputStream fdtLocal = new RAMOutputStream(); // Stored fields for one doc + internal FieldsWriter localFieldsWriter; // Fields for one doc + + internal long[] vectorFieldPointers; + internal int[] vectorFieldNumbers; + + internal bool isIdle = true; // Whether we are in use + internal int numThreads = 1; // Number of threads that use this instance + + internal int docID; // docID we are now working on + internal int numStoredFields; // How many stored fields in current doc + internal float docBoost; // Boost for current doc + + internal FieldData[] fieldDataArray; // Fields touched by current doc + internal int numFieldData; // How many fields in current doc + internal int numVectorFields; // How many vector fields in current doc + + internal FieldData[] allFieldDataArray; // All FieldData instances + internal int numAllFieldData; + internal FieldData[] fieldDataHash; // Hash FieldData instances by field name + internal int fieldDataHashMask; + internal System.String maxTermPrefix; // Non-null prefix of a too-large term if this + // doc has one + + internal bool doFlushAfter; + + public ThreadState(DocumentsWriter enclosingInstance) + { + InitBlock(enclosingInstance); + fieldDataArray = new FieldData[8]; + + fieldDataHash = new FieldData[16]; + fieldDataHashMask = 15; + + vectorFieldPointers = new long[10]; + vectorFieldNumbers = new int[10]; + postingsFreeList = new Posting[256]; + postingsFreeCount = 0; + } + + /// Clear the postings hash and return objects back to + /// shared pool + /// + public void ResetPostings() + { + fieldGen = 0; + maxPostingsVectors = 0; + doFlushAfter = false; + if (localFieldsWriter != null) + { + localFieldsWriter.Close(); + localFieldsWriter = null; + } + postingsPool.Reset(); + charPool.Reset(); + Enclosing_Instance.RecyclePostings(postingsFreeList, postingsFreeCount); + postingsFreeCount = 0; + for (int i = 0; i < numAllFieldData; i++) + { + FieldData fp = allFieldDataArray[i]; + fp.lastGen = - 1; + if (fp.numPostings > 0) + fp.ResetPostingArrays(); + } + } + + /// Move all per-document state that was accumulated in + /// the ThreadState into the "real" stores. + /// + public void WriteDocument() + { + + // If we hit an exception while appending to the + // stored fields or term vectors files, we have to + // abort all documents since we last flushed because + // it means those files are possibly inconsistent. 
+ try + { + + Enclosing_Instance.numDocsInStore++; + + // Append stored fields to the real FieldsWriter: + Enclosing_Instance.fieldsWriter.FlushDocument(numStoredFields, fdtLocal); + fdtLocal.Reset(); + + // Append term vectors to the real outputs: + if (Enclosing_Instance.tvx != null) + { + Enclosing_Instance.tvx.WriteLong(Enclosing_Instance.tvd.GetFilePointer()); + Enclosing_Instance.tvd.WriteVInt(numVectorFields); + if (numVectorFields > 0) + { + for (int i = 0; i < numVectorFields; i++) + Enclosing_Instance.tvd.WriteVInt(vectorFieldNumbers[i]); + System.Diagnostics.Debug.Assert(0 == vectorFieldPointers [0]); + Enclosing_Instance.tvd.WriteVLong(Enclosing_Instance.tvf.GetFilePointer()); + long lastPos = vectorFieldPointers[0]; + for (int i = 1; i < numVectorFields; i++) + { + long pos = vectorFieldPointers[i]; + Enclosing_Instance.tvd.WriteVLong(pos - lastPos); + lastPos = pos; + } + tvfLocal.WriteTo(Enclosing_Instance.tvf); + tvfLocal.Reset(); + } + } + + // Append norms for the fields we saw: + for (int i = 0; i < numFieldData; i++) + { + FieldData fp = fieldDataArray[i]; + if (fp.doNorms) + { + BufferedNorms bn = Enclosing_Instance.norms[fp.fieldInfo.number]; + System.Diagnostics.Debug.Assert(bn != null); + System.Diagnostics.Debug.Assert(bn.upto <= docID); + bn.Fill(docID); + float norm = fp.boost * Enclosing_Instance.writer.GetSimilarity().LengthNorm(fp.fieldInfo.name, fp.length); + bn.Add(norm); + } + } + } + catch (System.Exception t) + { + // Forcefully idle this threadstate -- its state will + // be reset by abort() + isIdle = true; + throw new AbortException(t, Enclosing_Instance); + } + + if (Enclosing_Instance.bufferIsFull && !Enclosing_Instance.flushPending) + { + Enclosing_Instance.flushPending = true; + doFlushAfter = true; + } + } + + internal int fieldGen; + + /// Initializes shared state for this new document + internal void Init(Document doc, int docID) + { + + System.Diagnostics.Debug.Assert(!isIdle); + + this.docID = docID; + docBoost = doc.GetBoost(); + numStoredFields = 0; + numFieldData = 0; + numVectorFields = 0; + maxTermPrefix = null; + + System.Diagnostics.Debug.Assert(0 == fdtLocal.Length()); + System.Diagnostics.Debug.Assert(0 == fdtLocal.GetFilePointer()); + System.Diagnostics.Debug.Assert(0 == tvfLocal.Length()); + System.Diagnostics.Debug.Assert(0 == tvfLocal.GetFilePointer()); + int thisFieldGen = fieldGen++; + + System.Collections.IList docFields = doc.GetFields(); + int numDocFields = docFields.Count; + bool docHasVectors = false; + + // Absorb any new fields first seen in this document. 
+ // Also absorb any changes to fields we had already + // seen before (eg suddenly turning on norms or + // vectors, etc.): + + for (int i = 0; i < numDocFields; i++) + { + Fieldable field = (Fieldable) docFields[i]; + + FieldInfo fi = Enclosing_Instance.fieldInfos.Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false); + if (fi.isIndexed && !fi.omitNorms) + { + // Maybe grow our buffered norms + if (Enclosing_Instance.norms.Length <= fi.number) + { + int newSize = (int) ((1 + fi.number) * 1.25); + BufferedNorms[] newNorms = new BufferedNorms[newSize]; + Array.Copy(Enclosing_Instance.norms, 0, newNorms, 0, Enclosing_Instance.norms.Length); + Enclosing_Instance.norms = newNorms; + } + + if (Enclosing_Instance.norms[fi.number] == null) + Enclosing_Instance.norms[fi.number] = new BufferedNorms(); + + Enclosing_Instance.hasNorms = true; + } + + // Make sure we have a FieldData allocated + int hashPos = fi.name.GetHashCode() & fieldDataHashMask; + FieldData fp = fieldDataHash[hashPos]; + while (fp != null && !fp.fieldInfo.name.Equals(fi.name)) + fp = fp.next; + + if (fp == null) + { + + fp = new FieldData(this, fi); + fp.next = fieldDataHash[hashPos]; + fieldDataHash[hashPos] = fp; + + if (numAllFieldData == allFieldDataArray.Length) + { + int newSize = (int) (allFieldDataArray.Length * 1.5); + int newHashSize = fieldDataHash.Length * 2; + + FieldData[] newArray = new FieldData[newSize]; + FieldData[] newHashArray = new FieldData[newHashSize]; + Array.Copy(allFieldDataArray, 0, newArray, 0, numAllFieldData); + + // Rehash + fieldDataHashMask = newSize - 1; + for (int j = 0; j < fieldDataHash.Length; j++) + { + FieldData fp0 = fieldDataHash[j]; + while (fp0 != null) + { + hashPos = fp0.fieldInfo.name.GetHashCode() & fieldDataHashMask; + FieldData nextFP0 = fp0.next; + fp0.next = newHashArray[hashPos]; + newHashArray[hashPos] = fp0; + fp0 = nextFP0; + } + } + + allFieldDataArray = newArray; + fieldDataHash = newHashArray; + } + allFieldDataArray[numAllFieldData++] = fp; + } + else + { + System.Diagnostics.Debug.Assert(fp.fieldInfo == fi); + } + + if (thisFieldGen != fp.lastGen) + { + + // First time we're seeing this field for this doc + fp.lastGen = thisFieldGen; + fp.fieldCount = 0; + fp.doVectors = fp.doVectorPositions = fp.doVectorOffsets = false; + fp.doNorms = fi.isIndexed && !fi.omitNorms; + + if (numFieldData == fieldDataArray.Length) + { + int newSize = fieldDataArray.Length * 2; + FieldData[] newArray = new FieldData[newSize]; + Array.Copy(fieldDataArray, 0, newArray, 0, numFieldData); + fieldDataArray = newArray; + } + fieldDataArray[numFieldData++] = fp; + } + + if (field.IsTermVectorStored()) + { + if (!fp.doVectors && numVectorFields++ == vectorFieldPointers.Length) + { + int newSize = (int) (numVectorFields * 1.5); + vectorFieldPointers = new long[newSize]; + vectorFieldNumbers = new int[newSize]; + } + fp.doVectors = true; + docHasVectors = true; + + fp.doVectorPositions |= field.IsStorePositionWithTermVector(); + fp.doVectorOffsets |= field.IsStoreOffsetWithTermVector(); + } + + if (fp.fieldCount == fp.docFields.Length) + { + Fieldable[] newArray = new Fieldable[fp.docFields.Length * 2]; + Array.Copy(fp.docFields, 0, newArray, 0, fp.docFields.Length); + fp.docFields = newArray; + } + + // Lazily allocate arrays for postings: + if (field.IsIndexed() && fp.postingsHash == null) + fp.InitPostingArrays(); + + fp.docFields[fp.fieldCount++] = field; + } + + // Maybe init 
the local & global fieldsWriter + if (localFieldsWriter == null) + { + if (Enclosing_Instance.fieldsWriter == null) + { + System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment == null); + System.Diagnostics.Debug.Assert(Enclosing_Instance.segment != null); + Enclosing_Instance.docStoreSegment = Enclosing_Instance.segment; + // If we hit an exception while init'ing the + // fieldsWriter, we must abort this segment + // because those files will be in an unknown + // state: + try + { + Enclosing_Instance.fieldsWriter = new FieldsWriter(Enclosing_Instance.directory, Enclosing_Instance.docStoreSegment, Enclosing_Instance.fieldInfos); + } + catch (System.Exception t) + { + throw new AbortException(t, Enclosing_Instance); + } + Enclosing_Instance.files = null; + } + localFieldsWriter = new FieldsWriter(null, fdtLocal, Enclosing_Instance.fieldInfos); + } + + // First time we see a doc that has field(s) with + // stored vectors, we init our tvx writer + if (docHasVectors) + { + if (Enclosing_Instance.tvx == null) + { + System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment != null); + // If we hit an exception while init'ing the term + // vector output files, we must abort this segment + // because those files will be in an unknown + // state: + try + { + Enclosing_Instance.tvx = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); + Enclosing_Instance.tvx.WriteInt(TermVectorsReader.FORMAT_VERSION); + Enclosing_Instance.tvd = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); + Enclosing_Instance.tvd.WriteInt(TermVectorsReader.FORMAT_VERSION); + Enclosing_Instance.tvf = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." 
+ IndexFileNames.VECTORS_FIELDS_EXTENSION); + Enclosing_Instance.tvf.WriteInt(TermVectorsReader.FORMAT_VERSION); + + // We must "catch up" for all docs before us + // that had no vectors: + for (int i = 0; i < Enclosing_Instance.numDocsInStore; i++) + { + Enclosing_Instance.tvx.WriteLong(Enclosing_Instance.tvd.GetFilePointer()); + Enclosing_Instance.tvd.WriteVInt(0); + } + } + catch (System.Exception t) + { + throw new AbortException(t, Enclosing_Instance); + } + Enclosing_Instance.files = null; + } + + numVectorFields = 0; + } + } + + /// Do in-place sort of Posting array + internal void DoPostingSort(Posting[] postings, int numPosting) + { + QuickSort(postings, 0, numPosting - 1); + } + + internal void QuickSort(Posting[] postings, int lo, int hi) + { + if (lo >= hi) + return ; + + int mid = SupportClass.Number.URShift((lo + hi), 1); + + if (ComparePostings(postings[lo], postings[mid]) > 0) + { + Posting tmp = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp; + } + + if (ComparePostings(postings[mid], postings[hi]) > 0) + { + Posting tmp = postings[mid]; + postings[mid] = postings[hi]; + postings[hi] = tmp; + + if (ComparePostings(postings[lo], postings[mid]) > 0) + { + Posting tmp2 = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp2; + } + } + + int left = lo + 1; + int right = hi - 1; + + if (left >= right) + return ; + + Posting partition = postings[mid]; + + for (; ; ) + { + while (ComparePostings(postings[right], partition) > 0) + --right; + + while (left < right && ComparePostings(postings[left], partition) <= 0) + ++left; + + if (left < right) + { + Posting tmp = postings[left]; + postings[left] = postings[right]; + postings[right] = tmp; + --right; + } + else + { + break; + } + } + + QuickSort(postings, lo, left); + QuickSort(postings, left + 1, hi); + } + + /// Do in-place sort of PostingVector array + internal void DoVectorSort(PostingVector[] postings, int numPosting) + { + QuickSort(postings, 0, numPosting - 1); + } + + internal void QuickSort(PostingVector[] postings, int lo, int hi) + { + if (lo >= hi) + return ; + + int mid = SupportClass.Number.URShift((lo + hi), 1); + + if (ComparePostings(postings[lo].p, postings[mid].p) > 0) + { + PostingVector tmp = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp; + } + + if (ComparePostings(postings[mid].p, postings[hi].p) > 0) + { + PostingVector tmp = postings[mid]; + postings[mid] = postings[hi]; + postings[hi] = tmp; + + if (ComparePostings(postings[lo].p, postings[mid].p) > 0) + { + PostingVector tmp2 = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp2; + } + } + + int left = lo + 1; + int right = hi - 1; + + if (left >= right) + return ; + + PostingVector partition = postings[mid]; + + for (; ; ) + { + while (ComparePostings(postings[right].p, partition.p) > 0) + --right; + + while (left < right && ComparePostings(postings[left].p, partition.p) <= 0) + ++left; + + if (left < right) + { + PostingVector tmp = postings[left]; + postings[left] = postings[right]; + postings[right] = tmp; + --right; + } + else + { + break; + } + } + + QuickSort(postings, lo, left); + QuickSort(postings, left + 1, hi); + } + + /// If there are fields we've seen but did not see again + /// in the last run, then free them up. Also reduce + /// postings hash size. 
+ /// + internal void TrimFields() + { + + int upto = 0; + for (int i = 0; i < numAllFieldData; i++) + { + FieldData fp = allFieldDataArray[i]; + if (fp.lastGen == - 1) + { + // This field was not seen since the previous + // flush, so, free up its resources now + + // Unhash + int hashPos = fp.fieldInfo.name.GetHashCode() & fieldDataHashMask; + FieldData last = null; + FieldData fp0 = fieldDataHash[hashPos]; + while (fp0 != fp) + { + last = fp0; + fp0 = fp0.next; + } + System.Diagnostics.Debug.Assert(fp0 != null); + + if (last == null) + fieldDataHash[hashPos] = fp.next; + else + last.next = fp.next; + + if (Enclosing_Instance.infoStream != null) + Enclosing_Instance.infoStream.WriteLine(" remove field=" + fp.fieldInfo.name); + } + else + { + // Reset + fp.lastGen = - 1; + allFieldDataArray[upto++] = fp; + + if (fp.numPostings > 0 && ((float) fp.numPostings) / fp.postingsHashSize < 0.2) + { + int hashSize = fp.postingsHashSize; + + // Reduce hash so it's between 25-50% full + while (fp.numPostings < (hashSize >> 1) && hashSize >= 2) + hashSize >>= 1; + hashSize <<= 1; + + if (hashSize != fp.postingsHash.Length) + fp.RehashPostings(hashSize); + } + } + } + + // If we didn't see any norms for this field since + // last flush, free it + for (int i = 0; i < Enclosing_Instance.norms.Length; i++) + { + BufferedNorms n = Enclosing_Instance.norms[i]; + if (n != null && n.upto == 0) + Enclosing_Instance.norms[i] = null; + } + + numAllFieldData = upto; + + // Also pare back PostingsVectors if it's excessively + // large + if (maxPostingsVectors * 1.5 < postingsVectors.Length) + { + int newSize; + if (0 == maxPostingsVectors) + newSize = 1; + else + { + newSize = (int) (1.5 * maxPostingsVectors); + } + PostingVector[] newArray = new PostingVector[newSize]; + Array.Copy(postingsVectors, 0, newArray, 0, newSize); + postingsVectors = newArray; + } + } + + /// Tokenizes the fields of a document into Postings + internal void ProcessDocument(Analyzer analyzer) + { + + int numFields = numFieldData; + + System.Diagnostics.Debug.Assert(0 == fdtLocal.Length()); + + if (Enclosing_Instance.tvx != null) + // If we are writing vectors then we must visit + // fields in sorted order so they are written in + // sorted order. TODO: we actually only need to + // sort the subset of fields that have vectors + // enabled; we could save [small amount of] CPU + // here. + System.Array.Sort(fieldDataArray, 0, numFields - 0); + + // We process the document one field at a time + for (int i = 0; i < numFields; i++) + fieldDataArray[i].ProcessField(analyzer); + + if (maxTermPrefix != null && Enclosing_Instance.infoStream != null) + Enclosing_Instance.infoStream.WriteLine("WARNING: document contains at least one immense term (longer than the max length " + Lucene.Net.Index.DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + maxTermPrefix + "...'"); + + if (Enclosing_Instance.ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && Enclosing_Instance.numBytesUsed > 0.95 * Enclosing_Instance.ramBufferSize) + Enclosing_Instance.BalanceRAM(); + } + + internal ByteBlockPool postingsPool; + internal ByteBlockPool vectorsPool; + internal CharBlockPool charPool; + + // Current posting we are working on + internal Posting p; + internal PostingVector vector; + + // USE ONLY FOR DEBUGGING! 
+ /* + public String getPostingText() { + char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT]; + int upto = p.textStart & CHAR_BLOCK_MASK; + while(text[upto] != 0xffff) + upto++; + return new String(text, p.textStart, upto-(p.textStart & BYTE_BLOCK_MASK)); + } + */ + + /// Test whether the text for current Posting p equals + /// current tokenText. + /// + internal bool PostingEquals(char[] tokenText, int tokenTextLen) + { + + char[] text = charPool.buffers[p.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT]; + System.Diagnostics.Debug.Assert(text != null); + int pos = p.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK; + + int tokenPos = 0; + for (; tokenPos < tokenTextLen; pos++, tokenPos++) + if (tokenText[tokenPos] != text[pos]) + return false; + return 0xffff == text[pos]; + } + + /// Compares term text for two Posting instance and + /// returns -1 if p1 < p2; 1 if p1 > p2; else 0. + /// + internal int ComparePostings(Posting p1, Posting p2) + { + char[] text1 = charPool.buffers[p1.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT]; + int pos1 = p1.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK; + char[] text2 = charPool.buffers[p2.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT]; + int pos2 = p2.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK; + while (true) + { + char c1 = text1[pos1++]; + char c2 = text2[pos2++]; + if (c1 < c2) + if (0xffff == c2) + return 1; + else + return - 1; + else if (c2 < c1) + if (0xffff == c1) + return - 1; + else + return 1; + else if (0xffff == c1) + return 0; + } + } + + /// Write vInt into freq stream of current Posting + public void WriteFreqVInt(int i) + { + while ((i & ~ 0x7F) != 0) + { + WriteFreqByte((byte) ((i & 0x7f) | 0x80)); + i = SupportClass.Number.URShift(i, 7); + } + WriteFreqByte((byte) i); + } + + /// Write vInt into prox stream of current Posting + public void WriteProxVInt(int i) + { + while ((i & ~ 0x7F) != 0) + { + WriteProxByte((byte) ((i & 0x7f) | 0x80)); + i = SupportClass.Number.URShift(i, 7); + } + WriteProxByte((byte) i); + } + + /// Write byte into freq stream of current Posting + internal byte[] freq; + internal int freqUpto; + public void WriteFreqByte(byte b) + { + System.Diagnostics.Debug.Assert(freq != null); + if (freq[freqUpto] != 0) + { + freqUpto = postingsPool.AllocSlice(freq, freqUpto); + freq = postingsPool.buffer; + p.freqUpto = postingsPool.byteOffset; + } + freq[freqUpto++] = b; + } + + /// Write byte into prox stream of current Posting + internal byte[] prox; + internal int proxUpto; + public void WriteProxByte(byte b) + { + System.Diagnostics.Debug.Assert(prox != null); + if (prox[proxUpto] != 0) + { + proxUpto = postingsPool.AllocSlice(prox, proxUpto); + prox = postingsPool.buffer; + p.proxUpto = postingsPool.byteOffset; + System.Diagnostics.Debug.Assert(prox != null); + } + prox[proxUpto++] = b; + System.Diagnostics.Debug.Assert(proxUpto != prox.Length); + } + + /// Currently only used to copy a payload into the prox + /// stream. 
+ /// + public void WriteProxBytes(byte[] b, int offset, int len) + { + int offsetEnd = offset + len; + while (offset < offsetEnd) + { + if (prox[proxUpto] != 0) + { + // End marker + proxUpto = postingsPool.AllocSlice(prox, proxUpto); + prox = postingsPool.buffer; + p.proxUpto = postingsPool.byteOffset; + } + + prox[proxUpto++] = b[offset++]; + System.Diagnostics.Debug.Assert(proxUpto != prox.Length); + } + } + + /// Write vInt into offsets stream of current + /// PostingVector + /// + public void WriteOffsetVInt(int i) + { + while ((i & ~ 0x7F) != 0) + { + WriteOffsetByte((byte) ((i & 0x7f) | 0x80)); + i = SupportClass.Number.URShift(i, 7); + } + WriteOffsetByte((byte) i); + } + + internal byte[] offsets; + internal int offsetUpto; + + /// Write byte into offsets stream of current + /// PostingVector + /// + public void WriteOffsetByte(byte b) + { + System.Diagnostics.Debug.Assert(offsets != null); + if (offsets[offsetUpto] != 0) + { + offsetUpto = vectorsPool.AllocSlice(offsets, offsetUpto); + offsets = vectorsPool.buffer; + vector.offsetUpto = vectorsPool.byteOffset; + } + offsets[offsetUpto++] = b; + } + + /// Write vInt into pos stream of current + /// PostingVector + /// + public void WritePosVInt(int i) + { + while ((i & ~ 0x7F) != 0) + { + WritePosByte((byte) ((i & 0x7f) | 0x80)); + i = SupportClass.Number.URShift(i, 7); + } + WritePosByte((byte) i); + } + + internal byte[] pos; + internal int posUpto; + + /// Write byte into pos stream of current + /// PostingVector + /// + public void WritePosByte(byte b) + { + System.Diagnostics.Debug.Assert(pos != null); + if (pos[posUpto] != 0) + { + posUpto = vectorsPool.AllocSlice(pos, posUpto); + pos = vectorsPool.buffer; + vector.posUpto = vectorsPool.byteOffset; + } + pos[posUpto++] = b; + } + + internal PostingVector[] postingsVectors = new PostingVector[1]; + internal int maxPostingsVectors; + + // Used to read a string value for a field + internal ReusableStringReader stringReader = new ReusableStringReader(); + + /// Holds data associated with a single field, including + /// the Postings hash. A document may have many * + /// occurrences for a given field name; we gather all * + /// such occurrences here (in docFields) so that we can + /// * process the entire field at once. 
+ /// + sealed internal class FieldData : System.IComparable + { + private void InitBlock(ThreadState enclosingInstance) + { + this.enclosingInstance = enclosingInstance; + } + private ThreadState enclosingInstance; + public ThreadState Enclosing_Instance + { + get + { + return enclosingInstance; + } + + } + + internal ThreadState threadState; + internal FieldInfo fieldInfo; + + internal int fieldCount; + internal Fieldable[] docFields = new Fieldable[1]; + + internal int lastGen = - 1; + internal FieldData next; + + internal bool doNorms; + internal bool doVectors; + internal bool doVectorPositions; + internal bool doVectorOffsets; + internal bool postingsCompacted; + + internal int numPostings; + + internal Posting[] postingsHash; + internal int postingsHashSize; + internal int postingsHashHalfSize; + internal int postingsHashMask; + + internal int position; + internal int length; + internal int offset; + internal float boost; + internal int postingsVectorsUpto; + + public FieldData(ThreadState enclosingInstance, FieldInfo fieldInfo) + { + InitBlock(enclosingInstance); + this.fieldInfo = fieldInfo; + threadState = Enclosing_Instance; + } + + internal void ResetPostingArrays() + { + if (!postingsCompacted) + CompactPostings(); + Enclosing_Instance.Enclosing_Instance.RecyclePostings(this.postingsHash, numPostings); + Array.Clear(postingsHash, 0, postingsHash.Length); + postingsCompacted = false; + numPostings = 0; + } + + internal void InitPostingArrays() + { + // Target hash fill factor of <= 50% + // NOTE: must be a power of two for hash collision + // strategy to work correctly + postingsHashSize = 4; + postingsHashHalfSize = 2; + postingsHashMask = postingsHashSize - 1; + postingsHash = new Posting[postingsHashSize]; + } + + /// So Arrays.sort can sort us. + public int CompareTo(System.Object o) + { + return String.CompareOrdinal(fieldInfo.name, ((FieldData) o).fieldInfo.name); + } + + private void CompactPostings() + { + int upto = 0; + for (int i = 0; i < postingsHashSize; i++) + if (postingsHash[i] != null) + postingsHash[upto++] = postingsHash[i]; + + System.Diagnostics.Debug.Assert(upto == numPostings); + postingsCompacted = true; + } + + /// Collapse the hash table & sort in-place. + public Posting[] SortPostings() + { + CompactPostings(); + Enclosing_Instance.DoPostingSort(postingsHash, numPostings); + return postingsHash; + } + + /// Process all occurrences of one field in the document. 
+ public void ProcessField(Analyzer analyzer) + { + length = 0; + position = 0; + offset = 0; + boost = Enclosing_Instance.docBoost; + + int maxFieldLength = Enclosing_Instance.Enclosing_Instance.writer.GetMaxFieldLength(); + + int limit = fieldCount; + Fieldable[] docFieldsFinal = docFields; + + bool doWriteVectors = true; + + // Walk through all occurrences in this doc for this + // field: + try + { + for (int j = 0; j < limit; j++) + { + Fieldable field = docFieldsFinal[j]; + + if (field.IsIndexed()) + InvertField(field, analyzer, maxFieldLength); + + if (field.IsStored()) + { + Enclosing_Instance.numStoredFields++; + bool success = false; + try + { + Enclosing_Instance.localFieldsWriter.WriteField(fieldInfo, field); + success = true; + } + finally + { + // If we hit an exception inside + // localFieldsWriter.writeField, the + // contents of fdtLocal can be corrupt, so + // we must discard all stored fields for + // this document: + if (!success) + Enclosing_Instance.fdtLocal.Reset(); + } + } + + docFieldsFinal[j] = null; + } + } + catch (AbortException ae) + { + doWriteVectors = false; + throw ae; + } + finally + { + if (postingsVectorsUpto > 0) + { + try + { + if (doWriteVectors) + { + // Add term vectors for this field + bool success = false; + try + { + WriteVectors(fieldInfo); + success = true; + } + finally + { + if (!success) + { + // If we hit an exception inside + // writeVectors, the contents of tvfLocal + // can be corrupt, so we must discard all + // term vectors for this document: + Enclosing_Instance.numVectorFields = 0; + Enclosing_Instance.tvfLocal.Reset(); + } + } + } + } + finally + { + if (postingsVectorsUpto > Enclosing_Instance.maxPostingsVectors) + Enclosing_Instance.maxPostingsVectors = postingsVectorsUpto; + postingsVectorsUpto = 0; + Enclosing_Instance.vectorsPool.Reset(); + } + } + } + } + + internal int offsetEnd; + internal Token localToken = new Token(); + + /* Invert one occurrence of one field in the document */ + public void InvertField(Fieldable field, Analyzer analyzer, int maxFieldLength) + { + + if (length > 0) + position += analyzer.GetPositionIncrementGap(fieldInfo.name); + + if (!field.IsTokenized()) + { + // un-tokenized field + System.String stringValue = field.StringValue(); + int valueLength = stringValue.Length; + Token token = localToken; + token.Clear(); + char[] termBuffer = token.TermBuffer(); + if (termBuffer.Length < valueLength) + termBuffer = token.ResizeTermBuffer(valueLength); + DocumentsWriter.GetCharsFromString(stringValue, 0, valueLength, termBuffer, 0); + token.SetTermLength(valueLength); + token.SetStartOffset(offset); + token.SetEndOffset(offset + stringValue.Length); + AddPosition(token); + offset += stringValue.Length; + length++; + } + else + { + // tokenized field + TokenStream stream; + TokenStream streamValue = field.TokenStreamValue(); + + if (streamValue != null) + stream = streamValue; + else + { + // the field does not have a TokenStream, + // so we have to obtain one from the analyzer + System.IO.TextReader reader; // find or make Reader + System.IO.TextReader readerValue = field.ReaderValue(); + + if (readerValue != null) + reader = readerValue; + else + { + System.String stringValue = field.StringValue(); + if (stringValue == null) + throw new System.ArgumentException("field must have either TokenStream, String or Reader value"); + Enclosing_Instance.stringReader.Init(stringValue); + reader = Enclosing_Instance.stringReader; + } + + // Tokenize field and add to postingTable + stream = 
analyzer.ReusableTokenStream(fieldInfo.name, reader); + } + + // reset the TokenStream to the first token + stream.Reset(); + + try + { + offsetEnd = offset - 1; + Token token; + for (; ; ) + { + token = stream.Next(localToken); + if (token == null) + break; + position += (token.GetPositionIncrement() - 1); + AddPosition(token); + if (++length >= maxFieldLength) + { + if (Enclosing_Instance.Enclosing_Instance.infoStream != null) + Enclosing_Instance.Enclosing_Instance.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens"); + break; + } + } + offset = offsetEnd + 1; + } + finally + { + stream.Close(); + } + } + + boost *= field.GetBoost(); + } + + /// Only called when term vectors are enabled. This + /// is called the first time we see a given term for + /// each * document, to allocate a PostingVector + /// instance that * is used to record data needed to + /// write the posting * vectors. + /// + private PostingVector AddNewVector() + { + + if (postingsVectorsUpto == Enclosing_Instance.postingsVectors.Length) + { + int newSize; + if (Enclosing_Instance.postingsVectors.Length < 2) + newSize = 2; + else + { + newSize = (int) (1.5 * Enclosing_Instance.postingsVectors.Length); + } + PostingVector[] newArray = new PostingVector[newSize]; + Array.Copy(Enclosing_Instance.postingsVectors, 0, newArray, 0, Enclosing_Instance.postingsVectors.Length); + Enclosing_Instance.postingsVectors = newArray; + } + + Enclosing_Instance.p.vector = Enclosing_Instance.postingsVectors[postingsVectorsUpto]; + if (Enclosing_Instance.p.vector == null) + Enclosing_Instance.p.vector = Enclosing_Instance.postingsVectors[postingsVectorsUpto] = new PostingVector(); + + postingsVectorsUpto++; + + PostingVector v = Enclosing_Instance.p.vector; + v.p = Enclosing_Instance.p; + + int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0]; + + if (doVectorPositions) + { + int upto = Enclosing_Instance.vectorsPool.NewSlice(firstSize); + v.posStart = v.posUpto = Enclosing_Instance.vectorsPool.byteOffset + upto; + } + + if (doVectorOffsets) + { + int upto = Enclosing_Instance.vectorsPool.NewSlice(firstSize); + v.offsetStart = v.offsetUpto = Enclosing_Instance.vectorsPool.byteOffset + upto; + } + + return v; + } + + internal int offsetStartCode; + internal int offsetStart; + + /// This is the hotspot of indexing: it's called once + /// for every term of every document. Its job is to * + /// update the postings byte stream (Postings hash) * + /// based on the occurence of a single term. + /// + private void AddPosition(Token token) + { + + Payload payload = token.GetPayload(); + + // Get the text of this term. 
Term can either + // provide a String token or offset into a char[] + // array + char[] tokenText = token.TermBuffer(); + int tokenTextLen = token.TermLength(); + + int code = 0; + + // Compute hashcode + int downto = tokenTextLen; + while (downto > 0) + code = (code * 31) + tokenText[--downto]; + + // System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets); + + int hashPos = code & postingsHashMask; + + System.Diagnostics.Debug.Assert(!postingsCompacted); + + // Locate Posting in hash + Enclosing_Instance.p = postingsHash[hashPos]; + + if (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen)) + { + // Conflict: keep searching different locations in + // the hash table. + int inc = ((code >> 8) + code) | 1; + do + { + code += inc; + hashPos = code & postingsHashMask; + Enclosing_Instance.p = postingsHash[hashPos]; + } + while (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen)); + } + + int proxCode; + + // If we hit an exception below, it's possible the + // posting list or term vectors data will be + // partially written and thus inconsistent if + // flushed, so we have to abort all documents + // since the last flush: + + try + { + + if (Enclosing_Instance.p != null) + { + // term seen since last flush + + if (Enclosing_Instance.docID != Enclosing_Instance.p.lastDocID) + { + // term not yet seen in this doc + + // System.out.println(" seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto); + + System.Diagnostics.Debug.Assert(Enclosing_Instance.p.docFreq > 0); + + // Now that we know doc freq for previous doc, + // write it & lastDocCode + Enclosing_Instance.freqUpto = Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK; + Enclosing_Instance.freq = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.freqUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT]; + if (1 == Enclosing_Instance.p.docFreq) + Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode | 1); + else + { + Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode); + Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.docFreq); + } + Enclosing_Instance.p.freqUpto = Enclosing_Instance.freqUpto + (Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK); + + if (doVectors) + { + Enclosing_Instance.vector = AddNewVector(); + if (doVectorOffsets) + { + offsetStartCode = offsetStart = offset + token.StartOffset(); + offsetEnd = offset + token.EndOffset(); + } + } + + proxCode = position; + + Enclosing_Instance.p.docFreq = 1; + + // Store code so we can write this after we're + // done with this new doc + Enclosing_Instance.p.lastDocCode = (Enclosing_Instance.docID - Enclosing_Instance.p.lastDocID) << 1; + Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID; + } + else + { + // term already seen in this doc + // System.out.println(" seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto); + Enclosing_Instance.p.docFreq++; + + proxCode = position - Enclosing_Instance.p.lastPosition; + + if (doVectors) + { + Enclosing_Instance.vector = Enclosing_Instance.p.vector; + if (Enclosing_Instance.vector == null) + Enclosing_Instance.vector = AddNewVector(); + if (doVectorOffsets) + { + 
offsetStart = offset + token.StartOffset(); + offsetEnd = offset + token.EndOffset(); + offsetStartCode = offsetStart - Enclosing_Instance.vector.lastOffset; + } + } + } + } + else + { + // term not seen before + // System.out.println(" never seen docID=" + docID); + + // Refill? + if (0 == Enclosing_Instance.postingsFreeCount) + { + Enclosing_Instance.Enclosing_Instance.GetPostings(Enclosing_Instance.postingsFreeList); + Enclosing_Instance.postingsFreeCount = Enclosing_Instance.postingsFreeList.Length; + } + + int textLen1 = 1 + tokenTextLen; + if (textLen1 + Enclosing_Instance.charPool.byteUpto > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE) + { + if (textLen1 > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE) + { + // Just skip this term, to remain as robust as + // possible during indexing. A TokenFilter + // can be inserted into the analyzer chain if + // other behavior is wanted (pruning the term + // to a prefix, throwing an exception, etc). + if (Enclosing_Instance.maxTermPrefix == null) + Enclosing_Instance.maxTermPrefix = new System.String(tokenText, 0, 30); + + // Still increment position: + position++; + return ; + } + Enclosing_Instance.charPool.NextBuffer(); + } + char[] text = Enclosing_Instance.charPool.buffer; + int textUpto = Enclosing_Instance.charPool.byteUpto; + + // Pull next free Posting from free list + Enclosing_Instance.p = Enclosing_Instance.postingsFreeList[--Enclosing_Instance.postingsFreeCount]; + + Enclosing_Instance.p.textStart = textUpto + Enclosing_Instance.charPool.byteOffset; + Enclosing_Instance.charPool.byteUpto += textLen1; + + Array.Copy(tokenText, 0, text, textUpto, tokenTextLen); + + text[textUpto + tokenTextLen] = (char) (0xffff); + + System.Diagnostics.Debug.Assert(postingsHash [hashPos] == null); + + postingsHash[hashPos] = Enclosing_Instance.p; + numPostings++; + + if (numPostings == postingsHashHalfSize) + RehashPostings(2 * postingsHashSize); + + // Init first slice for freq & prox streams + int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0]; + + int upto1 = Enclosing_Instance.postingsPool.NewSlice(firstSize); + Enclosing_Instance.p.freqStart = Enclosing_Instance.p.freqUpto = Enclosing_Instance.postingsPool.byteOffset + upto1; + + int upto2 = Enclosing_Instance.postingsPool.NewSlice(firstSize); + Enclosing_Instance.p.proxStart = Enclosing_Instance.p.proxUpto = Enclosing_Instance.postingsPool.byteOffset + upto2; + + Enclosing_Instance.p.lastDocCode = Enclosing_Instance.docID << 1; + Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID; + Enclosing_Instance.p.docFreq = 1; + + if (doVectors) + { + Enclosing_Instance.vector = AddNewVector(); + if (doVectorOffsets) + { + offsetStart = offsetStartCode = offset + token.StartOffset(); + offsetEnd = offset + token.EndOffset(); + } + } + + proxCode = position; + } + + Enclosing_Instance.proxUpto = Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK; + Enclosing_Instance.prox = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.proxUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT]; + System.Diagnostics.Debug.Assert(Enclosing_Instance.prox != null); + + if (payload != null && payload.length > 0) + { + Enclosing_Instance.WriteProxVInt((proxCode << 1) | 1); + Enclosing_Instance.WriteProxVInt(payload.length); + Enclosing_Instance.WriteProxBytes(payload.data, payload.offset, payload.length); + fieldInfo.storePayloads = true; + } + else + Enclosing_Instance.WriteProxVInt(proxCode << 1); + + Enclosing_Instance.p.proxUpto = 
Enclosing_Instance.proxUpto + (Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK); + + Enclosing_Instance.p.lastPosition = position++; + + if (doVectorPositions) + { + Enclosing_Instance.posUpto = Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK; + Enclosing_Instance.pos = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.posUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT]; + Enclosing_Instance.WritePosVInt(proxCode); + Enclosing_Instance.vector.posUpto = Enclosing_Instance.posUpto + (Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK); + } + + if (doVectorOffsets) + { + Enclosing_Instance.offsetUpto = Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK; + Enclosing_Instance.offsets = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.offsetUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT]; + Enclosing_Instance.WriteOffsetVInt(offsetStartCode); + Enclosing_Instance.WriteOffsetVInt(offsetEnd - offsetStart); + Enclosing_Instance.vector.lastOffset = offsetEnd; + Enclosing_Instance.vector.offsetUpto = Enclosing_Instance.offsetUpto + (Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK); + } + } + catch (System.Exception t) + { + throw new AbortException(t, Enclosing_Instance.Enclosing_Instance); + } + } + + /// Called when postings hash is too small (> 50% + /// occupied) or too large (< 20% occupied). + /// + internal void RehashPostings(int newSize) + { + + int newMask = newSize - 1; + + Posting[] newHash = new Posting[newSize]; + for (int i = 0; i < postingsHashSize; i++) + { + Posting p0 = postingsHash[i]; + if (p0 != null) + { + int start = p0.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK; + char[] text = Enclosing_Instance.charPool.buffers[p0.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT]; + int pos = start; + while (text[pos] != 0xffff) + pos++; + int code = 0; + while (pos > start) + code = (code * 31) + text[--pos]; + + int hashPos = code & newMask; + System.Diagnostics.Debug.Assert(hashPos >= 0); + if (newHash[hashPos] != null) + { + int inc = ((code >> 8) + code) | 1; + do + { + code += inc; + hashPos = code & newMask; + } + while (newHash[hashPos] != null); + } + newHash[hashPos] = p0; + } + } + + postingsHashMask = newMask; + postingsHash = newHash; + postingsHashSize = newSize; + postingsHashHalfSize = newSize >> 1; + } + + internal ByteSliceReader vectorSliceReader = new ByteSliceReader(); + + /// Called once per field per document if term vectors + /// are enabled, to write the vectors to * + /// RAMOutputStream, which is then quickly flushed to + /// * the real term vectors files in the Directory. 
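+			/// Each term is written with shared-prefix compression against the
+			/// previous (sorted) term: prefix length, suffix length, suffix chars,
+			/// then the term freq and, if enabled, its position and offset slices.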
+ /// + internal void WriteVectors(FieldInfo fieldInfo) + { + + System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector); + + Enclosing_Instance.vectorFieldNumbers[Enclosing_Instance.numVectorFields] = fieldInfo.number; + Enclosing_Instance.vectorFieldPointers[Enclosing_Instance.numVectorFields] = Enclosing_Instance.tvfLocal.GetFilePointer(); + Enclosing_Instance.numVectorFields++; + + int numPostingsVectors = postingsVectorsUpto; + + Enclosing_Instance.tvfLocal.WriteVInt(numPostingsVectors); + byte bits = (byte) (0x0); + if (doVectorPositions) + bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; + if (doVectorOffsets) + bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; + Enclosing_Instance.tvfLocal.WriteByte(bits); + + Enclosing_Instance.DoVectorSort(Enclosing_Instance.postingsVectors, numPostingsVectors); + + Posting lastPosting = null; + + ByteSliceReader reader = vectorSliceReader; + + for (int j = 0; j < numPostingsVectors; j++) + { + PostingVector vector = Enclosing_Instance.postingsVectors[j]; + Posting posting = vector.p; + int freq = posting.docFreq; + + int prefix; + char[] text2 = Enclosing_Instance.charPool.buffers[posting.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT]; + int start2 = posting.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK; + int pos2 = start2; + + // Compute common prefix between last term and + // this term + if (lastPosting == null) + prefix = 0; + else + { + char[] text1 = Enclosing_Instance.charPool.buffers[lastPosting.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT]; + int start1 = lastPosting.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK; + int pos1 = start1; + while (true) + { + char c1 = text1[pos1]; + char c2 = text2[pos2]; + if (c1 != c2 || c1 == 0xffff) + { + prefix = pos1 - start1; + break; + } + pos1++; + pos2++; + } + } + lastPosting = posting; + + // Compute length + while (text2[pos2] != 0xffff) + pos2++; + + int suffix = pos2 - start2 - prefix; + Enclosing_Instance.tvfLocal.WriteVInt(prefix); + Enclosing_Instance.tvfLocal.WriteVInt(suffix); + Enclosing_Instance.tvfLocal.WriteChars(text2, start2 + prefix, suffix); + Enclosing_Instance.tvfLocal.WriteVInt(freq); + + if (doVectorPositions) + { + reader.Init(Enclosing_Instance.vectorsPool, vector.posStart, vector.posUpto); + reader.WriteTo(Enclosing_Instance.tvfLocal); + } + + if (doVectorOffsets) + { + reader.Init(Enclosing_Instance.vectorsPool, vector.offsetStart, vector.offsetUpto); + reader.WriteTo(Enclosing_Instance.tvfLocal); + } + } + } + } + } + + private static readonly byte defaultNorm; + + /// Write norms in the "true" segment format. This is + /// called only during commit, to create the .nrm file. + /// + internal void WriteNorms(System.String segmentName, int totalNumDoc) + { + + IndexOutput normsOut = directory.CreateOutput(segmentName + "." 
+ IndexFileNames.NORMS_EXTENSION); + + try + { + normsOut.WriteBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.Length); + + int numField = fieldInfos.Size(); + + for (int fieldIdx = 0; fieldIdx < numField; fieldIdx++) + { + FieldInfo fi = fieldInfos.FieldInfo(fieldIdx); + if (fi.isIndexed && !fi.omitNorms) + { + BufferedNorms n = norms[fieldIdx]; + long v; + if (n == null) + v = 0; + else + { + v = n.out_Renamed.GetFilePointer(); + n.out_Renamed.WriteTo(normsOut); + n.Reset(); + } + if (v < totalNumDoc) + FillBytes(normsOut, defaultNorm, (int) (totalNumDoc - v)); + } + } + } + finally + { + normsOut.Close(); + } + } + + private DefaultSkipListWriter skipListWriter = null; + + private bool currentFieldStorePayloads; + + /// Creates a segment from all Postings in the Postings + /// hashes across all ThreadStates & FieldDatas. + /// + private System.Collections.IList WriteSegment() + { + + System.Diagnostics.Debug.Assert(AllThreadsIdle()); + + System.Diagnostics.Debug.Assert(nextDocID == numDocsInRAM); + + System.String segmentName; + + segmentName = segment; + + TermInfosWriter termsOut = new TermInfosWriter(directory, segmentName, fieldInfos, writer.GetTermIndexInterval()); + + IndexOutput freqOut = directory.CreateOutput(segmentName + ".frq"); + IndexOutput proxOut = directory.CreateOutput(segmentName + ".prx"); + + // Gather all FieldData's that have postings, across all + // ThreadStates + System.Collections.ArrayList allFields = new System.Collections.ArrayList(); + System.Diagnostics.Debug.Assert(AllThreadsIdle()); + for (int i = 0; i < threadStates.Length; i++) + { + ThreadState state = threadStates[i]; + state.TrimFields(); + int numFields = state.numAllFieldData; + for (int j = 0; j < numFields; j++) + { + ThreadState.FieldData fp = state.allFieldDataArray[j]; + if (fp.numPostings > 0) + allFields.Add(fp); + } + } + + // Sort by field name + allFields.Sort(); + int numAllFields = allFields.Count; + + skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, numDocsInRAM, freqOut, proxOut); + + int start = 0; + while (start < numAllFields) + { + + System.String fieldName = ((ThreadState.FieldData) allFields[start]).fieldInfo.name; + + int end = start + 1; + while (end < numAllFields && ((ThreadState.FieldData) allFields[end]).fieldInfo.name.Equals(fieldName)) + end++; + + ThreadState.FieldData[] fields = new ThreadState.FieldData[end - start]; + for (int i = start; i < end; i++) + fields[i - start] = (ThreadState.FieldData) allFields[i]; + + // If this field has postings then add them to the + // segment + AppendPostings(fields, termsOut, freqOut, proxOut); + + for (int i = 0; i < fields.Length; i++) + fields[i].ResetPostingArrays(); + + start = end; + } + + freqOut.Close(); + proxOut.Close(); + termsOut.Close(); + + // Record all files we have flushed + System.Collections.IList flushedFiles = new System.Collections.ArrayList(); + flushedFiles.Add(SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION)); + flushedFiles.Add(SegmentFileName(IndexFileNames.FREQ_EXTENSION)); + flushedFiles.Add(SegmentFileName(IndexFileNames.PROX_EXTENSION)); + flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_EXTENSION)); + flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)); + + if (hasNorms) + { + WriteNorms(segmentName, numDocsInRAM); + flushedFiles.Add(SegmentFileName(IndexFileNames.NORMS_EXTENSION)); + } + + if (infoStream != null) + { + long newSegmentSize = SegmentSize(segmentName); + System.String message = String.Format(nf, " 
oldRAMSize={0:d} newFlushedSize={1:d} docs/MB={2:f} new/old={3:%}", + new Object[] { numBytesUsed, newSegmentSize, (numDocsInRAM / (newSegmentSize / 1024.0 / 1024.0)), (newSegmentSize / numBytesUsed) }); + infoStream.WriteLine(message); + } + + ResetPostingsData(); + + nextDocID = 0; + nextWriteDocID = 0; + numDocsInRAM = 0; + files = null; + + // Maybe downsize postingsFreeList array + if (postingsFreeList.Length > 1.5 * postingsFreeCount) + { + int newSize = postingsFreeList.Length; + while (newSize > 1.25 * postingsFreeCount) + { + newSize = (int) (newSize * 0.8); + } + Posting[] newArray = new Posting[newSize]; + Array.Copy(postingsFreeList, 0, newArray, 0, postingsFreeCount); + postingsFreeList = newArray; + } + + return flushedFiles; + } + + /// Returns the name of the file with this extension, on + /// the current segment we are working on. + /// + private System.String SegmentFileName(System.String extension) + { + return segment + "." + extension; + } + + private TermInfo termInfo = new TermInfo(); // minimize consing + + /// Used to merge the postings from multiple ThreadStates + /// when creating a segment + /// + internal sealed class FieldMergeState + { + + internal ThreadState.FieldData field; + + internal Posting[] postings; + + private Posting p; + internal char[] text; + internal int textOffset; + + private int postingUpto = - 1; + + private ByteSliceReader freq = new ByteSliceReader(); + internal ByteSliceReader prox = new ByteSliceReader(); + + internal int docID; + internal int termFreq; + + internal bool NextTerm() + { + postingUpto++; + if (postingUpto == field.numPostings) + return false; + + p = postings[postingUpto]; + docID = 0; + + text = field.threadState.charPool.buffers[p.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT]; + textOffset = p.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK; + + if (p.freqUpto > p.freqStart) + freq.Init(field.threadState.postingsPool, p.freqStart, p.freqUpto); + else + freq.bufferOffset = freq.upto = freq.endIndex = 0; + + prox.Init(field.threadState.postingsPool, p.proxStart, p.proxUpto); + + // Should always be true + System.Diagnostics.Debug.Assert(NextDoc()); + + return true; + } + + public bool NextDoc() + { + if (freq.bufferOffset + freq.upto == freq.endIndex) + { + if (p.lastDocCode != - 1) + { + // Return last doc + docID = p.lastDocID; + termFreq = p.docFreq; + p.lastDocCode = - 1; + return true; + } + // EOF + else + return false; + } + + int code = freq.ReadVInt(); + docID += SupportClass.Number.URShift(code, 1); + if ((code & 1) != 0) + termFreq = 1; + else + termFreq = freq.ReadVInt(); + + return true; + } + } + + internal int CompareText(char[] text1, int pos1, char[] text2, int pos2) + { + while (true) + { + char c1 = text1[pos1++]; + char c2 = text2[pos2++]; + if (c1 < c2) + if (0xffff == c2) + return 1; + else + return - 1; + else if (c2 < c1) + if (0xffff == c1) + return - 1; + else + return 1; + else if (0xffff == c1) + return 0; + } + } + + /* Walk through all unique text tokens (Posting + * instances) found in this field and serialize them + * into a single RAM segment. 
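+		 * Each field's postings are first sorted by term (SortPostings); for each
+		 * term, the FieldMergeStates that share it are then advanced docID by docID,
+		 * always taking the state with the smallest docID next, so the merged
+		 * posting list stays in document order.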
*/ + internal void AppendPostings(ThreadState.FieldData[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut) + { + + int fieldNumber = fields[0].fieldInfo.number; + int numFields = fields.Length; + + FieldMergeState[] mergeStates = new FieldMergeState[numFields]; + + for (int i = 0; i < numFields; i++) + { + FieldMergeState fms = mergeStates[i] = new FieldMergeState(); + fms.field = fields[i]; + fms.postings = fms.field.SortPostings(); + + System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields [0].fieldInfo); + + // Should always be true + System.Diagnostics.Debug.Assert(fms.NextTerm()); + } + + int skipInterval = termsOut.skipInterval; + currentFieldStorePayloads = fields[0].fieldInfo.storePayloads; + + FieldMergeState[] termStates = new FieldMergeState[numFields]; + + while (numFields > 0) + { + + // Get the next term to merge + termStates[0] = mergeStates[0]; + int numToMerge = 1; + + for (int i = 1; i < numFields; i++) + { + char[] text = mergeStates[i].text; + int textOffset = mergeStates[i].textOffset; + int cmp = CompareText(text, textOffset, termStates[0].text, termStates[0].textOffset); + + if (cmp < 0) + { + termStates[0] = mergeStates[i]; + numToMerge = 1; + } + else if (cmp == 0) + termStates[numToMerge++] = mergeStates[i]; + } + + int df = 0; + int lastPayloadLength = - 1; + + int lastDoc = 0; + + char[] text2 = termStates[0].text; + int start = termStates[0].textOffset; + int pos = start; + while (text2[pos] != 0xffff) + pos++; + + long freqPointer = freqOut.GetFilePointer(); + long proxPointer = proxOut.GetFilePointer(); + + skipListWriter.ResetSkip(); + + // Now termStates has numToMerge FieldMergeStates + // which all share the same term. Now we must + // interleave the docID streams. + while (numToMerge > 0) + { + + if ((++df % skipInterval) == 0) + { + skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength); + skipListWriter.BufferSkip(df); + } + + FieldMergeState minState = termStates[0]; + for (int i = 1; i < numToMerge; i++) + if (termStates[i].docID < minState.docID) + minState = termStates[i]; + + int doc = minState.docID; + int termDocFreq = minState.termFreq; + + System.Diagnostics.Debug.Assert(doc < numDocsInRAM); + System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1); + + int newDocCode = (doc - lastDoc) << 1; + lastDoc = doc; + + ByteSliceReader prox = minState.prox; + + // Carefully copy over the prox + payload info, + // changing the format to match Lucene's segment + // format. 
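+							// In the segment format the low bit of each position code flags that a
+							// payload length follows, and the length is written only when it differs
+							// from the previous one (lastPayloadLength). For example, position deltas
+							// 3, 4, 2 with payload lengths 5, 5, 0 are written as
+							// (3<<1)|1, 5, <5 bytes>, (4<<1), <5 bytes>, (2<<1)|1, 0.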
+ for (int j = 0; j < termDocFreq; j++) + { + int code = prox.ReadVInt(); + if (currentFieldStorePayloads) + { + int payloadLength; + if ((code & 1) != 0) + { + // This position has a payload + payloadLength = prox.ReadVInt(); + } + else + payloadLength = 0; + if (payloadLength != lastPayloadLength) + { + proxOut.WriteVInt(code | 1); + proxOut.WriteVInt(payloadLength); + lastPayloadLength = payloadLength; + } + else + proxOut.WriteVInt(code & (~ 1)); + if (payloadLength > 0) + CopyBytes(prox, proxOut, payloadLength); + } + else + { + System.Diagnostics.Debug.Assert(0 ==(code & 1)); + proxOut.WriteVInt(code >> 1); + } + } + + if (1 == termDocFreq) + { + freqOut.WriteVInt(newDocCode | 1); + } + else + { + freqOut.WriteVInt(newDocCode); + freqOut.WriteVInt(termDocFreq); + } + + if (!minState.NextDoc()) + { + + // Remove from termStates + int upto = 0; + for (int i = 0; i < numToMerge; i++) + if (termStates[i] != minState) + termStates[upto++] = termStates[i]; + numToMerge--; + System.Diagnostics.Debug.Assert(upto == numToMerge); + + // Advance this state to the next term + + if (!minState.NextTerm()) + { + // OK, no more terms, so remove from mergeStates + // as well + upto = 0; + for (int i = 0; i < numFields; i++) + if (mergeStates[i] != minState) + mergeStates[upto++] = mergeStates[i]; + numFields--; + System.Diagnostics.Debug.Assert(upto == numFields); + } + } + } + + System.Diagnostics.Debug.Assert(df > 0); + + // Done merging this term + + long skipPointer = skipListWriter.WriteSkip(freqOut); + + // Write term + termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer)); + termsOut.Add(fieldNumber, text2, start, pos - start, termInfo); + } + } + + internal void Close() + { + lock (this) + { + closed = true; + System.Threading.Monitor.PulseAll(this); + } + } + + /// Returns a free (idle) ThreadState that may be used for + /// indexing this one document. This call also pauses if a + /// flush is pending. If delTerm is non-null then we + /// buffer this deleted term after the thread state has + /// been acquired. + /// + internal ThreadState GetThreadState(Document doc, Term delTerm) + { + lock (this) + { + + // First, find a thread state. If this thread already + // has affinity to a specific ThreadState, use that one + // again. + ThreadState state = (ThreadState) threadBindings[SupportClass.ThreadClass.Current()]; + if (state == null) + { [... 1213 lines stripped ...]
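For readers tracing the freq-stream encoding shared by addPosition and FieldMergeState.NextDoc above, here is a minimal editorial sketch (not part of the committed DocumentsWriter.cs; the class and method names are illustrative, and a List<int> stands in for the VInt byte slices): doc IDs are delta-coded, the delta is shifted left one bit, and the low bit marks the common termFreq == 1 case so it costs a single value.

    using System;
    using System.Collections.Generic;

    class FreqStreamSketch
    {
        // Append one (docID, termFreq) posting, given the previous docID for this term.
        static void WritePosting(List<int> vints, int lastDocID, int docID, int termFreq)
        {
            int docCode = (docID - lastDocID) << 1;   // delta-code the docID, keeping the low bit free
            if (termFreq == 1)
                vints.Add(docCode | 1);               // low bit set: freq is implicitly 1
            else
            {
                vints.Add(docCode);                   // low bit clear: explicit freq follows
                vints.Add(termFreq);
            }
        }

        // Decode the stream back into (docID, freq) pairs, mirroring NextDoc() above.
        static void ReadPostings(List<int> vints)
        {
            int docID = 0;
            int i = 0;
            while (i < vints.Count)
            {
                int code = vints[i++];
                docID += (int) ((uint) code >> 1);    // unsigned shift, like SupportClass.Number.URShift
                int freq = (code & 1) != 0 ? 1 : vints[i++];
                Console.WriteLine("doc=" + docID + " freq=" + freq);
            }
        }

        static void Main()
        {
            List<int> stream = new List<int>();
            WritePosting(stream, 0, 3, 1);   // doc 3, freq 1  -> (3 << 1) | 1
            WritePosting(stream, 3, 7, 4);   // doc 7, freq 4  -> (4 << 1), then 4
            ReadPostings(stream);            // prints: doc=3 freq=1, then doc=7 freq=4
        }
    }

The same convention appears both when a posting's previous document is finalized in addPosition (WriteFreqVInt of lastDocCode) and when AppendPostings re-reads the slices to write the .frq file (newDocCode and termDocFreq).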