From: nightowl888@apache.org
To: commits@lucenenet.apache.org
Date: Tue, 04 Apr 2017 17:19:53 -0000
Subject: [47/62] [abbrv] [partial] lucenenet git commit: Renamed Lucene.Net.Core folder Lucene.Net because the dotnet.exe pack command doesn't allow creating a NuGet package with a different name than its folder. Working around it with the script was much more co

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a5dc68d0/src/Lucene.Net.Core/Codecs/BlockTreeTermsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Codecs/BlockTreeTermsReader.cs b/src/Lucene.Net.Core/Codecs/BlockTreeTermsReader.cs
deleted file mode 100644
index fedc3b0..0000000
--- a/src/Lucene.Net.Core/Codecs/BlockTreeTermsReader.cs
+++ /dev/null
@@ -1,3536 +0,0 @@
-using Lucene.Net.Index;
-using Lucene.Net.Support;
-using Lucene.Net.Util.Fst;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Diagnostics.CodeAnalysis;
-using System.Text;
-
-namespace Lucene.Net.Codecs
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
- */
-
-    using ArrayUtil = Lucene.Net.Util.ArrayUtil;
-    using IBits = Lucene.Net.Util.IBits;
-    using ByteArrayDataInput = Lucene.Net.Store.ByteArrayDataInput;
-    using ByteSequenceOutputs = Lucene.Net.Util.Fst.ByteSequenceOutputs;
-    using BytesRef = Lucene.Net.Util.BytesRef;
-    using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton;
-    using CorruptIndexException = Lucene.Net.Index.CorruptIndexException;
-    using Directory = Lucene.Net.Store.Directory;
-    using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
-    using DocsEnum = Lucene.Net.Index.DocsEnum;
-    using FieldInfo = Lucene.Net.Index.FieldInfo;
-    using FieldInfos = Lucene.Net.Index.FieldInfos;
-    using IndexFileNames = Lucene.Net.Index.IndexFileNames;
-    using IndexInput = Lucene.Net.Store.IndexInput;
-    using IndexOptions = Lucene.Net.Index.IndexOptions;
-    using IOContext = Lucene.Net.Store.IOContext;
-    using IOUtils = Lucene.Net.Util.IOUtils;
-    using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;
-    using RunAutomaton = Lucene.Net.Util.Automaton.RunAutomaton;
-    using SegmentInfo = Lucene.Net.Index.SegmentInfo;
-    using StringHelper = Lucene.Net.Util.StringHelper;
-    using Terms = Lucene.Net.Index.Terms;
-    using TermsEnum = Lucene.Net.Index.TermsEnum;
-    using TermState = Lucene.Net.Index.TermState;
-    using Transition = Lucene.Net.Util.Automaton.Transition;
-
-    /// <summary>
-    /// A block-based terms index and dictionary that assigns
-    /// terms to variable-length blocks according to how they
-    /// share prefixes. The terms index is a prefix trie
-    /// whose leaves are term blocks. The advantage of this
-    /// approach is that seekExact is often able to
-    /// determine that a term cannot exist without doing any IO, and
-    /// intersection with Automata is very fast. Note that this
-    /// terms dictionary has its own fixed terms index (i.e., it
-    /// does not support a pluggable terms index
-    /// implementation).
-    ///
-    /// <para>NOTE: this terms dictionary does not support
-    /// an index divisor when opening an IndexReader. Instead, you
-    /// can change the min/maxItemsPerBlock during indexing.</para>
-    ///
-    /// <para>The data structure used by this implementation is very
-    /// similar to a burst trie
-    /// (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
-    /// but with added logic to break up too-large blocks of all
-    /// terms sharing a given prefix into smaller ones.</para>
-    ///
-    /// <para>Use CheckIndex with the -verbose
-    /// option to see summary statistics on the blocks in the
-    /// dictionary.</para>
-    ///
-    /// See BlockTreeTermsWriter.
-    ///
-    /// @lucene.experimental
-    /// </summary>
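    // An illustrative sketch of how a terms dictionary like this is typically consumed through
    // the generic FieldsProducer/Terms/TermsEnum surface it implements. How the producer is
    // obtained and the field name "body" are assumptions, not part of this file or commit.
    //
    //     FieldsProducer fieldsProducer = ...; // e.g. from a codec's PostingsFormat.FieldsProducer(state)
    //     Terms terms = fieldsProducer.GetTerms("body");    // hypothetical field name
    //     if (terms != null)
    //     {
    //         TermsEnum termsEnum = terms.GetIterator(null); // no enumerator to reuse
    //         BytesRef term;
    //         while ((term = termsEnum.Next()) != null)
    //         {
    //             // DocFreq/TotalTermFreq lazily decode block metadata (see DecodeMetaData below)
    //             Console.WriteLine(term.Utf8ToString() + " docFreq=" + termsEnum.DocFreq);
    //         }
    //     }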
- public class BlockTreeTermsReader : FieldsProducer - { - private void InitializeInstanceFields() - { - NO_OUTPUT = fstOutputs.NoOutput; - } - - // Open input to the main terms dict file (_X.tib) - private readonly IndexInput @in; - - //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; - - // Reads the terms dict entries, to gather state to - // produce DocsEnum on demand - private readonly PostingsReaderBase postingsReader; - - // LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java - private readonly SortedDictionary fields = new SortedDictionary(StringComparer.Ordinal); - - /// - /// File offset where the directory starts in the terms file. - private long dirOffset; - - /// - /// File offset where the directory starts in the index file. - private long indexDirOffset; - - private string segment; - - private readonly int version; - - /// - /// Sole constructor. - public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info, PostingsReaderBase postingsReader, IOContext ioContext, string segmentSuffix, int indexDivisor) - { - InitializeInstanceFields(); - - this.postingsReader = postingsReader; - - this.segment = info.Name; - @in = dir.OpenInput(IndexFileNames.SegmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_EXTENSION), ioContext); - - bool success = false; - IndexInput indexIn = null; - - try - { - version = ReadHeader(@in); - if (indexDivisor != -1) - { - indexIn = dir.OpenInput(IndexFileNames.SegmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION), ioContext); - int indexVersion = ReadIndexHeader(indexIn); - if (indexVersion != version) - { - throw new CorruptIndexException("mixmatched version files: " + @in + "=" + version + "," + indexIn + "=" + indexVersion); - } - } - - // verify - if (indexIn != null && version >= BlockTreeTermsWriter.VERSION_CHECKSUM) - { - CodecUtil.ChecksumEntireFile(indexIn); - } - - // Have PostingsReader init itself - postingsReader.Init(@in); - - // Read per-field details - SeekDir(@in, dirOffset); - if (indexDivisor != -1) - { - SeekDir(indexIn, indexDirOffset); - } - - int numFields = @in.ReadVInt32(); - if (numFields < 0) - { - throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + @in + ")"); - } - - for (int i = 0; i < numFields; i++) - { - int field = @in.ReadVInt32(); - long numTerms = @in.ReadVInt64(); - Debug.Assert(numTerms >= 0); - int numBytes = @in.ReadVInt32(); - BytesRef rootCode = new BytesRef(new byte[numBytes]); - @in.ReadBytes(rootCode.Bytes, 0, numBytes); - rootCode.Length = numBytes; - FieldInfo fieldInfo = fieldInfos.FieldInfo(field); - Debug.Assert(fieldInfo != null, "field=" + field); - long sumTotalTermFreq = fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY ? -1 : @in.ReadVInt64(); - long sumDocFreq = @in.ReadVInt64(); - int docCount = @in.ReadVInt32(); - int longsSize = version >= BlockTreeTermsWriter.VERSION_META_ARRAY ? 
@in.ReadVInt32() : 0; - if (docCount < 0 || docCount > info.DocCount) // #docs with field must be <= #docs - { - throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.DocCount + " (resource=" + @in + ")"); - } - if (sumDocFreq < docCount) // #postings must be >= #docs with field - { - throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + @in + ")"); - } - if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) // #positions must be >= #postings - { - throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + @in + ")"); - } - long indexStartFP = indexDivisor != -1 ? indexIn.ReadVInt64() : 0; - - if (fields.ContainsKey(fieldInfo.Name)) - { - throw new CorruptIndexException("duplicate field: " + fieldInfo.Name + " (resource=" + @in + ")"); - } - else - { - fields[fieldInfo.Name] = new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, longsSize, indexIn); - } - } - if (indexDivisor != -1) - { - indexIn.Dispose(); - } - - success = true; - } - finally - { - if (!success) - { - // this.Dispose() will close in: - IOUtils.CloseWhileHandlingException(indexIn, this); - } - } - } - - /// - /// Reads terms file header. - protected internal virtual int ReadHeader(IndexInput input) - { - int version = CodecUtil.CheckHeader(input, BlockTreeTermsWriter.TERMS_CODEC_NAME, BlockTreeTermsWriter.VERSION_START, BlockTreeTermsWriter.VERSION_CURRENT); - if (version < BlockTreeTermsWriter.VERSION_APPEND_ONLY) - { - dirOffset = input.ReadInt64(); - } - return version; - } - - /// - /// Reads index file header. - protected internal virtual int ReadIndexHeader(IndexInput input) - { - int version = CodecUtil.CheckHeader(input, BlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, BlockTreeTermsWriter.VERSION_START, BlockTreeTermsWriter.VERSION_CURRENT); - if (version < BlockTreeTermsWriter.VERSION_APPEND_ONLY) - { - indexDirOffset = input.ReadInt64(); - } - return version; - } - - /// - /// Seek {@code input} to the directory offset. 
- protected internal virtual void SeekDir(IndexInput input, long dirOffset) - { - if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) - { - input.Seek(input.Length - CodecUtil.FooterLength() - 8); - dirOffset = input.ReadInt64(); - } - else if (version >= BlockTreeTermsWriter.VERSION_APPEND_ONLY) - { - input.Seek(input.Length - 8); - dirOffset = input.ReadInt64(); - } - input.Seek(dirOffset); - } - - // for debugging - // private static String toHex(int v) { - // return "0x" + Integer.toHexString(v); - // } - - protected override void Dispose(bool disposing) - { - if (disposing) - { - try - { - IOUtils.Close(@in, postingsReader); - } - finally - { - // Clear so refs to terms index is GCable even if - // app hangs onto us: - fields.Clear(); - } - } - } - - public override IEnumerator GetEnumerator() - { - return fields.Keys.GetEnumerator(); - } - - public override Terms GetTerms(string field) - { - Debug.Assert(field != null); - FieldReader ret; - fields.TryGetValue(field, out ret); - return ret; - } - - public override int Count - { - get { return fields.Count; } - } - - // for debugging - internal virtual string BrToString(BytesRef b) - { - if (b == null) - { - return "null"; - } - else - { - try - { - return b.Utf8ToString() + " " + b; - } - catch (Exception) - { - // If BytesRef isn't actually UTF8, or it's eg a - // prefix of UTF8 that ends mid-unicode-char, we - // fallback to hex: - return b.ToString(); - } - } - } - - /// - /// BlockTree statistics for a single field - /// returned by . - /// - public class Stats - { - /// - /// How many nodes in the index FST. - public long IndexNodeCount { get; set; } - - /// - /// How many arcs in the index FST. - public long IndexArcCount { get; set; } - - /// - /// Byte size of the index. - public long IndexNumBytes { get; set; } - - /// - /// Total number of terms in the field. - public long TotalTermCount { get; set; } - - /// - /// Total number of bytes (sum of term lengths) across all terms in the field. - public long TotalTermBytes { get; set; } - - /// - /// The number of normal (non-floor) blocks in the terms file. - public int NonFloorBlockCount { get; set; } - - /// - /// The number of floor blocks (meta-blocks larger than the - /// allowed {@code maxItemsPerBlock}) in the terms file. - /// - public int FloorBlockCount { get; set; } - - /// - /// The number of sub-blocks within the floor blocks. - public int FloorSubBlockCount { get; set; } - - /// - /// The number of "internal" blocks (that have both - /// terms and sub-blocks). - /// - public int MixedBlockCount { get; set; } - - /// - /// The number of "leaf" blocks (blocks that have only - /// terms). - /// - public int TermsOnlyBlockCount { get; set; } - - /// - /// The number of "internal" blocks that do not contain - /// terms (have only sub-blocks). - /// - public int SubBlocksOnlyBlockCount { get; set; } - - /// - /// Total number of blocks. - public int TotalBlockCount { get; set; } - - /// - /// Number of blocks at each prefix depth. - [WritableArray] - [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")] - public int[] BlockCountByPrefixLen - { - get { return blockCountByPrefixLen; } - set { blockCountByPrefixLen = value; } - } - private int[] blockCountByPrefixLen = new int[10]; - - internal int startBlockCount; - internal int endBlockCount; - - /// - /// Total number of bytes used to store term suffixes. 
- public long TotalBlockSuffixBytes { get; set; } - - /// - /// Total number of bytes used to store term stats (not - /// including what the - /// stores. - /// - public long TotalBlockStatsBytes { get; set; } - - /// - /// Total bytes stored by the , - /// plus the other few vInts stored in the frame. - /// - public long TotalBlockOtherBytes { get; set; } - - /// - /// Segment name. - public string Segment { get; private set; } - - /// - /// Field name. - public string Field { get; private set; } - - internal Stats(string segment, string field) - { - this.Segment = segment; - this.Field = field; - } - - internal virtual void StartBlock(FieldReader.SegmentTermsEnum.Frame frame, bool isFloor) - { - TotalBlockCount++; - if (isFloor) - { - if (frame.fp == frame.fpOrig) - { - FloorBlockCount++; - } - FloorSubBlockCount++; - } - else - { - NonFloorBlockCount++; - } - - if (blockCountByPrefixLen.Length <= frame.prefix) - { - blockCountByPrefixLen = ArrayUtil.Grow(blockCountByPrefixLen, 1 + frame.prefix); - } - blockCountByPrefixLen[frame.prefix]++; - startBlockCount++; - TotalBlockSuffixBytes += frame.suffixesReader.Length; - TotalBlockStatsBytes += frame.statsReader.Length; - } - - internal virtual void EndBlock(FieldReader.SegmentTermsEnum.Frame frame) - { - int termCount = frame.isLeafBlock ? frame.entCount : frame.state.TermBlockOrd; - int subBlockCount = frame.entCount - termCount; - TotalTermCount += termCount; - if (termCount != 0 && subBlockCount != 0) - { - MixedBlockCount++; - } - else if (termCount != 0) - { - TermsOnlyBlockCount++; - } - else if (subBlockCount != 0) - { - SubBlocksOnlyBlockCount++; - } - else - { - throw new InvalidOperationException(); - } - endBlockCount++; - long otherBytes = frame.fpEnd - frame.fp - frame.suffixesReader.Length - frame.statsReader.Length; - Debug.Assert(otherBytes > 0, "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd); - TotalBlockOtherBytes += otherBytes; - } - - internal virtual void Term(BytesRef term) - { - TotalTermBytes += term.Length; - } - - internal virtual void Finish() - { - Debug.Assert(startBlockCount == endBlockCount, "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount); - Debug.Assert(TotalBlockCount == FloorSubBlockCount + NonFloorBlockCount, "floorSubBlockCount=" + FloorSubBlockCount + " nonFloorBlockCount=" + NonFloorBlockCount + " totalBlockCount=" + TotalBlockCount); - Debug.Assert(TotalBlockCount == MixedBlockCount + TermsOnlyBlockCount + SubBlocksOnlyBlockCount, "totalBlockCount=" + TotalBlockCount + " mixedBlockCount=" + MixedBlockCount + " subBlocksOnlyBlockCount=" + SubBlocksOnlyBlockCount + " termsOnlyBlockCount=" + TermsOnlyBlockCount); - } - - public override string ToString() - { - StringBuilder @out = new StringBuilder(); - - @out.AppendLine(" index FST:"); - @out.AppendLine(" " + IndexNodeCount + " nodes"); - @out.AppendLine(" " + IndexArcCount + " arcs"); - @out.AppendLine(" " + IndexNumBytes + " bytes"); - @out.AppendLine(" terms:"); - @out.AppendLine(" " + TotalTermCount + " terms"); - @out.AppendLine(" " + TotalTermBytes + " bytes" + (TotalTermCount != 0 ? 
" (" + ((double)TotalTermBytes / TotalTermCount).ToString("0.0") + " bytes/term)" : "")); - @out.AppendLine(" blocks:"); - @out.AppendLine(" " + TotalBlockCount + " blocks"); - @out.AppendLine(" " + TermsOnlyBlockCount + " terms-only blocks"); - @out.AppendLine(" " + SubBlocksOnlyBlockCount + " sub-block-only blocks"); - @out.AppendLine(" " + MixedBlockCount + " mixed blocks"); - @out.AppendLine(" " + FloorBlockCount + " floor blocks"); - @out.AppendLine(" " + (TotalBlockCount - FloorSubBlockCount) + " non-floor blocks"); - @out.AppendLine(" " + FloorSubBlockCount + " floor sub-blocks"); - @out.AppendLine(" " + TotalBlockSuffixBytes + " term suffix bytes" + (TotalBlockCount != 0 ? " (" + ((double)TotalBlockSuffixBytes / TotalBlockCount).ToString("0.0") + " suffix-bytes/block)" : "")); - @out.AppendLine(" " + TotalBlockStatsBytes + " term stats bytes" + (TotalBlockCount != 0 ? " (" + ((double)TotalBlockStatsBytes / TotalBlockCount).ToString("0.0") + " stats-bytes/block)" : "")); - @out.AppendLine(" " + TotalBlockOtherBytes + " other bytes" + (TotalBlockCount != 0 ? " (" + ((double)TotalBlockOtherBytes / TotalBlockCount).ToString("0.0") + " other-bytes/block)" : "")); - if (TotalBlockCount != 0) - { - @out.AppendLine(" by prefix length:"); - int total = 0; - for (int prefix = 0; prefix < blockCountByPrefixLen.Length; prefix++) - { - int blockCount = blockCountByPrefixLen[prefix]; - total += blockCount; - if (blockCount != 0) - { - @out.AppendLine(" " + prefix.ToString().PadLeft(2, ' ') + ": " + blockCount); - } - } - Debug.Assert(TotalBlockCount == total); - } - return @out.ToString(); - } - } - - internal readonly Outputs fstOutputs = ByteSequenceOutputs.Singleton; - internal BytesRef NO_OUTPUT; - - /// - /// BlockTree's implementation of . - public sealed class FieldReader : Terms - { - private readonly BlockTreeTermsReader outerInstance; - - internal readonly long numTerms; - internal readonly FieldInfo fieldInfo; - internal readonly long sumTotalTermFreq; - internal readonly long sumDocFreq; - internal readonly int docCount; - internal readonly long indexStartFP; - internal readonly long rootBlockFP; - internal readonly BytesRef rootCode; - internal readonly int longsSize; - - internal readonly FST index; - //private boolean DEBUG; - - internal FieldReader(BlockTreeTermsReader outerInstance, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn) - { - this.outerInstance = outerInstance; - Debug.Assert(numTerms > 0); - this.fieldInfo = fieldInfo; - //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.Equals("id", StringComparison.Ordinal); - this.numTerms = numTerms; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; - this.indexStartFP = indexStartFP; - this.rootCode = rootCode; - this.longsSize = longsSize; - // if (DEBUG) { - // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); - // } - - rootBlockFP = (int)((uint)(new ByteArrayDataInput(rootCode.Bytes, rootCode.Offset, rootCode.Length)).ReadVInt64() >> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS); - - if (indexIn != null) - { - IndexInput clone = (IndexInput)indexIn.Clone(); - //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); - clone.Seek(indexStartFP); - index = new FST(clone, ByteSequenceOutputs.Singleton); - - /* - if (false) { - final String dotFileName = segment + 
"_" + fieldInfo.name + ".dot"; - Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); - Util.toDot(index, w, false, false); - System.out.println("FST INDEX: SAVED to " + dotFileName); - w.Dispose(); - } - */ - } - else - { - index = null; - } - } - - /// - /// For debugging -- used by CheckIndex too - // TODO: maybe push this into Terms? - public Stats ComputeStats() - { - return (new SegmentTermsEnum(this)).ComputeBlockStats(); - } - - public override IComparer Comparer - { - get - { - return BytesRef.UTF8SortedAsUnicodeComparer; - } - } - - public override bool HasFreqs - { - get { return fieldInfo.IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS) >= 0; } - } - - public override bool HasOffsets - { - get { return fieldInfo.IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; } - } - - public override bool HasPositions - { - get { return fieldInfo.IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; } - } - - public override bool HasPayloads - { - get { return fieldInfo.HasPayloads; } - } - - public override TermsEnum GetIterator(TermsEnum reuse) - { - return new SegmentTermsEnum(this); - } - - public override long Count - { - get { return numTerms; } - } - - public override long SumTotalTermFreq - { - get - { - return sumTotalTermFreq; - } - } - - public override long SumDocFreq - { - get - { - return sumDocFreq; - } - } - - public override int DocCount - { - get - { - return docCount; - } - } - - public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm) - { - if (compiled.Type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) - { - throw new System.ArgumentException("please use CompiledAutomaton.getTermsEnum instead"); - } - return new IntersectEnum(this, compiled, startTerm); - } - - /// - /// Returns approximate RAM bytes used - public long RamBytesUsed() - { - return ((index != null) ? index.GetSizeInBytes() : 0); - } - - // NOTE: cannot seek! - private sealed class IntersectEnum : TermsEnum - { - private readonly BlockTreeTermsReader.FieldReader outerInstance; - - private readonly IndexInput @in; - - private Frame[] stack; - - private FST.Arc[] arcs = new FST.Arc[5]; - - private readonly RunAutomaton runAutomaton; - private readonly CompiledAutomaton compiledAutomaton; - - private Frame currentFrame; - - private readonly BytesRef term = new BytesRef(); - - private readonly FST.BytesReader fstReader; - - // TODO: can we share this with the frame in STE? 
- private sealed class Frame - { - private readonly BlockTreeTermsReader.FieldReader.IntersectEnum outerInstance; - - internal readonly int ord; - internal long fp; - internal long fpOrig; - internal long fpEnd; - internal long lastSubFP; - - // State in automaton - internal int state; - - internal int metaDataUpto; - - internal byte[] suffixBytes = new byte[128]; - internal readonly ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); - - internal byte[] statBytes = new byte[64]; - internal readonly ByteArrayDataInput statsReader = new ByteArrayDataInput(); - - internal byte[] floorData = new byte[32]; - internal readonly ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); - - // Length of prefix shared by all terms in this block - internal int prefix; - - // Number of entries (term or sub-block) in this block - internal int entCount; - - // Which term we will next read - internal int nextEnt; - - // True if this block is either not a floor block, - // or, it's the last sub-block of a floor block - internal bool isLastInFloor; - - // True if all entries are terms - internal bool isLeafBlock; - - internal int numFollowFloorBlocks; - internal int nextFloorLabel; - - internal Transition[] transitions; - internal int curTransitionMax; - internal int transitionIndex; - - internal FST.Arc arc; - - internal readonly BlockTermState termState; - - // metadata buffer, holding monotonic values - /// - /// NOTE: This was longs (field) in Lucene - /// - [WritableArray] - [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")] - public long[] Int64s - { - get { return longs; } - set { longs = value; } - } - private long[] longs; - - // metadata buffer, holding general values - [WritableArray] - [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")] - public byte[] Bytes - { - get { return bytes; } - set { bytes = value; } - } - private byte[] bytes; - - internal ByteArrayDataInput bytesReader; - - // Cumulative output so far - internal BytesRef outputPrefix; - - internal int startBytePos; - internal int suffix; - - public Frame(BlockTreeTermsReader.FieldReader.IntersectEnum outerInstance, int ord) - { - this.outerInstance = outerInstance; - this.ord = ord; - this.termState = outerInstance.outerInstance.outerInstance.postingsReader.NewTermState(); - this.termState.TotalTermFreq = -1; - this.longs = new long[outerInstance.outerInstance.longsSize]; - } - - internal void LoadNextFloorBlock() - { - Debug.Assert(numFollowFloorBlocks > 0); - //if (DEBUG) System.out.println(" loadNextFoorBlock trans=" + transitions[transitionIndex]); - - do - { - fp = fpOrig + ((int)((uint)floorDataReader.ReadVInt64() >> 1)); - numFollowFloorBlocks--; - // if (DEBUG) System.out.println(" skip floor block2! 
nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); - if (numFollowFloorBlocks != 0) - { - nextFloorLabel = floorDataReader.ReadByte() & 0xff; - } - else - { - nextFloorLabel = 256; - } - // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel); - } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].Min); - - Load(null); - } - - public void SetState(int state) - { - this.state = state; - transitionIndex = 0; - transitions = outerInstance.compiledAutomaton.SortedTransitions[state]; - if (transitions.Length != 0) - { - curTransitionMax = transitions[0].Max; - } - else - { - curTransitionMax = -1; - } - } - - internal void Load(BytesRef frameIndexData) - { - // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state)); - - if (frameIndexData != null && transitions.Length != 0) - { - // Floor frame - if (floorData.Length < frameIndexData.Length) - { - this.floorData = new byte[ArrayUtil.Oversize(frameIndexData.Length, 1)]; - } - System.Buffer.BlockCopy(frameIndexData.Bytes, frameIndexData.Offset, floorData, 0, frameIndexData.Length); - floorDataReader.Reset(floorData, 0, frameIndexData.Length); - // Skip first long -- has redundant fp, hasTerms - // flag, isFloor flag - long code = floorDataReader.ReadVInt64(); - if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0) - { - numFollowFloorBlocks = floorDataReader.ReadVInt32(); - nextFloorLabel = floorDataReader.ReadByte() & 0xff; - // if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel); - - // If current state is accept, we must process - // first block in case it has empty suffix: - if (outerInstance.runAutomaton.IsAccept(state)) - { - // Maybe skip floor blocks: - while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].Min) - { - fp = fpOrig + ((int)((uint)floorDataReader.ReadVInt64() >> 1)); - numFollowFloorBlocks--; - // if (DEBUG) System.out.println(" skip floor block! 
nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); - if (numFollowFloorBlocks != 0) - { - nextFloorLabel = floorDataReader.ReadByte() & 0xff; - } - else - { - nextFloorLabel = 256; - } - } - } - } - } - - outerInstance.@in.Seek(fp); - int code_ = outerInstance.@in.ReadVInt32(); - entCount = (int)((uint)code_ >> 1); - Debug.Assert(entCount > 0); - isLastInFloor = (code_ & 1) != 0; - - // term suffixes: - code_ = outerInstance.@in.ReadVInt32(); - isLeafBlock = (code_ & 1) != 0; - int numBytes = (int)((uint)code_ >> 1); - // if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes); - if (suffixBytes.Length < numBytes) - { - suffixBytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; - } - outerInstance.@in.ReadBytes(suffixBytes, 0, numBytes); - suffixesReader.Reset(suffixBytes, 0, numBytes); - - // stats - numBytes = outerInstance.@in.ReadVInt32(); - if (statBytes.Length < numBytes) - { - statBytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; - } - outerInstance.@in.ReadBytes(statBytes, 0, numBytes); - statsReader.Reset(statBytes, 0, numBytes); - metaDataUpto = 0; - - termState.TermBlockOrd = 0; - nextEnt = 0; - - // metadata - numBytes = outerInstance.@in.ReadVInt32(); - if (bytes == null) - { - bytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; - bytesReader = new ByteArrayDataInput(); - } - else if (bytes.Length < numBytes) - { - bytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; - } - outerInstance.@in.ReadBytes(bytes, 0, numBytes); - bytesReader.Reset(bytes, 0, numBytes); - - if (!isLastInFloor) - { - // Sub-blocks of a single floor block are always - // written one after another -- tail recurse: - fpEnd = outerInstance.@in.GetFilePointer(); - } - } - - // TODO: maybe add scanToLabel; should give perf boost - - public bool Next() - { - return isLeafBlock ? NextLeaf() : NextNonLeaf(); - } - - // Decodes next entry; returns true if it's a sub-block - public bool NextLeaf() - { - //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); - Debug.Assert(nextEnt != -1 && nextEnt < entCount, "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp); - nextEnt++; - suffix = suffixesReader.ReadVInt32(); - startBytePos = suffixesReader.Position; - suffixesReader.SkipBytes(suffix); - return false; - } - - public bool NextNonLeaf() - { - //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); - Debug.Assert(nextEnt != -1 && nextEnt < entCount, "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp); - nextEnt++; - int code = suffixesReader.ReadVInt32(); - suffix = (int)((uint)code >> 1); - startBytePos = suffixesReader.Position; - suffixesReader.SkipBytes(suffix); - if ((code & 1) == 0) - { - // A normal term - termState.TermBlockOrd++; - return false; - } - else - { - // A sub-block; make sub-FP absolute: - lastSubFP = fp - suffixesReader.ReadVInt64(); - return true; - } - } - - public int TermBlockOrd - { - get - { - return isLeafBlock ? nextEnt : termState.TermBlockOrd; - } - } - - public void DecodeMetaData() - { - // lazily catch up on metadata decode: - int limit = TermBlockOrd; - bool absolute = metaDataUpto == 0; - Debug.Assert(limit > 0); - - // TODO: better API would be "jump straight to term=N"??? 
- while (metaDataUpto < limit) - { - // TODO: we could make "tiers" of metadata, ie, - // decode docFreq/totalTF but don't decode postings - // metadata; this way caller could get - // docFreq/totalTF w/o paying decode cost for - // postings - - // TODO: if docFreq were bulk decoded we could - // just skipN here: - - // stats - termState.DocFreq = statsReader.ReadVInt32(); - //if (DEBUG) System.out.println(" dF=" + state.docFreq); - if (outerInstance.outerInstance.fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY) - { - termState.TotalTermFreq = termState.DocFreq + statsReader.ReadVInt64(); - //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); - } - // metadata - for (int i = 0; i < outerInstance.outerInstance.longsSize; i++) - { - longs[i] = bytesReader.ReadVInt64(); - } - outerInstance.outerInstance.outerInstance.postingsReader.DecodeTerm(longs, bytesReader, outerInstance.outerInstance.fieldInfo, termState, absolute); - - metaDataUpto++; - absolute = false; - } - termState.TermBlockOrd = metaDataUpto; - } - } - - private BytesRef savedStartTerm; - - // TODO: in some cases we can filter by length? eg - // regexp foo*bar must be at least length 6 bytes - public IntersectEnum(BlockTreeTermsReader.FieldReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm) - { - this.outerInstance = outerInstance; - // if (DEBUG) { - // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef)); - // } - runAutomaton = compiled.RunAutomaton; - compiledAutomaton = compiled; - @in = (IndexInput)outerInstance.outerInstance.@in.Clone(); - stack = new Frame[5]; - for (int idx = 0; idx < stack.Length; idx++) - { - stack[idx] = new Frame(this, idx); - } - for (int arcIdx = 0; arcIdx < arcs.Length; arcIdx++) - { - arcs[arcIdx] = new FST.Arc(); - } - - if (outerInstance.index == null) - { - fstReader = null; - } - else - { - fstReader = outerInstance.index.GetBytesReader(); - } - - // TODO: if the automaton is "smallish" we really - // should use the terms index to seek at least to - // the initial term and likely to subsequent terms - // (or, maybe just fallback to ATE for such cases). - // Else the seek cost of loading the frames will be - // too costly. - - FST.Arc arc = outerInstance.index.GetFirstArc(arcs[0]); - // Empty string prefix must have an output in the index! - Debug.Assert(arc.IsFinal); - - // Special pushFrame since it's the first one: - Frame f = stack[0]; - f.fp = f.fpOrig = outerInstance.rootBlockFP; - f.prefix = 0; - f.SetState(runAutomaton.InitialState); - f.arc = arc; - f.outputPrefix = arc.Output; - f.Load(outerInstance.rootCode); - - // for assert: - Debug.Assert(SetSavedStartTerm(startTerm)); - - currentFrame = f; - if (startTerm != null) - { - SeekToStartTerm(startTerm); - } - } - - // only for assert: - internal bool SetSavedStartTerm(BytesRef startTerm) - { - savedStartTerm = startTerm == null ? 
null : BytesRef.DeepCopyOf(startTerm); - return true; - } - - public override TermState GetTermState() - { - currentFrame.DecodeMetaData(); - return (TermState)currentFrame.termState.Clone(); - } - - private Frame GetFrame(int ord) - { - if (ord >= stack.Length) - { - Frame[] next = new Frame[ArrayUtil.Oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - Array.Copy(stack, 0, next, 0, stack.Length); - for (int stackOrd = stack.Length; stackOrd < next.Length; stackOrd++) - { - next[stackOrd] = new Frame(this, stackOrd); - } - stack = next; - } - Debug.Assert(stack[ord].ord == ord); - return stack[ord]; - } - - private FST.Arc GetArc(int ord) - { - if (ord >= arcs.Length) - { - FST.Arc[] next = new FST.Arc[ArrayUtil.Oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - Array.Copy(arcs, 0, next, 0, arcs.Length); - for (int arcOrd = arcs.Length; arcOrd < next.Length; arcOrd++) - { - next[arcOrd] = new FST.Arc(); - } - arcs = next; - } - return arcs[ord]; - } - - private Frame PushFrame(int state) - { - Frame f = GetFrame(currentFrame == null ? 0 : 1 + currentFrame.ord); - - f.fp = f.fpOrig = currentFrame.lastSubFP; - f.prefix = currentFrame.prefix + currentFrame.suffix; - // if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix); - f.SetState(state); - - // Walk the arc through the index -- we only - // "bother" with this so we can get the floor data - // from the index and skip floor blocks when - // possible: - FST.Arc arc = currentFrame.arc; - int idx = currentFrame.prefix; - Debug.Assert(currentFrame.suffix > 0); - BytesRef output = currentFrame.outputPrefix; - while (idx < f.prefix) - { - int target = term.Bytes[idx] & 0xff; - // TODO: we could be more efficient for the next() - // case by using current arc as starting point, - // passed to findTargetArc - arc = outerInstance.index.FindTargetArc(target, arc, GetArc(1 + idx), fstReader); - Debug.Assert(arc != null); - output = outerInstance.outerInstance.fstOutputs.Add(output, arc.Output); - idx++; - } - - f.arc = arc; - f.outputPrefix = output; - Debug.Assert(arc.IsFinal); - f.Load(outerInstance.outerInstance.fstOutputs.Add(output, arc.NextFinalOutput)); - return f; - } - - public override BytesRef Term - { - get { return term; } - } - - public override int DocFreq - { - get - { - //if (DEBUG) System.out.println("BTIR.docFreq"); - currentFrame.DecodeMetaData(); - //if (DEBUG) System.out.println(" return " + currentFrame.termState.docFreq); - return currentFrame.termState.DocFreq; - } - } - - public override long TotalTermFreq - { - get - { - currentFrame.DecodeMetaData(); - return currentFrame.termState.TotalTermFreq; - } - } - - public override DocsEnum Docs(IBits skipDocs, DocsEnum reuse, DocsFlags flags) - { - currentFrame.DecodeMetaData(); - return outerInstance.outerInstance.postingsReader.Docs(outerInstance.fieldInfo, currentFrame.termState, skipDocs, reuse, flags); - } - - public override DocsAndPositionsEnum DocsAndPositions(IBits skipDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) - { - if (outerInstance.fieldInfo.IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) - { - // Positions were not indexed: - return null; - } - - currentFrame.DecodeMetaData(); - return outerInstance.outerInstance.postingsReader.DocsAndPositions(outerInstance.fieldInfo, currentFrame.termState, skipDocs, reuse, flags); - } - - private int GetState() - { - int state = currentFrame.state; - for (int idx = 0; idx < currentFrame.suffix; idx++) - { - state = runAutomaton.Step(state, 
currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff); - Debug.Assert(state != -1); - } - return state; - } - - // NOTE: specialized to only doing the first-time - // seek, but we could generalize it to allow - // arbitrary seekExact/Ceil. Note that this is a - // seekFloor! - private void SeekToStartTerm(BytesRef target) - { - //if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString()); - Debug.Assert(currentFrame.ord == 0); - if (term.Length < target.Length) - { - term.Bytes = ArrayUtil.Grow(term.Bytes, target.Length); - } - FST.Arc arc = arcs[0]; - Debug.Assert(arc == currentFrame.arc); - - for (int idx = 0; idx <= target.Length; idx++) - { - while (true) - { - int savePos = currentFrame.suffixesReader.Position; - int saveStartBytePos = currentFrame.startBytePos; - int saveSuffix = currentFrame.suffix; - long saveLastSubFP = currentFrame.lastSubFP; - int saveTermBlockOrd = currentFrame.termState.TermBlockOrd; - - bool isSubBlock = currentFrame.Next(); - - //if (DEBUG) System.out.println(" cycle ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff)); - term.Length = currentFrame.prefix + currentFrame.suffix; - if (term.Bytes.Length < term.Length) - { - term.Bytes = ArrayUtil.Grow(term.Bytes, term.Length); - } - System.Buffer.BlockCopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.Bytes, currentFrame.prefix, currentFrame.suffix); - - if (isSubBlock && StringHelper.StartsWith(target, term)) - { - // Recurse - currentFrame = PushFrame(GetState()); - break; - } - else - { - int cmp = term.CompareTo(target); - if (cmp < 0) - { - if (currentFrame.nextEnt == currentFrame.entCount) - { - if (!currentFrame.isLastInFloor) - { - //if (DEBUG) System.out.println(" load floorBlock"); - currentFrame.LoadNextFloorBlock(); - continue; - } - else - { - //if (DEBUG) System.out.println(" return term=" + brToString(term)); - return; - } - } - continue; - } - else if (cmp == 0) - { - //if (DEBUG) System.out.println(" return term=" + brToString(term)); - return; - } - else - { - // Fallback to prior entry: the semantics of - // this method is that the first call to - // next() will return the term after the - // requested term - currentFrame.nextEnt--; - currentFrame.lastSubFP = saveLastSubFP; - currentFrame.startBytePos = saveStartBytePos; - currentFrame.suffix = saveSuffix; - currentFrame.suffixesReader.Position = savePos; - currentFrame.termState.TermBlockOrd = saveTermBlockOrd; - System.Buffer.BlockCopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.Bytes, currentFrame.prefix, currentFrame.suffix); - term.Length = currentFrame.prefix + currentFrame.suffix; - // If the last entry was a block we don't - // need to bother recursing and pushing to - // the last term under it because the first - // next() will simply skip the frame anyway - return; - } - } - } - } - - Debug.Assert(false); - } - - public override BytesRef Next() - { - // if (DEBUG) { - // System.out.println("\nintEnum.next seg=" + segment); - // System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? 
"n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); - // } - - while (true) - { - // Pop finished frames - while (currentFrame.nextEnt == currentFrame.entCount) - { - if (!currentFrame.isLastInFloor) - { - //if (DEBUG) System.out.println(" next-floor-block"); - currentFrame.LoadNextFloorBlock(); - //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); - } - else - { - //if (DEBUG) System.out.println(" pop frame"); - if (currentFrame.ord == 0) - { - return null; - } - long lastFP = currentFrame.fpOrig; - currentFrame = stack[currentFrame.ord - 1]; - Debug.Assert(currentFrame.lastSubFP == lastFP); - //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); - } - } - - bool isSubBlock = currentFrame.Next(); - // if (DEBUG) { - // final BytesRef suffixRef = new BytesRef(); - // suffixRef.bytes = currentFrame.suffixBytes; - // suffixRef.offset = currentFrame.startBytePos; - // suffixRef.length = currentFrame.suffix; - // System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef)); - // } - - if (currentFrame.suffix != 0) - { - int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff; - while (label > currentFrame.curTransitionMax) - { - if (currentFrame.transitionIndex >= currentFrame.transitions.Length - 1) - { - // Stop processing this frame -- no further - // matches are possible because we've moved - // beyond what the max transition will allow - //if (DEBUG) System.out.println(" break: trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex])); - - // sneaky! 
forces a pop above - currentFrame.isLastInFloor = true; - currentFrame.nextEnt = currentFrame.entCount; - goto nextTermContinue; - } - currentFrame.transitionIndex++; - currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].Max; - //if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]); - } - } - - // First test the common suffix, if set: - if (compiledAutomaton.CommonSuffixRef != null && !isSubBlock) - { - int termLen = currentFrame.prefix + currentFrame.suffix; - if (termLen < compiledAutomaton.CommonSuffixRef.Length) - { - // No match - // if (DEBUG) { - // System.out.println(" skip: common suffix length"); - // } - goto nextTermContinue; - } - - byte[] suffixBytes = currentFrame.suffixBytes; - byte[] commonSuffixBytes = compiledAutomaton.CommonSuffixRef.Bytes; - - int lenInPrefix = compiledAutomaton.CommonSuffixRef.Length - currentFrame.suffix; - Debug.Assert(compiledAutomaton.CommonSuffixRef.Offset == 0); - int suffixBytesPos; - int commonSuffixBytesPos = 0; - - if (lenInPrefix > 0) - { - // A prefix of the common suffix overlaps with - // the suffix of the block prefix so we first - // test whether the prefix part matches: - byte[] termBytes = term.Bytes; - int termBytesPos = currentFrame.prefix - lenInPrefix; - Debug.Assert(termBytesPos >= 0); - int termBytesPosEnd = currentFrame.prefix; - while (termBytesPos < termBytesPosEnd) - { - if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) - { - // if (DEBUG) { - // System.out.println(" skip: common suffix mismatch (in prefix)"); - // } - goto nextTermContinue; - } - } - suffixBytesPos = currentFrame.startBytePos; - } - else - { - suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.CommonSuffixRef.Length; - } - - // Test overlapping suffix part: - int commonSuffixBytesPosEnd = compiledAutomaton.CommonSuffixRef.Length; - while (commonSuffixBytesPos < commonSuffixBytesPosEnd) - { - if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) - { - // if (DEBUG) { - // System.out.println(" skip: common suffix mismatch"); - // } - goto nextTermContinue; - } - } - } - - // TODO: maybe we should do the same linear test - // that AutomatonTermsEnum does, so that if we - // reach a part of the automaton where .* is - // "temporarily" accepted, we just blindly .next() - // until the limit - - // See if the term prefix matches the automaton: - int state = currentFrame.state; - for (int idx = 0; idx < currentFrame.suffix; idx++) - { - state = runAutomaton.Step(state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff); - if (state == -1) - { - // No match - //System.out.println(" no s=" + state); - goto nextTermContinue; - } - else - { - //System.out.println(" c s=" + state); - } - } - - if (isSubBlock) - { - // Match! Recurse: - //if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP); - CopyTerm(); - currentFrame = PushFrame(state); - //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? 
"n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); - } - else if (runAutomaton.IsAccept(state)) - { - CopyTerm(); - //if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" + brToString(term)); - if (!(savedStartTerm == null || term.CompareTo(savedStartTerm) > 0)) - { - Debug.Assert(false, "saveStartTerm=" + savedStartTerm.Utf8ToString() + " term=" + term.Utf8ToString()); - } - return term; - } - else - { - //System.out.println(" no s=" + state); - } - nextTermContinue: ; - } - //nextTermBreak:; - } - - internal void CopyTerm() - { - int len = currentFrame.prefix + currentFrame.suffix; - if (term.Bytes.Length < len) - { - term.Bytes = ArrayUtil.Grow(term.Bytes, len); - } - System.Buffer.BlockCopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.Bytes, currentFrame.prefix, currentFrame.suffix); - term.Length = len; - } - - public override IComparer Comparer - { - get - { - return BytesRef.UTF8SortedAsUnicodeComparer; - } - } - - public override bool SeekExact(BytesRef text) - { - throw new System.NotSupportedException(); - } - - public override void SeekExact(long ord) - { - throw new System.NotSupportedException(); - } - - public override long Ord - { - get { throw new System.NotSupportedException(); } - } - - public override SeekStatus SeekCeil(BytesRef text) - { - throw new System.NotSupportedException(); - } - } - - // Iterates through terms in this field - internal sealed class SegmentTermsEnum : TermsEnum - { - private readonly BlockTreeTermsReader.FieldReader outerInstance; - - private IndexInput @in; - - private Frame[] stack; - private readonly Frame staticFrame; - private Frame currentFrame; - private bool termExists; - - private int targetBeforeCurrentLength; - - private readonly ByteArrayDataInput scratchReader = new ByteArrayDataInput(); - - // What prefix of the current term was present in the index: - private int validIndexPrefix; - - // assert only: - private bool eof; - - internal readonly BytesRef term = new BytesRef(); - private readonly FST.BytesReader fstReader; - - private FST.Arc[] arcs = new FST.Arc[1]; - - public SegmentTermsEnum(BlockTreeTermsReader.FieldReader outerInstance) - { - this.outerInstance = outerInstance; - //if (DEBUG) System.out.println("BTTR.init seg=" + segment); - stack = new Frame[0]; - - // Used to hold seek by TermState, or cached seek - staticFrame = new Frame(this, -1); - - if (outerInstance.index == null) - { - fstReader = null; - } - else - { - fstReader = this.outerInstance.index.GetBytesReader(); - } - - // Init w/ root block; don't use index since it may - // not (and need not) have been loaded - for (int arcIdx = 0; arcIdx < arcs.Length; arcIdx++) - { - arcs[arcIdx] = new FST.Arc(); - } - - currentFrame = staticFrame; - FST.Arc arc; - if (outerInstance.index != null) - { - arc = outerInstance.index.GetFirstArc(arcs[0]); - // Empty string prefix must have an output in the index! 
- Debug.Assert(arc.IsFinal); - } - else - { - arc = null; - } - currentFrame = staticFrame; - //currentFrame = pushFrame(arc, rootCode, 0); - //currentFrame.loadBlock(); - validIndexPrefix = 0; - // if (DEBUG) { - // System.out.println("init frame state " + currentFrame.ord); - // printSeekState(); - // } - - //System.out.println(); - // computeBlockStats().print(System.out); - } - - // Not private to avoid synthetic access$NNN methods - internal void InitIndexInput() - { - if (this.@in == null) - { - this.@in = (IndexInput)outerInstance.outerInstance.@in.Clone(); - } - } - - /// - /// Runs next() through the entire terms dict, - /// computing aggregate statistics. - /// - public Stats ComputeBlockStats() - { - Stats stats = new Stats(outerInstance.outerInstance.segment, outerInstance.fieldInfo.Name); - if (outerInstance.index != null) - { - stats.IndexNodeCount = outerInstance.index.NodeCount; - stats.IndexArcCount = outerInstance.index.ArcCount; - stats.IndexNumBytes = outerInstance.index.GetSizeInBytes(); - } - - currentFrame = staticFrame; - FST.Arc arc; - if (outerInstance.index != null) - { - arc = outerInstance.index.GetFirstArc(arcs[0]); - // Empty string prefix must have an output in the index! - Debug.Assert(arc.IsFinal); - } - else - { - arc = null; - } - - // Empty string prefix must have an output in the - // index! - currentFrame = PushFrame(arc, outerInstance.rootCode, 0); - currentFrame.fpOrig = currentFrame.fp; - currentFrame.LoadBlock(); - validIndexPrefix = 0; - - stats.StartBlock(currentFrame, !(currentFrame.isLastInFloor)); - - while (true) - { - // Pop finished blocks - while (currentFrame.nextEnt == currentFrame.entCount) - { - stats.EndBlock(currentFrame); - if (!currentFrame.isLastInFloor) - { - currentFrame.LoadNextFloorBlock(); - stats.StartBlock(currentFrame, true); - } - else - { - if (currentFrame.ord == 0) - { - goto allTermsBreak; - } - long lastFP = currentFrame.fpOrig; - currentFrame = stack[currentFrame.ord - 1]; - Debug.Assert(lastFP == currentFrame.lastSubFP); - // if (DEBUG) { - // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); - // } - } - } - - while (true) - { - if (currentFrame.Next()) - { - // Push to new block: - currentFrame = PushFrame(null, currentFrame.lastSubFP, term.Length); - currentFrame.fpOrig = currentFrame.fp; - // this is a "next" frame -- even if it's - // floor'd we must pretend it isn't so we don't - // try to scan to the right floor frame: - currentFrame.isFloor = false; - //currentFrame.hasTerms = true; - currentFrame.LoadBlock(); - stats.StartBlock(currentFrame, !currentFrame.isLastInFloor); - } - else - { - stats.Term(term); - break; - } - } - //allTermsContinue:; - } - allTermsBreak: - - stats.Finish(); - - // Put root frame back: - currentFrame = staticFrame; - if (outerInstance.index != null) - { - arc = outerInstance.index.GetFirstArc(arcs[0]); - // Empty string prefix must have an output in the index! 
- Debug.Assert(arc.IsFinal); - } - else - { - arc = null; - } - currentFrame = PushFrame(arc, outerInstance.rootCode, 0); - currentFrame.Rewind(); - currentFrame.LoadBlock(); - validIndexPrefix = 0; - term.Length = 0; - - return stats; - } - - private Frame GetFrame(int ord) - { - if (ord >= stack.Length) - { - Frame[] next = new Frame[ArrayUtil.Oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - Array.Copy(stack, 0, next, 0, stack.Length); - for (int stackOrd = stack.Length; stackOrd < next.Length; stackOrd++) - { - next[stackOrd] = new Frame(this, stackOrd); - } - stack = next; - } - Debug.Assert(stack[ord].ord == ord); - return stack[ord]; - } - - private FST.Arc GetArc(int ord) - { - if (ord >= arcs.Length) - { - FST.Arc[] next = new FST.Arc[ArrayUtil.Oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - Array.Copy(arcs, 0, next, 0, arcs.Length); - for (int arcOrd = arcs.Length; arcOrd < next.Length; arcOrd++) - { - next[arcOrd] = new FST.Arc(); - } - arcs = next; - } - return arcs[ord]; - } - - public override IComparer Comparer - { - get - { - return BytesRef.UTF8SortedAsUnicodeComparer; - } - } - - // Pushes a frame we seek'd to - internal Frame PushFrame(FST.Arc arc, BytesRef frameData, int length) - { - scratchReader.Reset(frameData.Bytes, frameData.Offset, frameData.Length); - long code = scratchReader.ReadVInt64(); - long fpSeek = (long)((ulong)code >> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS); - Frame f = GetFrame(1 + currentFrame.ord); - f.hasTerms = (code & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; - f.hasTermsOrig = f.hasTerms; - f.isFloor = (code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; - if (f.isFloor) - { - f.SetFloorData(scratchReader, frameData); - } - PushFrame(arc, fpSeek, length); - - return f; - } - - // Pushes next'd frame or seek'd frame; we later - // lazy-load the frame only when needed - internal Frame PushFrame(FST.Arc arc, long fp, int length) - { - Frame f = GetFrame(1 + currentFrame.ord); - f.arc = arc; - if (f.fpOrig == fp && f.nextEnt != -1) - { - //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); - if (f.prefix > targetBeforeCurrentLength) - { - f.Rewind(); - } - else - { - // if (DEBUG) { - // System.out.println(" skip rewind!"); - // } - } - Debug.Assert(length == f.prefix); - } - else - { - f.nextEnt = -1; - f.prefix = length; - f.state.TermBlockOrd = 0; - f.fpOrig = f.fp = fp; - f.lastSubFP = -1; - // if (DEBUG) { - // final int sav = term.length; - // term.length = length; - // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); - // term.length = sav; - // } - } - - return f; - } - - // asserts only - private bool ClearEOF() - { - eof = false; - return true; - } - - // asserts only - private bool SetEOF() - { - eof = true; - return true; - } - - public override bool SeekExact(BytesRef target) - { - if (outerInstance.index == null) - { - throw new InvalidOperationException("terms index was not loaded"); - } - - if (term.Bytes.Length <= target.Length) - { - term.Bytes = ArrayUtil.Grow(term.Bytes, 1 + target.Length); - } - - Debug.Assert(ClearEOF()); - - FST.Arc arc; - int targetUpto; - BytesRef output; - - targetBeforeCurrentLength = currentFrame.ord; - - if 
(currentFrame != staticFrame) - { - // We are already seek'd; find the common - // prefix of new seek term vs current term and - // re-use the corresponding seek state. For - // example, if app first seeks to foobar, then - // seeks to foobaz, we can re-use the seek state - // for the first 5 bytes. - - // if (DEBUG) { - // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); - // } - - arc = arcs[0]; - Debug.Assert(arc.IsFinal); - output = arc.Output; - targetUpto = 0; - - Frame lastFrame = stack[0]; - Debug.Assert(validIndexPrefix <= term.Length); - - int targetLimit = Math.Min(target.Length, validIndexPrefix); - - int cmp = 0; - - // TODO: reverse vLong byte order for better FST - // prefix output sharing - - // First compare up to valid seek frames: - while (targetUpto < targetLimit) - { - cmp = (term.Bytes[targetUpto] & 0xFF) - (target.Bytes[target.Offset + targetUpto] & 0xFF); - // if (DEBUG) { - // System.out.println(" cyc