lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r479466 [4/8] - /lucene/java/trunk/docs/
Date Mon, 27 Nov 2006 00:03:14 GMT
Added: lucene/java/trunk/docs/fileformats.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/docs/fileformats.html?view=auto&rev=479466
==============================================================================
--- lucene/java/trunk/docs/fileformats.html (added)
+++ lucene/java/trunk/docs/fileformats.html Sun Nov 26 16:03:13 2006
@@ -0,0 +1,1936 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.7">
+<meta name="Forrest-skin-name" content="pelt">
+<title>
+Apache Lucene - Index File Formats
+		</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://lucene.apache.org/">Lucene</a> &gt; <a href="http://lucene.apache.org/java/">Java</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<div class="header">
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/asf-logo.gif" title="Apache Lucene"></a>
+</div>
+<div class="projectlogo">
+<a href="http://lucene.apache.org/java/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/images/lucene_green_300.gif" title="Apache Lucene is a high-performance, full-featured text search engine library written entirely in
+      Java. It is a technology suitable for nearly any application that requires full-text search, especially cross-platform."></a>
+</div>
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="lucene.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<ul id="tabs">
+<li class="current">
+<a class="base-selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="base-not-selected" href="http://wiki.apache.org/jakarta-lucene">Wiki</a>
+</li>
+</ul>
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<div id="level2tabs"></div>
+<script type="text/javascript"><!--
+document.write("<text>Last Published:</text> " + document.lastModified);
+//  --></script>
+</div>
+<div class="breadtrail">
+             
+             &nbsp;
+           </div>
+<div id="menu">
+<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">About</div>
+<div id="menu_1.1" class="menuitemgroup">
+<div class="menuitem">
+<a href="index.html" title="Welcome to Java Lucene">Overview</a>
+</div>
+<div class="menuitem">
+<a href="features.html">Features</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/jakarta-lucene/PoweredBy">Powered by Lucene</a>
+</div>
+<div class="menuitem">
+<a href="whoweare.html">Who We Are</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_selected_1.2', 'skin/')" id="menu_selected_1.2Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Documentation</div>
+<div id="menu_selected_1.2" class="selectedmenuitemgroup" style="display: block;">
+<div class="menuitem">
+<a href="api/">API Docs</a>
+</div>
+<div class="menuitem">
+<a href="benchmarks.html">Benchmarks</a>
+</div>
+<div class="menuitem">
+<a href="contributions.html">Contributions</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/jakarta-lucene/LuceneFAQ">FAQ</a>
+</div>
+<div class="menupage">
+<div class="menupagetitle">File Formats</div>
+</div>
+<div class="menuitem">
+<a href="gettingstarted.html">Getting Started</a>
+</div>
+<div class="menuitem">
+<a href="lucene-sandbox/index.html">Lucene Sandbox</a>
+</div>
+<div class="menuitem">
+<a href="queryparsersyntax.html">Query Syntax</a>
+</div>
+<div class="menuitem">
+<a href="scoring.html">Scoring</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/jakarta-lucene">Wiki</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://issues.apache.org/jira/browse/LUCENE">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="mailinglists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="releases.html">Downloads</a>
+</div>
+<div class="menuitem">
+<a href="http://svn.apache.org/viewcvs.cgi/lucene/java/">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Site Versions</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="./">Official</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Related Projects</div>
+<div id="menu_1.5" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org">Lucene (Top-Level)</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/hadoop/">Hadoop</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/lucy/">Lucy</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/projects/lucene.net.html">Lucene.NET</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/solr/">SOLR</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<div id="credit2"></div>
+</div>
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="fileformats.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>
+Apache Lucene - Index File Formats
+		</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#Index File Formats">Index File Formats</a>
+</li>
+<li>
+<a href="#Definitions">Definitions</a>
+<ul class="minitoc">
+<li>
+<a href="#Inverted Indexing">Inverted Indexing</a>
+</li>
+<li>
+<a href="#Types of Fields">Types of Fields</a>
+</li>
+<li>
+<a href="#Segments">Segments</a>
+</li>
+<li>
+<a href="#Document Numbers">Document Numbers</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Overview">Overview</a>
+</li>
+<li>
+<a href="#File Naming">File Naming</a>
+</li>
+<li>
+<a href="#Primitive Types">Primitive Types</a>
+<ul class="minitoc">
+<li>
+<a href="#Byte">Byte</a>
+</li>
+<li>
+<a href="#UInt32">UInt32</a>
+</li>
+<li>
+<a href="#Uint64">UInt64</a>
+</li>
+<li>
+<a href="#VInt">VInt</a>
+</li>
+<li>
+<a href="#Chars">Chars</a>
+</li>
+<li>
+<a href="#String">String</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Per-Index Files">Per-Index Files</a>
+<ul class="minitoc">
+<li>
+<a href="#Segments File">Segments File</a>
+</li>
+<li>
+<a href="#Lock Files">Lock Files</a>
+</li>
+<li>
+<a href="#Deletable File">Deletable File</a>
+</li>
+<li>
+<a href="#Compound Files">Compound Files</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Per-Segment Files">Per-Segment Files</a>
+<ul class="minitoc">
+<li>
+<a href="#Fields">Fields</a>
+</li>
+<li>
+<a href="#Term Dictionary">Term Dictionary</a>
+</li>
+<li>
+<a href="#Frequencies">Frequencies</a>
+</li>
+<li>
+<a href="#Positions">Positions</a>
+</li>
+<li>
+<a href="#Normalization Factors">Normalization Factors</a>
+</li>
+<li>
+<a href="#Term Vectors">Term Vectors</a>
+</li>
+<li>
+<a href="#Deleted Documents">Deleted Documents</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Limitations">Limitations</a>
+</li>
+</ul>
+</div>
+        
+<a name="N10016"></a><a name="Index File Formats"></a>
+<h2 class="boxed">Index File Formats</h2>
+<div class="section">
+<p>
+                This document defines the index file formats used
+                in Lucene version 2.0.  If you are using a different
+		version of Lucene, please consult the copy of
+		<span class="codefrag">docs/fileformats.html</span> that was distributed
+		with the version you are using.
+            </p>
+<p>
+                Apache Lucene is written in Java, but several
+                efforts are underway to write
+                <a href="http://wiki.apache.org/jakarta-lucene/LuceneImplementations">versions
+                of Lucene in other programming
+                languages</a>.  If these versions are to remain compatible with Apache
+                Lucene, then a language-independent definition of the Lucene index
+                format is required.  This document thus attempts to provide a
+                complete and independent definition of the Apache Lucene 2.0 file
+                formats.
+            </p>
+<p>
+                As Lucene evolves, this document should evolve.
+                Versions of Lucene in different programming languages should endeavor
+                to agree on file formats, and generate new versions of this document.
+            </p>
+<p>
+                Compatibility notes are provided in this document,
+                describing how file formats have changed from prior versions.
+            </p>
+</div>
+
+        
+<a name="N10030"></a><a name="Definitions"></a>
+<h2 class="boxed">Definitions</h2>
+<div class="section">
+<p>
+                The fundamental concepts in Lucene are index,
+                document, field and term.
+            </p>
+<p>
+                An index contains a sequence of documents.
+            </p>
+<ul>
+                
+<li>
+                    
+<p>
+                        A document is a sequence of fields.
+                    </p>
+                
+</li>
+
+                
+<li>
+                    
+<p>
+                        A field is a named sequence of terms.
+                    </p>
+                
+</li>
+
+                
+<li>
+                    A term is a string.
+                </li>
+            
+</ul>
+<p>
+                The same string in two different fields is
+                considered a different term.  Thus terms are represented as a pair of
+                strings, the first naming the field, and the second naming text
+                within the field.
+            </p>
+<a name="N10051"></a><a name="Inverted Indexing"></a>
+<h3 class="boxed">Inverted Indexing</h3>
+<p>
+                    The index stores statistics about terms in order
+                    to make term-based search more efficient.  Lucene's
+                    index falls into the family of indexes known as an <i>inverted
+                        index.</i> This is because it can list, for a term, the documents that contain
+                    it.  This is the inverse of the natural relationship, in which
+                    documents list terms.
+                </p>
+<a name="N1005E"></a><a name="Types of Fields"></a>
+<h3 class="boxed">Types of Fields</h3>
+<p>
+                    In Lucene, fields may be <i>stored</i>, in which
+                    case their text is stored in the index literally, in a non-inverted
+                    manner.  Fields that are inverted are called <i>indexed</i>. A field
+                    may be both stored and indexed.</p>
+<p>The text of a field may be <i>tokenized</i> into terms to be
+                    indexed, or the text of a field may be used literally as a term to be indexed.
+                    Most fields are
+                    tokenized, but sometimes it is useful for certain identifier fields
+                    to be indexed literally.
+                </p>
+<p>See the <a href="http://lucene.apache.org/java/docs/api/org/apache/lucene/document/Field.html">Field</a> java docs for more information on Fields.</p>
+<a name="N1007B"></a><a name="Segments"></a>
+<h3 class="boxed">Segments</h3>
+<p>
+                    Lucene indexes may be composed of multiple sub-indexes, or<i>
+                        segments</i>. Each segment is a fully independent index, which could be searched
+                    separately. Indexes evolve by:
+                </p>
+<ol>
+                    
+<li>
+<p>Creating new segments for newly added documents.</p>
+                    
+</li>
+                    
+<li>
+<p>Merging existing segments.</p>
+                    
+</li>
+                
+</ol>
+<p>
+                    Searches may involve multiple segments and/or multiple indexes, each
+                    index potentially composed of a set of segments.
+                </p>
+<a name="N10098"></a><a name="Document Numbers"></a>
+<h3 class="boxed">Document Numbers</h3>
+<p>
+                    Internally, Lucene refers to documents by an integer <i>document
+                        number</i>. The first document added to an index is numbered zero, and each
+                    subsequent document added gets a number one greater than the previous.
+                </p>
+<p>
+                    
+<br>
+                
+</p>
+<p>
+                    Note that a document's number may change, so caution should be taken
+                    when storing these numbers outside of Lucene.  In particular, numbers may
+                    change in the following situations:
+                </p>
+<ul>
+                    
+<li>
+                        
+<p>
+                            The
+                            numbers stored in each segment are unique only within the segment,
+                            and must be converted before they can be used in a larger context.
+                            The standard technique is to allocate each segment a range of
+                            values, based on the range of numbers used in that segment.  To
+                            convert a document number from a segment to an external value, the
+                            segment's <i>base</i> document
+                            number is added.  To convert an external value back to a
+                            segment-specific value, the  segment is identified by the range that
+                            the external value is in, and the segment's base value is
+                            subtracted.  For example, two five-document segments might be
+                            combined, so that the first segment has a base value of zero, and
+                            the second of five.  Document three from the second segment would
+                            have an external value of eight.
+                        </p>
+                    
+</li>
+                    
+<li>
+                        
+<p>
+                            When documents are deleted, gaps are created
+                            in the numbering.  These are eventually removed as the index evolves
+                            through merging.  Deleted documents are dropped when segments are
+                            merged.  A freshly-merged segment thus has no gaps in its numbering.
+                        </p>
+                    
+</li>
+                
+</ul>
+</div>
+
+        
+<a name="N100C0"></a><a name="Overview"></a>
+<h2 class="boxed">Overview</h2>
+<div class="section">
+<p>
+                Each segment index maintains the following:
+            </p>
+<ul>
+                
+<li>
+<p>Field names.  This
+                        contains the set of field names used in the index.
+
+                    </p>
+                
+</li>
+                
+<li>
+<p>Stored Field
+                        values.  This contains, for each document, a list of attribute-value
+                        pairs, where the attributes are field names.  These are used to
+                        store auxiliary information about the document, such as its title,
+                        url, or an identifier to access a
+                        database. The set of stored fields is what is returned for each hit
+                        when searching.  This is keyed by document number.
+                    </p>
+                
+</li>
+                
+<li>
+<p>Term dictionary.
+                        A dictionary containing all of the terms used in all of the indexed
+                        fields of all of the documents.  The dictionary also contains the
+                        number of documents which contain the term, and pointers to the
+                        term's frequency and proximity data.
+                    </p>
+                
+</li>
+
+                
+<li>
+<p>Term Frequency
+                        data.  For each term in the dictionary, the numbers of all the
+                        documents that contain that term, and the frequency of the term in
+                        each of those documents.
+                    </p>
+                
+</li>
+
+                
+<li>
+<p>Term Proximity
+                        data.  For each term in the dictionary, the positions at which the term
+                        occurs in each document.
+                    </p>
+                
+</li>
+
+                
+<li>
+<p>Normalization
+                        factors.  For each field in each document, a value is stored that is
+                        multiplied into the score for hits on that field.
+                    </p>
+                
+</li>
+                
+<li>
+<p>Term Vectors.  For each field in each document, the term vector
+                       (sometimes called document vector) may be stored.  A term vector consists
+                       of term text and term frequency.  To add Term Vectors to your index see the
+                    <a href="http://lucene.apache.org/java/docs/api/org/apache/lucene/document/Field.html">Field</a> constructors.
+                    </p>
+                
+</li>              
+                
+<li>
+<p>Deleted documents.
+                        An optional file indicating which documents are deleted.
+                    </p>
+                
+</li>
+            
+</ul>
+<p>Details on each of these are provided in subsequent sections.
+            </p>
+</div>
+
+        
+<a name="N100FC"></a><a name="File Naming"></a>
+<h2 class="boxed">File Naming</h2>
+<div class="section">
+<p>
+                All files belonging to a segment have the same name with varying
+                extensions.  The extensions correspond to the different file formats
+                described below. When using the Compound File format (default in 1.4 and greater) these files are
+                collapsed into a single .cfs file (see below for details).
+            </p>
+<p>
+                Typically, all segments
+                in an index are stored in a single directory, although this is not
+                required.
+            </p>
+</div>
+
+        
+<a name="N10109"></a><a name="Primitive Types"></a>
+<h2 class="boxed">Primitive Types</h2>
+<div class="section">
+<a name="N1010F"></a><a name="Byte"></a>
+<h3 class="boxed">Byte</h3>
+<p>
+                    The most primitive type
+                    is an eight-bit byte.  Files are accessed as sequences of bytes.  All
+                    other data types are defined as sequences
+                    of bytes, so file formats are byte-order independent.
+                </p>
+<a name="N10119"></a><a name="UInt32"></a>
+<h3 class="boxed">UInt32</h3>
+<p>
+                    32-bit unsigned integers are written as four
+                    bytes, high-order bytes first.
+                </p>
+<p>
+                    UInt32    --&gt; &lt;Byte&gt;<sup>4</sup>
+                
+</p>
+<a name="N10129"></a><a name="Uint64"></a>
+<h3 class="boxed">UInt64</h3>
+<p>
+                    64-bit unsigned integers are written as eight
+                    bytes, high-order bytes first.
+                </p>
+<p>UInt64    --&gt; &lt;Byte&gt;<sup>8</sup>
+                
+</p>
+<a name="N10139"></a><a name="VInt"></a>
+<h3 class="boxed">VInt</h3>
+<p>
+                    A variable-length format for non-negative integers is
+                    defined where the high-order bit of each byte indicates whether more
+                    bytes remain to be read.  The low-order seven bits are appended as
+                    increasingly more significant bits in the resulting integer value.
+                    Thus values from zero to 127 may be stored in a single byte, values
+                    from 128 to 16,383 may be stored in two bytes, and so on.
+                </p>
+<p>
+<b>VInt Encoding Example</b>
+</p>
+<table class="ForrestTable" cellspacing="0" cellpadding="4" border="0">
+                    
+<col width="64*">
+                    
+<col width="64*">
+                    
+<col width="64*">
+                    
+<col width="64*">
+                    
+<tr valign="TOP">
+                        
+<td width="25%">
+                            
+<p align="RIGHT">
+<b>Value</b>
+                            
+</p>
+                        
+</td>
+                        <td width="25%">
+                            
+<p align="RIGHT">
+<b>First byte</b>
+                            
+</p>
+                        
+</td>
+                        <td width="25%">
+                            
+<p align="RIGHT">
+<b>Second byte</b>
+                            
+</p>
+                        
+</td>
+                        <td width="25%">
+                            
+<p align="RIGHT">
+<b>Third byte</b>
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="0" width="25%">
+                            
+<p align="RIGHT">0
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="0" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                00000000
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.07cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="1" width="25%">
+                            
+<p align="RIGHT">1
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                00000001
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.07cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="2" width="25%">
+                            
+<p align="RIGHT">2
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                00000010
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.07cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr>
+                        
+<td valign="TOP" width="25%">
+                            
+<p align="RIGHT">...
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" style="margin-left: 0.11cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.07cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="127" width="25%">
+                            
+<p align="RIGHT">127
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1111111" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                01111111
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.07cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="128" width="25%">
+                            
+<p align="RIGHT">128
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                10000000
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.07cm;                                margin-right: 0.01cm">
+                                00000001
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="129" width="25%">
+                            
+<p align="RIGHT">129
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10000001" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                10000001
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.07cm;                                margin-right: 0.01cm">
+                                00000001
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="130" width="25%">
+                            
+<p align="RIGHT">130
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10000010" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                10000010
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.07cm;                                margin-right: 0.01cm">
+                                00000001
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr>
+                        
+<td valign="TOP" width="25%">
+                            
+<p align="RIGHT">...
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" style="margin-left: 0.11cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.07cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="16383" width="25%">
+                            
+<p align="RIGHT">16,383
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="11111111" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                11111111
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1111111" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.07cm;                                margin-right: 0.01cm">
+                                01111111
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" width="25%">
+                            
+<p align="RIGHT" style="margin-left: -0.47cm; margin-right:                                0.01cm">
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="16384" width="25%">
+                            
+<p align="RIGHT">16,384
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                10000000
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.07cm;                                margin-right: 0.01cm">
+                                10000000
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.47cm;                                margin-right: 0.01cm">
+                                00000001
+                            </p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr valign="BOTTOM">
+                        
+<td sdnum="1033;0;#,##0" sdval="16385" width="25%">
+                            
+<p align="RIGHT">16,385
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10000001" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                10000001
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.07cm;                                margin-right: 0.01cm">
+                                10000000
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" sdval="1" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.47cm;                                margin-right: 0.01cm">
+                                00000001
+                            </p>
+                        
+</td>
+                    
+</tr>
+                    
+<tr>
+                        
+<td valign="TOP" width="25%">
+                            
+<p align="RIGHT">...
+                            </p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: 0.11cm;                                margin-right: 0.01cm">
+                                
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.07cm;                                margin-right: 0.01cm">
+                                
+<br>
+
+                            
+</p>
+                        
+</td>
+                        <td sdnum="1033;0;00000000" valign="BOTTOM" width="25%">
+                            
+<p align="RIGHT" class="western" style="margin-left: -0.47cm;                                margin-right: 0.01cm">
+                                
+<br>
+
+                            
+</p>
+                        
+</td>
+                    
+</tr>
+                
+</table>
+<p>
+                    This provides compression while still being
+                    efficient to decode.
+                </p>
+<a name="N10407"></a><a name="Chars"></a>
+<h3 class="boxed">Chars</h3>
+<p>
+                    Lucene writes Unicode
+                    character sequences using Java's
+                    <a href="http://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8">"modified
+                    UTF-8 encoding"</a>.
+                </p>
+<a name="N10415"></a><a name="String"></a>
+<h3 class="boxed">String</h3>
+<p>
+                    Lucene writes strings as a VInt representing the length, followed by
+                    the character data.
+                </p>
+<p>
+                    String --&gt; VInt, Chars
+                </p>
+</div>
+
+        
+<a name="N10423"></a><a name="Per-Index Files"></a>
+<h2 class="boxed">Per-Index Files</h2>
+<div class="section">
+<p>
+                The files in this section exist one-per-index.
+            </p>
+<a name="N1042C"></a><a name="Segments File"></a>
+<h3 class="boxed">Segments File</h3>
+<p>
+                    The active segments in the index are stored in the
+                    segment info file.  An index only has
+                    a single file in this format, and it is named "segments".
+                    This lists each segment by name, and also contains the size of each
+                    segment.
+                </p>
+<p>
+                    Segments    --&gt; Format, Version, NameCounter, SegCount, &lt;SegName, SegSize&gt;<sup>SegCount</sup>
+                
+</p>
+<p>
+                    Format, NameCounter, SegCount, SegSize    --&gt; UInt32
+                </p>
+<p>
+                    Version --&gt; UInt64
+                </p>
+<p>
+                    SegName    --&gt; String
+                </p>
+<p>
+                    Format is -1 in Lucene 1.4.
+                </p>
+<p>
+                    Version counts how often the index has been
+                    changed by adding or deleting documents.
+                </p>
+<p>
+                    NameCounter is used to generate names for new segment files.
+                </p>
+<p>
+                    SegName is the name of the segment, and is used as the file name prefix
+                    for all of the files that compose the segment's index.
+                </p>
+<p>
+                    SegSize is the number of documents contained in the segment index.
+                </p>
+<a name="N10454"></a><a name="Lock Files"></a>
+<h3 class="boxed">Lock Files</h3>
+<p>
+                    Several files are used to indicate that another
+                    process is using an index.  Note that these files are not
+                    stored in the index directory itself, but rather in the
+                    system's temporary directory, as indicated in the Java
+                    system property "java.io.tmpdir".
+                </p>
+<ul>
+                    
+<li>
+                        
+<p>
+                            When a file named "commit.lock"
+                            is present, a process is currently re-writing the "segments"
+                            file and deleting outdated segment index files, or a process is
+                            reading the "segments"
+                            file and opening the files of the segments it names.  This lock file
+                            prevents files from being deleted by another process after a process
+                            has read the "segments"
+                            file but before it has managed to open all of the files of the
+                            segments named therein.
+                        </p>
+                    
+</li>
+
+                    
+<li>
+                        
+<p>
+                            When a file named "write.lock"
+                            is present, a process is currently adding documents to an index, or
+                            removing files from that index.  This lock file prevents several
+                            processes from attempting to modify an index at the same time.
+                        </p>
+                    
+</li>
+                
+</ul>
+<a name="N1046D"></a><a name="Deletable File"></a>
+<h3 class="boxed">Deletable File</h3>
+<p>
+                    A file named "deletable"
+                    contains the names of files that are no longer used by the index, but
+                    which could not be deleted.  This is only used on Win32, where a
+                    file may not be deleted while it is still open. On other platforms
+                    the file contains only null bytes.
+                </p>
+<p>
+                    Deletable    --&gt; DeletableCount,
+                    &lt;DeletableName&gt;<sup>DeletableCount</sup>
+                
+</p>
+<p>DeletableCount    --&gt; UInt32
+                </p>
+<p>DeletableName    --&gt;
+                    String
+                </p>
+<a name="N10483"></a><a name="Compound Files"></a>
+<h3 class="boxed">Compound Files</h3>
+<p>Starting with Lucene 1.4 the compound file format became default. This
+            	is simply a container for all files described in the next section.</p>
+<p>Compound (.cfs) --&gt; FileCount, &lt;DataOffset, FileName&gt;<sup>FileCount</sup>,
+            		FileData<sup>FileCount</sup>
+</p>
+<p>FileCount --&gt; VInt</p>
+<p>DataOffset --&gt; Long</p>
+<p>FileName --&gt; String</p>
+<p>FileData --&gt; raw file data</p>
+<p>The raw file data is the data from the individual files named above.</p>
+</div>
+
+        
+<a name="N104A5"></a><a name="Per-Segment Files"></a>
+<h2 class="boxed">Per-Segment Files</h2>
+<div class="section">
+<p>
+                The remaining files are all per-segment, and are
+                thus defined by suffix.
+            </p>
+<a name="N104AE"></a><a name="Fields"></a>
+<h3 class="boxed">Fields</h3>
+<p>
+<br>
+<b>Field Info</b>
+<br>
+</p>
+<p>
+                    Field names are
+                    stored in the field info file, with suffix .fnm.
+                </p>
+<p>
+                    FieldInfos
+                    (.fnm)    --&gt; FieldsCount, &lt;FieldName,
+                    FieldBits&gt;<sup>FieldsCount</sup>
+                
+</p>
+<p>
+                    FieldsCount    --&gt; VInt
+                </p>
+<p>
+                    FieldName    --&gt; String
+                </p>
+<p>
+                    FieldBits    --&gt; Byte
+                </p>
+<ul>
+                    
+<li>
+                    The low-order bit is one for
+                    indexed fields, and zero for non-indexed fields.
+                    </li>
+                    
+<li>
+                    The second lowest-order
+                    bit is one for fields that have term vectors stored, and zero for fields
+                    without term vectors.
+                    </li>
+                    
+</ul>
+<p>
+<b>Lucene &gt;= 1.9:</b>
+</p>
+<ul>
+                    
+<li> If the third lowest-order bit is set (0x04), term positions are stored with the term vectors. </li>
+                    
+<li> If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors. </li>
+                    
+<li> If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field. </li>
+                    
+</ul>
+<p>
+                    Fields are numbered by their order in this file.  Thus field zero is
+                    the
+                    first field in the file, field one the next, and so on.  Note that,
+                    like document numbers, field numbers are segment relative.
+                </p>
+<p>
+<br>
+<b>Stored Fields</b>
+<br>
+</p>
+<p>
+                    Stored fields are represented by two files:
+                </p>
+<ol>
+                    
+<li>
+                        
+<p>
+                            The field index, or .fdx file.
+                        </p>
+
+                        
+<p>
+                            This contains, for each document, a pointer to
+                            its field data, as follows:
+                        </p>
+
+                        
+<p>
+                            FieldIndex
+                            (.fdx)    --&gt;
+                            &lt;FieldValuesPosition&gt;<sup>SegSize</sup>
+                        
+</p>
+                        
+<p>FieldValuesPosition
+                            --&gt; UInt64
+                        </p>
+                        
+<p>This
+                            is used to find the location within the field data file of the
+                            fields of a particular document.  Because it contains fixed-length
+                            data, this file may be easily randomly accessed.  The position of
+                            document<i> n</i>'s<i> </i>field data is the UInt64 at <i>n*8</i> in
+                            this file.
+                        </p>
+                    
+</li>
+                    
+<li>
+                        
+<p>
+                            The field data, or .fdt file.
+
+                        </p>
+
+                        
+<p>
+                            This contains the stored fields of each document,
+                            as follows:
+                        </p>
+
+                        
+<p>
+                            FieldData (.fdt)    --&gt;
+                            &lt;DocFieldData&gt;<sup>SegSize</sup>
+                        
+</p>
+                        
+<p>DocFieldData    --&gt;
+                            FieldCount, &lt;FieldNum, Bits, Value&gt;<sup>FieldCount</sup>
+                        
+</p>
+                        
+<p>FieldCount  --&gt;
+                            VInt
+                        </p>
+                        
+<p>FieldNum    --&gt;
+                            VInt
+                        </p>
+                        
+                        
+<p>
+<b>Lucene &lt;= 1.4:</b>
+</p>
+                        
+<p>Bits        --&gt;
+                            Byte
+                        </p>
+                        
+<p>Value        --&gt;
+                            String
+                        </p>
+                        
+<p>Only the low-order bit of Bits is used.  It is one for
+                            tokenized fields, and zero for non-tokenized fields.
+                        </p>
+                        
+<p>
+<b>Lucene &gt;= 1.9:</b>
+</p>
+                        
+<p>Bits        --&gt;
+                            Byte
+                        </p>
+                        
+<ul>
+                        	
+<li>low order bit is one for tokenized fields</li>
+                        	
+<li>second bit is one for fields containing binary data</li>
+                        	
+<li>third bit is one for fields with compression option enabled
+                        		(if compression is enabled, the algorithm used is ZLIB)</li>
+                        
+</ul>
+                        
+<p>Value        --&gt;
+                            String | BinaryValue (depending on Bits)
+                        </p>
+                        
+<p>BinaryValue        --&gt;
+                            ValueSize, &lt;Byte&gt;<sup>ValueSize</sup>
+                        </p>
+                        
+<p>ValueSize        --&gt;
+                            VInt
+                        </p>
+
+                    
+</li>
+                
+</ol>
+<a name="N1055A"></a><a name="Term Dictionary"></a>
+<h3 class="boxed">Term Dictionary</h3>
+<p>
+                    The term dictionary is represented as two files:
+                </p>
+<ol>
+                    
+<li>
+                        
+<p>
+                            The term infos, or .tis file.
+                        </p>
+
+                        
+<p>
+                            TermInfoFile (.tis)--&gt;
+                            TIVersion, TermCount, IndexInterval, SkipInterval, TermInfos
+                        </p>
+                        
+<p>TIVersion    --&gt;
+                            UInt32
+                        </p>
+                        
+<p>TermCount    --&gt;
+                            UInt64
+                        </p>
+                        
+<p>IndexInterval    --&gt;
+                            UInt32
+                        </p>
+                        
+<p>SkipInterval   --&gt;
+                            UInt32
+                        </p>
+                        
+<p>TermInfos    --&gt;
+                            &lt;TermInfo&gt;<sup>TermCount</sup>
+                        
+</p>
+                        
+<p>TermInfo    --&gt;
+                            &lt;Term, DocFreq, FreqDelta, ProxDelta, SkipDelta&gt;
+                        </p>
+                        
+<p>Term        --&gt;
+                            &lt;PrefixLength, Suffix, FieldNum&gt;
+                        </p>
+                        
+<p>Suffix        --&gt;
+                            String
+                        </p>
+                        
+<p>PrefixLength,
+                            DocFreq, FreqDelta, ProxDelta, SkipDelta<br>        --&gt; VInt
+                        </p>
+                        
+<p>This
+                            file is sorted by Term.  Terms are ordered first lexicographically
+                            by the term's field name, and within that lexicographically by the
+                            term's text.
+                        </p>
+                        
+<p>TIVersion names the version of the format
+                            of this file and is -2 in Lucene 1.4.
+                        </p>
+                        
+<p>Term
+                            text prefixes are shared.  The PrefixLength is the number of initial
+                            characters from the previous term which must be pre-pended to a
+                            term's suffix in order to form the term's text.  Thus, if the
+                            previous term's text was "bone" and the term is "boy",
+                            the PrefixLength is two and the suffix is "y".
+                        </p>
+                        
+<p>FieldNum
+                            determines the term's field, whose name is stored in the .fnm file.
+                        </p>
+                        
+<p>DocFreq
+                            is the count of documents which contain the term.
+                        </p>
+                        
+<p>FreqDelta
+                            determines the position of this term's TermFreqs within the .frq
+                            file.  In particular, it is the difference between the position of
+                            this term's data in that file and the position of the previous
+                            term's data (or zero, for the first term in the file).
+                        </p>
+                        
+<p>ProxDelta
+                            determines the position of this term's TermPositions within the .prx
+                            file.  In particular, it is the difference between the position of
+                            this term's data in that file and the position of the previous
+                            term's data (or zero, for the first term in the file).
+                        </p>
+                        
+<p>SkipDelta determines the position of this
+                            term's SkipData within the .frq file.  In
+                            particular, it is the number of bytes
+                            after TermFreqs that the SkipData starts.
+                            In other words, it is the length of the
+                            TermFreq data.
+                        </p>
+                    
+</li>
+                    
+<li>
+                        
+<p>
+                            The term info index, or .tii file.
+                        </p>
+
+                        
+<p>
+                            This contains every IndexInterval<sup>th</sup> entry from the .tis
+                            file, along with its location in the .tis file.  This is
+                            designed to be read entirely into memory and used to provide random
+                            access to the .tis file.
+                        </p>
+
+                        
+<p>
+                            The structure of this file is very similar to the
+                            .tis file, with the addition of one item per record, the IndexDelta.
+                        </p>
+
+                        
+<p>
+                            TermInfoIndex (.tii)--&gt;
+                            TIVersion, IndexTermCount, IndexInterval, SkipInterval, TermIndices 
+                        </p>
+                        
+<p>TIVersion --&gt;
+                        	UInt32
+                        </p>
+                        
+<p>IndexTermCount    --&gt;
+                            UInt64
+                        </p>
+                        
+<p>IndexInterval --&gt;
+                        	UInt32
+                        </p>
+                        
+<p>SkipInterval --&gt;
+                        	UInt32
+                        </p>
+                        
+<p>TermIndices    --&gt;
+                            &lt;TermInfo, IndexDelta&gt;<sup>IndexTermCount</sup>
+                        
+</p>
+                        
+<p>IndexDelta    --&gt;
+                            VLong
+                        </p>
+                        
+<p>IndexDelta
+                            determines the position of this term's TermInfo within the .tis file.  In
+                            particular, it is the difference between the position of this term's
+                            entry in that file and the position of the previous term's entry.
+                        </p>
+                        
+<p>SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate TermDocs.skipTo(int).
+                            Larger values result in smaller indexes, greater acceleration, but fewer accelerable cases, while
+                            smaller values result in bigger indexes, less acceleration and more
+                            accelerable cases.</p>
+                    
+</li>
+                
+</ol>
+<a name="N105D5"></a><a name="Frequencies"></a>
+<h3 class="boxed">Frequencies</h3>
+<p>
+                    The .frq file contains the lists of documents
+                    which contain each term, along with the frequency of the term in that
+                    document.
+                </p>
+<p>FreqFile (.frq)    --&gt;
+                    &lt;TermFreqs, SkipData&gt;<sup>TermCount</sup>
+                
+</p>
+<p>TermFreqs    --&gt;
+                    &lt;TermFreq&gt;<sup>DocFreq</sup>
+                
+</p>
+<p>TermFreq        --&gt;
+                    DocDelta, Freq?
+                </p>
+<p>SkipData        --&gt;
+                    &lt;SkipDatum&gt;<sup>DocFreq/SkipInterval</sup>
+                
+</p>
+<p>SkipDatum    --&gt;
+                    DocSkip,FreqSkip,ProxSkip
+                </p>
+<p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip    --&gt;
+                    VInt
+                </p>
+<p>TermFreqs
+                    are ordered by term (the term is implicit, from the .tis file).
+                </p>
+<p>TermFreq
+                    entries are ordered by increasing document number.
+                </p>
+<p>DocDelta
+                    determines both the document number and the frequency.  In
+                    particular, DocDelta/2 is the difference between this document number
+                    and the previous document number (or zero when this is the first
+                    document in a TermFreqs).  When DocDelta is odd, the frequency is
+                    one.  When DocDelta is even, the frequency is read as another VInt.
+                </p>
+<p>For
+                    example, the TermFreqs for a term which occurs once in document seven
+                    and three times in document eleven would be the following sequence of
+                    VInts:
+                </p>
+<p>    15,
+                    8, 3
+                </p>
+<p>DocSkip records the document number before every
+                    SkipInterval<sup>th</sup> document in TermFreqs.
+                    Document numbers are represented as differences
+                    from the previous value in the sequence.  FreqSkip
+                    and ProxSkip record the position of every
+                    SkipInterval<sup>th</sup> entry in FreqFile and
+                    ProxFile, respectively.  File positions are
+                    relative: the first is relative to the start of
+                    TermFreqs and Positions, and each later one to the
+                    previous SkipDatum in the sequence.
+                </p>
+<p>For example, if DocFreq=35 and SkipInterval=16,
+                    then there are two SkipData entries, containing
+                    the 15<sup>th</sup> and 31<sup>st</sup> document
+                    numbers in TermFreqs.  The first FreqSkip names
+                    the number of bytes after the beginning of
+                    TermFreqs that the 16<sup>th</sup> SkipDatum
+                    starts, and the second the number of bytes after
+                    that that the 32<sup>nd</sup> starts.  The first
+                    ProxSkip names the number of bytes after the
+                    beginning of Positions that the 16<sup>th</sup>
+                    SkipDatum starts, and the second the number of
+                    bytes after that that the 32<sup>nd</sup> starts.
+                </p>
+<a name="N10627"></a><a name="Positions"></a>
+<h3 class="boxed">Positions</h3>
+<p>
+                    The .prx file contains the lists of positions that
+                    each term occurs at within documents.
+                </p>
+<p>ProxFile (.prx)    --&gt;
+                    &lt;TermPositions&gt;<sup>TermCount</sup>
+                
+</p>
+<p>TermPositions    --&gt;
+                    &lt;Positions&gt;<sup>DocFreq</sup>
+                
+</p>
+<p>Positions        --&gt;
+                    &lt;PositionDelta&gt;<sup>Freq</sup>
+                
+</p>
+<p>PositionDelta    --&gt;
+                    VInt
+                </p>
+<p>TermPositions
+                    are ordered by term (the term is implicit, from the .tis file).
+                </p>
+<p>Positions
+                    entries are ordered by increasing document number (the document
+                    number is implicit from the .frq file).
+                </p>
+<p>PositionDelta
+                    is the difference between the position of the current occurrence in
+                    the document and the previous occurrence (or zero, if this is the
+                    first occurrence in this document).
+                </p>
+<p>
+                    For example, the TermPositions for a
+                    term which occurs as the fourth term in one document, and as the
+                    fifth and ninth term in a subsequent document, would be the following
+                    sequence of VInts:
+                </p>
+<p>    4,
+                    5, 4
+                </p>
+<a name="N10655"></a><a name="Normalization Factors"></a>
+<h3 class="boxed">Normalization Factors</h3>
+<p>There's a norm file for each indexed field with a byte for
+                   each document.  The .f[0-9]* file contains,
+                    for each document, a byte that encodes a value that is multiplied
+                    into the score for hits on that field:
+                </p>
+<p>Norms
+                    (.f[0-9]*)    --&gt; &lt;Byte&gt;<sup>SegSize</sup>
+                
+</p>
+<p>Each
+                    byte encodes a floating point value.  Bits 0-2 contain the 3-bit
+                    mantissa, and bits 3-7 contain the 5-bit exponent.
+                </p>
+<p>These
+                    are converted to an IEEE single float value as follows:
+                </p>
+<ol>
+                    
+<li>
+<p>If
+                            the byte is zero, use a zero float.
+                        </p>
+                    
+</li>
+                    
+<li>
+<p>Otherwise,
+                            set the sign bit of the float to zero;
+                        </p>
+                    
+</li>
+                    
+<li>
+<p>add
+                            48 to the exponent and use this as the float's exponent;
+                        </p>
+                    
+</li>
+                    
+<li>
+<p>map
+                            the mantissa to the high-order 3 bits of the float's mantissa; and
+
+                        </p>
+                    
+</li>
+                    
+<li>
+<p>set
+                            the low-order 21 bits of the float's mantissa to zero.
+                        </p>
+                    
+</li>
+                
+</ol>
+<a name="N10687"></a><a name="Term Vectors"></a>
+<h3 class="boxed">Term Vectors</h3>
+<ol>
+                
+<li>
+                  
+<p>The Document Index or .tvx file.</p>
+                  
+<p>This contains, for each document, a pointer to the document data in the Document 
+                    (.tvd) file.
+                  </p>
+                  
+<p>DocumentIndex (.tvx) --&gt; TVXVersion&lt;DocumentPosition&gt;<sup>NumDocs</sup>
+</p>
+                  
+<p>TVXVersion --&gt; Int</p>
+                  
+<p>DocumentPosition   --&gt; UInt64</p>
+                  
+<p>This is used to find the position of the Document in the .tvd file.</p>
+                
+</li>
+                
+<li>
+                  
+<p>The Document or .tvd file.</p>
+                  
+<p>This contains, for each document, the number of fields, a list of the fields with
+                  term vector info and finally a list of pointers to the field information in the .tvf 
+                  (Term Vector Fields) file.</p>
+                  
+<p>
+                    Document (.tvd) --&gt; TVDVersion&lt;NumFields, FieldNums, FieldPositions&gt;<sup>NumDocs</sup>
+                  
+</p>
+                  
+<p>TVDVersion --&gt; Int</p>
+                  
+<p>NumFields --&gt; VInt</p>
+                  
+<p>FieldNums --&gt; &lt;FieldNumDelta&gt;<sup>NumFields</sup>
+</p>
+                  
+<p>FieldNumDelta --&gt; VInt</p>
+                  
+<p>FieldPositions --&gt; &lt;FieldPosition&gt;<sup>NumFields</sup>
+</p>
+                  
+<p>FieldPosition --&gt; VLong</p>
+                  
+<p>The .tvd file is used to map out the fields that have term vectors stored and
+                  where the field information is in the .tvf file.</p>
+                
+</li>
+                
+<li>
+                  
+<p>The Field or .tvf file.</p>
+                  
+<p>This file contains, for each field that has a term vector stored, a list of
+                  the terms and their frequencies.</p>
+                  
+<p>Field (.tvf) --&gt; TVFVersion&lt;NumTerms, NumDistinct, TermFreqs&gt;<sup>NumFields</sup>
+</p>
+                  
+<p>TVFVersion --&gt; Int</p>
+                  
+<p>NumTerms --&gt; VInt</p>
+                  
+<p>NumDistinct --&gt; VInt -- Future Use</p>
+                  
+<p>TermFreqs --&gt; &lt;TermText, TermFreq&gt;<sup>NumTerms</sup>
+</p>
+                  
+<p>TermText --&gt; &lt;PrefixLength, Suffix&gt;</p>
+                  
+<p>PrefixLength --&gt; VInt</p>
+                  
+<p>Suffix --&gt; String</p>
+                  
+<p>TermFreq --&gt; VInt</p>
+                  
+<p>Term
+                      text prefixes are shared.  The PrefixLength is the number of initial
+                      characters from the previous term which must be prepended to a
+                      term's suffix in order to form the term's text.  Thus, if the
+                      previous term's text was "bone" and the term is "boy",
+                      the PrefixLength is two and the suffix is "y".
+                  </p>
+                
+</li>
+              
+</ol>
+<a name="N106FB"></a><a name="Deleted Documents"></a>
+<h3 class="boxed">Deleted Documents</h3>
+<p>The .del file is
+                    optional, and only exists when a segment contains deletions:
+                </p>
+<p>Deletions
+                    (.del)    --&gt; ByteCount,BitCount,Bits
+                </p>
+<p>ByteCount,BitCount    --&gt;
+                    UInt32
+                </p>
+<p>Bits        --&gt;
+                    &lt;Byte&gt;<sup>ByteCount</sup>
+                
+</p>
+<p>ByteCount
+                    indicates the number of bytes in Bits.  It is typically
+                    (SegSize/8)+1.
+                </p>
+<p>
+                    BitCount
+                    indicates the number of bits that are currently set in Bits.
+                </p>
+<p>Bits
+                    contains one bit for each document indexed.  When the bit
+                    corresponding to a document number is set, that document is marked as
+                    deleted.  Bit ordering is from least to most significant.  Thus, if
+                    Bits contains two bytes, 0x00 and 0x02, then document 9 is marked as
+                    deleted.
+                </p>
+</div>
+
+        
+<a name="N1071B"></a><a name="Limitations"></a>
+<h2 class="boxed">Limitations</h2>
+<div class="section">
+<p>There
+                are a few places where these file formats limit the maximum number of
+                terms and documents to a 32-bit quantity, or to approximately 4
+                billion.  This is not a problem today, but, in the long term,
+                it probably will be.  These should therefore be replaced with either
+                UInt64 values, or better yet, with VInt values, which have no limit.
+            </p>
+</div>
+
+    
+</div>
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("<text>Last Published:</text> " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2006 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+</div>
+</body>
+</html>



Mime
View raw message