lucene-java-commits mailing list archives

From: o...@apache.org
Subject: svn commit: r413584 [1/3] - in /lucene/java/trunk/contrib/memory/src: java/org/apache/lucene/index/memory/ test/org/apache/lucene/index/memory/
Date: Mon, 12 Jun 2006 05:46:17 GMT
Author: otis
Date: Sun Jun 11 22:46:16 2006
New Revision: 413584

URL: http://svn.apache.org/viewvc?rev=413584&view=rev
Log:
- perl -pi -e 's/\t/  /g'
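(The one-liner rewrites each of the files listed below in place, replacing every tab character with two spaces; the hunks in this message therefore change only indentation, not code.)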

Modified:
    lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java
    lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
    lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
    lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java
    lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
    lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
    lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java

Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java Sun Jun 11 22:46:16 2006
@@ -39,345 +39,345 @@
  * @author whoschek.AT.lbl.DOT.gov
  */
 public class AnalyzerUtil {
-	
-	private AnalyzerUtil() {};
+  
+  private AnalyzerUtil() {};
 
-	/**
-	 * Returns a simple analyzer wrapper that logs all tokens produced by the
-	 * underlying child analyzer to the given log stream (typically System.err);
-	 * Otherwise behaves exactly like the child analyzer, delivering the very
-	 * same tokens; useful for debugging purposes on custom indexing and/or
-	 * querying.
-	 * 
-	 * @param child
-	 *            the underlying child analyzer
-	 * @param log
-	 *            the print stream to log to (typically System.err)
-	 * @param logName
-	 *            a name for this logger (typically "log" or similar)
-	 * @return a logging analyzer
-	 */
-	public static Analyzer getLoggingAnalyzer(final Analyzer child, 
-			final PrintStream log, final String logName) {
-		
-		if (child == null) 
-			throw new IllegalArgumentException("child analyzer must not be null");
-		if (log == null) 
-			throw new IllegalArgumentException("logStream must not be null");
+  /**
+   * Returns a simple analyzer wrapper that logs all tokens produced by the
+   * underlying child analyzer to the given log stream (typically System.err);
+   * Otherwise behaves exactly like the child analyzer, delivering the very
+   * same tokens; useful for debugging purposes on custom indexing and/or
+   * querying.
+   * 
+   * @param child
+   *            the underlying child analyzer
+   * @param log
+   *            the print stream to log to (typically System.err)
+   * @param logName
+   *            a name for this logger (typically "log" or similar)
+   * @return a logging analyzer
+   */
+  public static Analyzer getLoggingAnalyzer(final Analyzer child, 
+      final PrintStream log, final String logName) {
+    
+    if (child == null) 
+      throw new IllegalArgumentException("child analyzer must not be null");
+    if (log == null) 
+      throw new IllegalArgumentException("logStream must not be null");
 
-		return new Analyzer() {
-			public TokenStream tokenStream(final String fieldName, Reader reader) {
-				return new TokenFilter(child.tokenStream(fieldName, reader)) {
-					private int position = -1;
-					
-					public Token next() throws IOException {
-						Token token = input.next(); // from filter super class
-						log.println(toString(token));
-						return token;
-					}
-					
-					private String toString(Token token) {
-						if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";
-						
-						position += token.getPositionIncrement();
-						return "[" + logName + ":" + position + ":" + fieldName + ":"
-								+ token.termText() + ":" + token.startOffset()
-								+ "-" + token.endOffset() + ":" + token.type()
-								+ "]";
-					}					
-				};
-			}
-		};
-	}
-	
-	
-	/**
-	 * Returns an analyzer wrapper that returns at most the first
-	 * <code>maxTokens</code> tokens from the underlying child analyzer,
-	 * ignoring all remaining tokens.
-	 * 
-	 * @param child
-	 *            the underlying child analyzer
-	 * @param maxTokens
-	 *            the maximum number of tokens to return from the underlying
-	 *            analyzer (a value of Integer.MAX_VALUE indicates unlimited)
-	 * @return an analyzer wrapper
-	 */
-	public static Analyzer getMaxTokenAnalyzer(
-			final Analyzer child, final int maxTokens) {
-		
-		if (child == null) 
-			throw new IllegalArgumentException("child analyzer must not be null");
-		if (maxTokens < 0) 
-			throw new IllegalArgumentException("maxTokens must not be negative");
-		if (maxTokens == Integer.MAX_VALUE) 
-			return child; // no need to wrap
-	
-		return new Analyzer() {
-			public TokenStream tokenStream(String fieldName, Reader reader) {
-				return new TokenFilter(child.tokenStream(fieldName, reader)) {
-					private int todo = maxTokens;
-					
-					public Token next() throws IOException {
-						return --todo >= 0 ? input.next() : null;
-					}
-				};
-			}
-		};
-	}
-	
-	
-	/**
-	 * Returns an English stemming analyzer that stems tokens from the
-	 * underlying child analyzer according to the Porter stemming algorithm. The
-	 * child analyzer must deliver tokens in lower case for the stemmer to work
-	 * properly.
-	 * <p>
-	 * Background: Stemming reduces token terms to their linguistic root form
-	 * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to
-	 * "famili", as well as "complete" and "completion" to "complet". Note that
-	 * the root form is not necessarily a meaningful word in itself, and that
-	 * this is not a bug but rather a feature, if you lean back and think about
-	 * fuzzy word matching for a bit.
-	 * <p>
-	 * See the Lucene contrib packages for stemmers (and stop words) for German,
-	 * Russian and many more languages.
-	 * 
-	 * @param child
-	 *            the underlying child analyzer
-	 * @return an analyzer wrapper
-	 */
-	public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) {
-		
-		if (child == null) 
-			throw new IllegalArgumentException("child analyzer must not be null");
-	
-		return new Analyzer() {
-			public TokenStream tokenStream(String fieldName, Reader reader) {
-				return new PorterStemFilter(
-						child.tokenStream(fieldName, reader));
-// 				/* PorterStemFilter and SnowballFilter have the same behaviour, 
-// 				but PorterStemFilter is much faster. */
-//				return new org.apache.lucene.analysis.snowball.SnowballFilter(
-//						child.tokenStream(fieldName, reader), "English");
-			}
-		};
-	}
-	
-	
-	/**
-	 * Returns an analyzer wrapper that wraps the underlying child analyzer's
-	 * token stream into a {@link SynonymTokenFilter}.
-	 * 
-	 * @param child
-	 *            the underlying child analyzer
-	 * @param synonyms
-	 *            the map used to extract synonyms for terms
-	 * @param maxSynonyms
-	 *            the maximum number of synonym tokens to return per underlying
-	 *            token word (a value of Integer.MAX_VALUE indicates unlimited)
-	 * @return a new analyzer
-	 */
-	public static Analyzer getSynonymAnalyzer(final Analyzer child, 
-			final SynonymMap synonyms, final int maxSynonyms) {
-		
-		if (child == null) 
-			throw new IllegalArgumentException("child analyzer must not be null");
-		if (synonyms == null)
-			throw new IllegalArgumentException("synonyms must not be null");
-		if (maxSynonyms < 0) 
-			throw new IllegalArgumentException("maxSynonyms must not be negative");
-		if (maxSynonyms == 0)
-			return child; // no need to wrap
-	
-		return new Analyzer() {
-			public TokenStream tokenStream(String fieldName, Reader reader) {
-				return new SynonymTokenFilter(
-					child.tokenStream(fieldName, reader), synonyms, maxSynonyms);
-			}
-		};
-	}
+    return new Analyzer() {
+      public TokenStream tokenStream(final String fieldName, Reader reader) {
+        return new TokenFilter(child.tokenStream(fieldName, reader)) {
+          private int position = -1;
+          
+          public Token next() throws IOException {
+            Token token = input.next(); // from filter super class
+            log.println(toString(token));
+            return token;
+          }
+          
+          private String toString(Token token) {
+            if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";
+            
+            position += token.getPositionIncrement();
+            return "[" + logName + ":" + position + ":" + fieldName + ":"
+                + token.termText() + ":" + token.startOffset()
+                + "-" + token.endOffset() + ":" + token.type()
+                + "]";
+          }         
+        };
+      }
+    };
+  }
+  
+  
+  /**
+   * Returns an analyzer wrapper that returns at most the first
+   * <code>maxTokens</code> tokens from the underlying child analyzer,
+   * ignoring all remaining tokens.
+   * 
+   * @param child
+   *            the underlying child analyzer
+   * @param maxTokens
+   *            the maximum number of tokens to return from the underlying
+   *            analyzer (a value of Integer.MAX_VALUE indicates unlimited)
+   * @return an analyzer wrapper
+   */
+  public static Analyzer getMaxTokenAnalyzer(
+      final Analyzer child, final int maxTokens) {
+    
+    if (child == null) 
+      throw new IllegalArgumentException("child analyzer must not be null");
+    if (maxTokens < 0) 
+      throw new IllegalArgumentException("maxTokens must not be negative");
+    if (maxTokens == Integer.MAX_VALUE) 
+      return child; // no need to wrap
+  
+    return new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new TokenFilter(child.tokenStream(fieldName, reader)) {
+          private int todo = maxTokens;
+          
+          public Token next() throws IOException {
+            return --todo >= 0 ? input.next() : null;
+          }
+        };
+      }
+    };
+  }
+  
+  
+  /**
+   * Returns an English stemming analyzer that stems tokens from the
+   * underlying child analyzer according to the Porter stemming algorithm. The
+   * child analyzer must deliver tokens in lower case for the stemmer to work
+   * properly.
+   * <p>
+   * Background: Stemming reduces token terms to their linguistic root form
+   * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to
+   * "famili", as well as "complete" and "completion" to "complet". Note that
+   * the root form is not necessarily a meaningful word in itself, and that
+   * this is not a bug but rather a feature, if you lean back and think about
+   * fuzzy word matching for a bit.
+   * <p>
+   * See the Lucene contrib packages for stemmers (and stop words) for German,
+   * Russian and many more languages.
+   * 
+   * @param child
+   *            the underlying child analyzer
+   * @return an analyzer wrapper
+   */
+  public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) {
+    
+    if (child == null) 
+      throw new IllegalArgumentException("child analyzer must not be null");
+  
+    return new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new PorterStemFilter(
+            child.tokenStream(fieldName, reader));
+//        /* PorterStemFilter and SnowballFilter have the same behaviour, 
+//        but PorterStemFilter is much faster. */
+//        return new org.apache.lucene.analysis.snowball.SnowballFilter(
+//            child.tokenStream(fieldName, reader), "English");
+      }
+    };
+  }
+  
+  
+  /**
+   * Returns an analyzer wrapper that wraps the underlying child analyzer's
+   * token stream into a {@link SynonymTokenFilter}.
+   * 
+   * @param child
+   *            the underlying child analyzer
+   * @param synonyms
+   *            the map used to extract synonyms for terms
+   * @param maxSynonyms
+   *            the maximum number of synonym tokens to return per underlying
+   *            token word (a value of Integer.MAX_VALUE indicates unlimited)
+   * @return a new analyzer
+   */
+  public static Analyzer getSynonymAnalyzer(final Analyzer child, 
+      final SynonymMap synonyms, final int maxSynonyms) {
+    
+    if (child == null) 
+      throw new IllegalArgumentException("child analyzer must not be null");
+    if (synonyms == null)
+      throw new IllegalArgumentException("synonyms must not be null");
+    if (maxSynonyms < 0) 
+      throw new IllegalArgumentException("maxSynonyms must not be negative");
+    if (maxSynonyms == 0)
+      return child; // no need to wrap
+  
+    return new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new SynonymTokenFilter(
+          child.tokenStream(fieldName, reader), synonyms, maxSynonyms);
+      }
+    };
+  }
 
-	
-	/**
-	 * Returns (frequency:term) pairs for the top N distinct terms (aka words),
-	 * sorted descending by frequency (and ascending by term, if tied).
-	 * <p>
-	 * Example XQuery:
-	 * <pre>
-	 * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
-	 * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
-	 * 
-	 * for $pair in util:get-most-frequent-terms(
-	 *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
-	 * return &lt;word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
-	 * </pre>
-	 * 
-	 * @param analyzer
-	 *            the analyzer to use for splitting text into terms (aka words)
-	 * @param text
-	 *            the text to analyze
-	 * @param limit
-	 *            the maximum number of pairs to return; zero indicates 
-	 *            "as many as possible".
-	 * @return an array of (frequency:term) pairs in the form of (freq0:term0,
-	 *         freq1:term1, ..., freqN:termN). Each pair is a single string
-	 *         separated by a ':' delimiter.
-	 */
-	public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) {
-		if (analyzer == null) 
-			throw new IllegalArgumentException("analyzer must not be null");
-		if (text == null) 
-			throw new IllegalArgumentException("text must not be null");
-		if (limit <= 0) limit = Integer.MAX_VALUE;
-		
-		// compute frequencies of distinct terms
-		HashMap map = new HashMap();
-		TokenStream stream = analyzer.tokenStream("", new StringReader(text));
-		try {
-			Token token;
-			while ((token = stream.next()) != null) {
-				MutableInteger freq = (MutableInteger) map.get(token.termText());
-				if (freq == null) {
-					freq = new MutableInteger(1);
-					map.put(token.termText(), freq);
-				} else {
-					freq.setValue(freq.intValue() + 1);
-				}
-			}
-		} catch (IOException e) {
-			throw new RuntimeException(e);
-		} finally {
-			try {
-				stream.close();
-			} catch (IOException e2) {
-				throw new RuntimeException(e2);
-			}
-		}
-		
-		// sort by frequency, text
-		Map.Entry[] entries = new Map.Entry[map.size()];
-		map.entrySet().toArray(entries);
-		Arrays.sort(entries, new Comparator() {
-			public int compare(Object o1, Object o2) {
-				Map.Entry e1 = (Map.Entry) o1;
-				Map.Entry e2 = (Map.Entry) o2;
-				int f1 = ((MutableInteger) e1.getValue()).intValue();
-				int f2 = ((MutableInteger) e2.getValue()).intValue();
-				if (f2 - f1 != 0) return f2 - f1;
-				String s1 = (String) e1.getKey();
-				String s2 = (String) e2.getKey();
-				return s1.compareTo(s2);
-			}
-		});
-		
-		// return top N entries
-		int size = Math.min(limit, entries.length);
-		String[] pairs = new String[size];
-		for (int i=0; i < size; i++) {
-			pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
-		}
-		return pairs;
-	}
-	
-	private static final class MutableInteger {
-		private int value;
-		public MutableInteger(int value) { this.value = value; }
-		public int intValue() { return value; }
-		public void setValue(int value) { this.value = value; }
-		public String toString() { return String.valueOf(value); }
-	};
-	
-	
-	
-	// TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
-	/** (Line terminator followed by zero or more whitespace) two or more times */
-	private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");
-	
-	/**
-	 * Returns at most the first N paragraphs of the given text. Delimiting
-	 * characters are excluded from the results. Each returned paragraph is
-	 * whitespace-trimmed via String.trim(), potentially an empty string.
-	 * 
-	 * @param text
-	 *            the text to tokenize into paragraphs
-	 * @param limit
-	 *            the maximum number of paragraphs to return; zero indicates "as
-	 *            many as possible".
-	 * @return the first N paragraphs
-	 */
-	public static String[] getParagraphs(String text, int limit) {
-		return tokenize(PARAGRAPHS, text, limit);
-	}
-		
-	private static String[] tokenize(Pattern pattern, String text, int limit) {
-		String[] tokens = pattern.split(text, limit);
-		for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim();
-		return tokens;
-	}
-	
-	
-	// TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
-	/** Divides text into sentences; Includes inverted spanish exclamation and question mark */
-	private static final Pattern SENTENCES  = Pattern.compile("[!\\.\\?\\xA1\\xBF]+");
+  
+  /**
+   * Returns (frequency:term) pairs for the top N distinct terms (aka words),
+   * sorted descending by frequency (and ascending by term, if tied).
+   * <p>
+   * Example XQuery:
+   * <pre>
+   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
+   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
+   * 
+   * for $pair in util:get-most-frequent-terms(
+   *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
+   * return &lt;word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
+   * </pre>
+   * 
+   * @param analyzer
+   *            the analyzer to use for splitting text into terms (aka words)
+   * @param text
+   *            the text to analyze
+   * @param limit
+   *            the maximum number of pairs to return; zero indicates 
+   *            "as many as possible".
+   * @return an array of (frequency:term) pairs in the form of (freq0:term0,
+   *         freq1:term1, ..., freqN:termN). Each pair is a single string
+   *         separated by a ':' delimiter.
+   */
+  public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) {
+    if (analyzer == null) 
+      throw new IllegalArgumentException("analyzer must not be null");
+    if (text == null) 
+      throw new IllegalArgumentException("text must not be null");
+    if (limit <= 0) limit = Integer.MAX_VALUE;
+    
+    // compute frequencies of distinct terms
+    HashMap map = new HashMap();
+    TokenStream stream = analyzer.tokenStream("", new StringReader(text));
+    try {
+      Token token;
+      while ((token = stream.next()) != null) {
+        MutableInteger freq = (MutableInteger) map.get(token.termText());
+        if (freq == null) {
+          freq = new MutableInteger(1);
+          map.put(token.termText(), freq);
+        } else {
+          freq.setValue(freq.intValue() + 1);
+        }
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    } finally {
+      try {
+        stream.close();
+      } catch (IOException e2) {
+        throw new RuntimeException(e2);
+      }
+    }
+    
+    // sort by frequency, text
+    Map.Entry[] entries = new Map.Entry[map.size()];
+    map.entrySet().toArray(entries);
+    Arrays.sort(entries, new Comparator() {
+      public int compare(Object o1, Object o2) {
+        Map.Entry e1 = (Map.Entry) o1;
+        Map.Entry e2 = (Map.Entry) o2;
+        int f1 = ((MutableInteger) e1.getValue()).intValue();
+        int f2 = ((MutableInteger) e2.getValue()).intValue();
+        if (f2 - f1 != 0) return f2 - f1;
+        String s1 = (String) e1.getKey();
+        String s2 = (String) e2.getKey();
+        return s1.compareTo(s2);
+      }
+    });
+    
+    // return top N entries
+    int size = Math.min(limit, entries.length);
+    String[] pairs = new String[size];
+    for (int i=0; i < size; i++) {
+      pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
+    }
+    return pairs;
+  }
+  
+  private static final class MutableInteger {
+    private int value;
+    public MutableInteger(int value) { this.value = value; }
+    public int intValue() { return value; }
+    public void setValue(int value) { this.value = value; }
+    public String toString() { return String.valueOf(value); }
+  };
+  
+  
+  
+  // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
+  /** (Line terminator followed by zero or more whitespace) two or more times */
+  private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");
+  
+  /**
+   * Returns at most the first N paragraphs of the given text. Delimiting
+   * characters are excluded from the results. Each returned paragraph is
+   * whitespace-trimmed via String.trim(), potentially an empty string.
+   * 
+   * @param text
+   *            the text to tokenize into paragraphs
+   * @param limit
+   *            the maximum number of paragraphs to return; zero indicates "as
+   *            many as possible".
+   * @return the first N paragraphs
+   */
+  public static String[] getParagraphs(String text, int limit) {
+    return tokenize(PARAGRAPHS, text, limit);
+  }
+    
+  private static String[] tokenize(Pattern pattern, String text, int limit) {
+    String[] tokens = pattern.split(text, limit);
+    for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim();
+    return tokens;
+  }
+  
+  
+  // TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
+  /** Divides text into sentences; Includes inverted spanish exclamation and question mark */
+  private static final Pattern SENTENCES  = Pattern.compile("[!\\.\\?\\xA1\\xBF]+");
 
-	/**
-	 * Returns at most the first N sentences of the given text. Delimiting
-	 * characters are excluded from the results. Each returned sentence is
-	 * whitespace-trimmed via String.trim(), potentially an empty string.
-	 * 
-	 * @param text
-	 *            the text to tokenize into sentences
-	 * @param limit
-	 *            the maximum number of sentences to return; zero indicates "as
-	 *            many as possible".
-	 * @return the first N sentences
-	 */
-	public static String[] getSentences(String text, int limit) {
-//		return tokenize(SENTENCES, text, limit); // equivalent but slower
-		int len = text.length();
-		if (len == 0) return new String[] { text };
-		if (limit <= 0) limit = Integer.MAX_VALUE;
-		
-		// average sentence length heuristic
-		String[] tokens = new String[Math.min(limit, 1 + len/40)];
-		int size = 0;
-		int i = 0;
-		
-		while (i < len && size < limit) {
-			
-			// scan to end of current sentence
-			int start = i;
-			while (i < len && !isSentenceSeparator(text.charAt(i))) i++;
-			
-			if (size == tokens.length) { // grow array
-				String[] tmp = new String[tokens.length << 1];
-				System.arraycopy(tokens, 0, tmp, 0, size);
-				tokens = tmp;
-			}
-			// add sentence (potentially empty)
-			tokens[size++] = text.substring(start, i).trim();
+  /**
+   * Returns at most the first N sentences of the given text. Delimiting
+   * characters are excluded from the results. Each returned sentence is
+   * whitespace-trimmed via String.trim(), potentially an empty string.
+   * 
+   * @param text
+   *            the text to tokenize into sentences
+   * @param limit
+   *            the maximum number of sentences to return; zero indicates "as
+   *            many as possible".
+   * @return the first N sentences
+   */
+  public static String[] getSentences(String text, int limit) {
+//    return tokenize(SENTENCES, text, limit); // equivalent but slower
+    int len = text.length();
+    if (len == 0) return new String[] { text };
+    if (limit <= 0) limit = Integer.MAX_VALUE;
+    
+    // average sentence length heuristic
+    String[] tokens = new String[Math.min(limit, 1 + len/40)];
+    int size = 0;
+    int i = 0;
+    
+    while (i < len && size < limit) {
+      
+      // scan to end of current sentence
+      int start = i;
+      while (i < len && !isSentenceSeparator(text.charAt(i))) i++;
+      
+      if (size == tokens.length) { // grow array
+        String[] tmp = new String[tokens.length << 1];
+        System.arraycopy(tokens, 0, tmp, 0, size);
+        tokens = tmp;
+      }
+      // add sentence (potentially empty)
+      tokens[size++] = text.substring(start, i).trim();
 
-			// scan to beginning of next sentence
-			while (i < len && isSentenceSeparator(text.charAt(i))) i++;
-		}
-		
-		if (size == tokens.length) return tokens;
-		String[] results = new String[size];
-		System.arraycopy(tokens, 0, results, 0, size);
-		return results;
-	}
+      // scan to beginning of next sentence
+      while (i < len && isSentenceSeparator(text.charAt(i))) i++;
+    }
+    
+    if (size == tokens.length) return tokens;
+    String[] results = new String[size];
+    System.arraycopy(tokens, 0, results, 0, size);
+    return results;
+  }
 
-	private static boolean isSentenceSeparator(char c) {
-		// regex [!\\.\\?\\xA1\\xBF]
-		switch (c) {
-			case '!': return true;
-			case '.': return true;
-			case '?': return true;
-			case 0xA1: return true; // spanish inverted exclamation mark
-			case 0xBF: return true; // spanish inverted question mark
-			default: return false;
-		}		
-	}
-	
+  private static boolean isSentenceSeparator(char c) {
+    // regex [!\\.\\?\\xA1\\xBF]
+    switch (c) {
+      case '!': return true;
+      case '.': return true;
+      case '?': return true;
+      case 0xA1: return true; // spanish inverted exclamation mark
+      case 0xBF: return true; // spanish inverted question mark
+      default: return false;
+    }   
+  }
+  
 }

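For context, the following sketch shows how the AnalyzerUtil wrappers and the MemoryIndex touched by this whitespace-only change are typically composed. It is illustrative and not part of the commit: the AnalyzerUtil and MemoryIndex method signatures are taken from the diffs in this message, while SimpleAnalyzer, QueryParser, the sample text, the field name "content" and the class name AnalyzerUtilExample are assumptions made for the example (Lucene 1.9/2.0-era APIs).

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.memory.AnalyzerUtil;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queryParser.QueryParser;

public class AnalyzerUtilExample {
  public static void main(String[] args) throws Exception {
    String text = "She is fishing for fishes; her family likes fish.";

    // SimpleAnalyzer lowercases its tokens, which the Porter stemmer requires.
    Analyzer child = new SimpleAnalyzer();

    // Stem tokens, log each one to System.err, and keep at most the first 1000.
    Analyzer analyzer = AnalyzerUtil.getMaxTokenAnalyzer(
        AnalyzerUtil.getLoggingAnalyzer(
            AnalyzerUtil.getPorterStemmerAnalyzer(child), System.err, "log"),
        1000);

    // (frequency:term) pairs for the top 5 distinct terms, e.g. "3:fish".
    String[] pairs = AnalyzerUtil.getMostFrequentTerms(analyzer, text, 5);
    for (int i = 0; i < pairs.length; i++) System.out.println(pairs[i]);

    // Split into sentences, index one of them, and score an ad-hoc query.
    String[] sentences = AnalyzerUtil.getSentences(text, 0);
    MemoryIndex index = new MemoryIndex();
    index.addField("content", sentences[0], analyzer);
    float score = index.search(new QueryParser("content", analyzer).parse("fish"));
    System.out.println("score=" + score);
  }
}

Note that the nesting order matters: the logging wrapper sees whatever its child analyzer emits, so placing it outside the stemmer logs the stemmed terms rather than the raw tokens.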
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Sun Jun 11 22:46:16 2006
@@ -160,898 +160,898 @@
  */
 public class MemoryIndex {
 
-	/** info for each field: Map<String fieldName, Info field> */
-	private final HashMap fields = new HashMap();
-	
-	/** fields sorted ascending by fieldName; lazily computed on demand */
-	private transient Map.Entry[] sortedFields; 
-	
-	/** pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2] */
-	private final int stride;
-	
-	private static final long serialVersionUID = 2782195016849084649L;
-
-	private static final boolean DEBUG = false;
-	
-	/**
-	 * Sorts term entries into ascending order; also works for
-	 * Arrays.binarySearch() and Arrays.sort()
-	 */
-	private static final Comparator termComparator = new Comparator() {
-		public int compare(Object o1, Object o2) {
-			if (o1 instanceof Map.Entry) o1 = ((Map.Entry) o1).getKey();
-			if (o2 instanceof Map.Entry) o2 = ((Map.Entry) o2).getKey();
-			if (o1 == o2) return 0;
-			return ((String) o1).compareTo((String) o2);
-		}
-	};
-
-	/**
-	 * Constructs an empty instance.
-	 */
-	public MemoryIndex() {
-		this(false);
-	}
-	
-	/**
-	 * Constructs an empty instance that can optionally store the start and end
-	 * character offset of each token term in the text. This can be useful for
-	 * highlighting of hit locations with the Lucene highlighter package.
-	 * Private until the highlighter package matures, so that this can actually
-	 * be meaningfully integrated.
-	 * 
-	 * @param storeOffsets
-	 *            whether or not to store the start and end character offset of
-	 *            each token term in the text
-	 */
-	private MemoryIndex(boolean storeOffsets) {
-		this.stride = storeOffsets ? 3 : 1;
-	}
-	
-	/**
-	 * Convenience method; Tokenizes the given field text and adds the resulting
-	 * terms to the index; Equivalent to adding a tokenized, indexed,
-	 * termVectorStored, unstored, non-keyword Lucene
-	 * {@link org.apache.lucene.document.Field}.
-	 * 
-	 * @param fieldName
-	 *            a name to be associated with the text
-	 * @param text
-	 *            the text to tokenize and index.
-	 * @param analyzer
-	 *            the analyzer to use for tokenization
-	 */
-	public void addField(String fieldName, String text, Analyzer analyzer) {
-		if (fieldName == null)
-			throw new IllegalArgumentException("fieldName must not be null");
-		if (text == null)
-			throw new IllegalArgumentException("text must not be null");
-		if (analyzer == null)
-			throw new IllegalArgumentException("analyzer must not be null");
-		
-		TokenStream stream;
-		if (analyzer instanceof PatternAnalyzer) {
-			stream = ((PatternAnalyzer) analyzer).tokenStream(fieldName, text);
-		} else {
-			stream = analyzer.tokenStream(fieldName, 
-					new PatternAnalyzer.FastStringReader(text));
-		}
-		addField(fieldName, stream);
-	}
-	
-	/**
-	 * Convenience method; Creates and returns a token stream that generates a
-	 * token for each keyword in the given collection, "as is", without any
-	 * transforming text analysis. The resulting token stream can be fed into
-	 * {@link #addField(String, TokenStream)}, perhaps wrapped into another
-	 * {@link org.apache.lucene.analysis.TokenFilter}, as desired.
-	 * 
-	 * @param keywords
-	 *            the keywords to generate tokens for
-	 * @return the corresponding token stream
-	 */
-	public TokenStream keywordTokenStream(final Collection keywords) {
-		// TODO: deprecate & move this method into AnalyzerUtil?
-		if (keywords == null)
-			throw new IllegalArgumentException("keywords must not be null");
-		
-		return new TokenStream() {
-			private Iterator iter = keywords.iterator();
-			private int start = 0;
-			public Token next() {
-				if (!iter.hasNext()) return null;
-				
-				Object obj = iter.next();
-				if (obj == null) 
-					throw new IllegalArgumentException("keyword must not be null");
-				
-				String term = obj.toString();
-				Token token = new Token(term, start, start + term.length());
-				start += term.length() + 1; // separate words by 1 (blank) character
-				return token;
-			}
-		};
-	}
-	
-	/**
-	 * Iterates over the given token stream and adds the resulting terms to the index;
-	 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
-	 * Lucene {@link org.apache.lucene.document.Field}.
-	 * Finally closes the token stream. Note that untokenized keywords can be added with this method via 
-	 * {@link #keywordTokenStream(Collection)}, the Lucene contrib <code>KeywordTokenizer</code> or similar utilities.
-	 * 
-	 * @param fieldName
-	 *            a name to be associated with the text
-	 * @param stream
-	 *            the token stream to retrieve tokens from.
-	 */
-	public void addField(String fieldName, TokenStream stream) {
-		/*
-		 * Note that this method signature avoids having a user call new
-		 * o.a.l.d.Field(...) which would be much too expensive due to the
-		 * String.intern() usage of that class.
-		 * 
-		 * More often than not, String.intern() leads to serious performance
-		 * degradations rather than improvements! If you're curious why, check
-		 * out the JDK's native code, see how it oscillates multiple times back
-		 * and forth between Java code and native code on each intern() call,
-		 * only to end up using a plain vanilla java.util.HashMap on the Java
-		 * heap for it's interned strings! String.equals() has a small cost
-		 * compared to String.intern(), trust me. Application level interning
-		 * (e.g. a HashMap per Directory/Index) typically leads to better
-		 * solutions than frequent hidden low-level calls to String.intern().
-		 * 
-		 * Perhaps with some luck, Lucene's Field.java (and Term.java) and
-		 * cousins could be fixed to not use String.intern(). Sigh :-(
-		 */
-		try {
-			if (fieldName == null)
-				throw new IllegalArgumentException("fieldName must not be null");
-			if (stream == null)
-				throw new IllegalArgumentException("token stream must not be null");
-			if (fields.get(fieldName) != null)
-				throw new IllegalArgumentException("field must not be added more than once");
-			
-			HashMap terms = new HashMap();
-			int numTokens = 0;
-			int pos = -1;
-			Token token;
-			
-			while ((token = stream.next()) != null) {
-				String term = token.termText();
-				if (term.length() == 0) continue; // nothing to do
-//				if (DEBUG) System.err.println("token='" + term + "'");
-				numTokens++;
-				pos += token.getPositionIncrement();
-				
-				ArrayIntList positions = (ArrayIntList) terms.get(term);
-				if (positions == null) { // term not seen before
-					positions = new ArrayIntList(stride);
-					terms.put(term, positions);
-				}
-				if (stride == 1) {
-					positions.add(pos);
-				} else {
-					positions.add(pos, token.startOffset(), token.endOffset());
-				}
-			}
-			
-			// ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
-			if (numTokens > 0) {
-				fields.put(fieldName, new Info(terms, numTokens));
-				sortedFields = null;    // invalidate sorted view, if any
-			}
-		} catch (IOException e) { // can never happen
-			throw new RuntimeException(e);
-		} finally {
-			try {
-				if (stream != null) stream.close();
-			} catch (IOException e2) {
-				throw new RuntimeException(e2);
-			}
-		}
-	}
-	
-	/**
-	 * Creates and returns a searcher that can be used to execute arbitrary
-	 * Lucene queries and to collect the resulting query results as hits.
-	 * 
-	 * @return a searcher
-	 */
-	public IndexSearcher createSearcher() {
-		MemoryIndexReader reader = new MemoryIndexReader();
-		IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !!
-		reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity()
-		return searcher;
-	}
-	
-	/**
-	 * Convenience method that efficiently returns the relevance score by
-	 * matching this index against the given Lucene query expression.
-	 * 
-	 * @param query
-	 *            an arbitrary Lucene query to run against this index
-	 * @return the relevance score of the matchmaking; A number in the range
-	 *         [0.0 .. 1.0], with 0.0 indicating no match. The higher the number
-	 *         the better the match.
-	 * @see org.apache.lucene.queryParser.QueryParser#parse(String)
-	 */
-	public float search(Query query) {
-		if (query == null) 
-			throw new IllegalArgumentException("query must not be null");
-		
-		Searcher searcher = createSearcher();
-		try {
-			final float[] scores = new float[1]; // inits to 0.0f (no match)
-			searcher.search(query, new HitCollector() {
-				public void collect(int doc, float score) {
-					scores[0] = score;
-				}
-			});
-			float score = scores[0];
-			return score;
-		} catch (IOException e) { // can never happen (RAMDirectory)
-			throw new RuntimeException(e);
-		} finally {
-			// searcher.close();
-			/*
-			 * Note that it is harmless and important for good performance to
-			 * NOT close the index reader!!! This avoids all sorts of
-			 * unnecessary baggage and locking in the Lucene IndexReader
-			 * superclass, all of which is completely unnecessary for this main
-			 * memory index data structure without thread-safety claims.
-			 * 
-			 * Wishing IndexReader would be an interface...
-			 * 
-			 * Actually with the new tight createSearcher() API auto-closing is now
-			 * made impossible, hence searcher.close() would be harmless...
-			 */
-		}		
-	}
-	
-	/**
-	 * Returns a reasonable approximation of the main memory [bytes] consumed by
-	 * this instance. Useful for smart memory sensititve caches/pools. Assumes
-	 * fieldNames are interned, whereas tokenized terms are memory-overlaid. For
-	 * simplicity, assumes no VM word boundary alignment of instance vars.
-	 * 
-	 * @return the main memory consumption
-	 */
-	public int getMemorySize() {
-		// for example usage in a smart cache see nux.xom.pool.Pool
-		int HEADER = 12; // object header of any java object
-		int PTR = 4; // pointer on 32 bit VMs
-		int ARR = HEADER + 4;
-		int STR = HEADER + 3*4 + PTR + ARR; // string
-		int INTARRLIST = HEADER + 4 + PTR + ARR;
-		int HASHMAP = HEADER + 4*PTR + 4*4 + ARR;
-		
-		int size = 0;
-		size += HEADER + 2*PTR + 4; // memory index
-		if (sortedFields != null) size += ARR + PTR * sortedFields.length;
-		
-		size += HASHMAP + fields.size() * (PTR + HEADER + 3*PTR + 4); // Map.entries
-		Iterator iter = fields.entrySet().iterator();
-		while (iter.hasNext()) { // for each Field Info
-			Map.Entry entry = (Map.Entry) iter.next();			
-			Info info = (Info) entry.getValue();
-			size += HEADER + 4 + PTR + PTR + PTR; // Info instance vars
-			if (info.sortedTerms != null) size += ARR + PTR * info.sortedTerms.length;
-			
-			int len = info.terms.size();
-			size += HASHMAP + len * (PTR + HEADER + 3*PTR + 4); // Map.entries
-			Iterator iter2 = info.terms.entrySet().iterator();
-			while (--len >= 0) { // for each term
-				Map.Entry e = (Map.Entry) iter2.next();
-				size += STR - ARR; // assumes substring() memory overlay
-//				size += STR + 2 * ((String) e.getKey()).length();
-				ArrayIntList positions = (ArrayIntList) e.getValue();
-				size += INTARRLIST + 4*positions.size();
-			}
-		}
-		return size;
-	}	
-
-	private int numPositions(ArrayIntList positions) {
-		return positions.size() / stride;
-	}
-	
-	/** sorts into ascending order (on demand), reusing memory along the way */
-	private void sortFields() {
-		if (sortedFields == null) sortedFields = sort(fields);
-	}
-	
-	/** returns a view of the given map's entries, sorted ascending by key */
-	private static Map.Entry[] sort(HashMap map) {
-		int size = map.size();
-		Map.Entry[] entries = new Map.Entry[size];
-		
-		Iterator iter = map.entrySet().iterator();
-		for (int i=0; i < size; i++) {
-			entries[i] = (Map.Entry) iter.next();
-		}
-		
-		if (size > 1) Arrays.sort(entries, termComparator);
-		return entries;
-	}
-	
-	/**
-	 * Returns a String representation of the index data for debugging purposes.
-	 * 
-	 * @return the string representation
-	 */
-	public String toString() {
-		StringBuffer result = new StringBuffer(256);		
-		sortFields();		
-		int sumChars = 0;
-		int sumPositions = 0;
-		int sumTerms = 0;
-		
-		for (int i=0; i < sortedFields.length; i++) {
-			Map.Entry entry = sortedFields[i];
-			String fieldName = (String) entry.getKey();
-			Info info = (Info) entry.getValue();
-			info.sortTerms();
-			result.append(fieldName + ":\n");
-			
-			int numChars = 0;
-			int numPositions = 0;
-			for (int j=0; j < info.sortedTerms.length; j++) {
-				Map.Entry e = info.sortedTerms[j];
-				String term = (String) e.getKey();
-				ArrayIntList positions = (ArrayIntList) e.getValue();
-				result.append("\t'" + term + "':" + numPositions(positions) + ":");
-				result.append(positions.toString(stride)); // ignore offsets
-				result.append("\n");
-				numPositions += numPositions(positions);
-				numChars += term.length();
-			}
-			
-			result.append("\tterms=" + info.sortedTerms.length);
-			result.append(", positions=" + numPositions);
-			result.append(", Kchars=" + (numChars/1000.0f));
-			result.append("\n");
-			sumPositions += numPositions;
-			sumChars += numChars;
-			sumTerms += info.sortedTerms.length;
-		}
-		
-		result.append("\nfields=" + sortedFields.length);
-		result.append(", terms=" + sumTerms);
-		result.append(", positions=" + sumPositions);
-		result.append(", Kchars=" + (sumChars/1000.0f));
-		return result.toString();
-	}
-	
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Nested classes:
-	///////////////////////////////////////////////////////////////////////////////
-	/**
-	 * Index data structure for a field; Contains the tokenized term texts and
-	 * their positions.
-	 */
-	private static final class Info implements Serializable {
-		
-		/**
-		 * Term strings and their positions for this field: Map <String
-		 * termText, ArrayIntList positions>
-		 */
-		private final HashMap terms; 
-		
-		/** Terms sorted ascending by term text; computed on demand */
-		private transient Map.Entry[] sortedTerms;
-		
-		/** Number of added tokens for this field */
-		private final int numTokens;
-		
-		/** Term for this field's fieldName, lazily computed on demand */
-		public transient Term template;
-
-		private static final long serialVersionUID = 2882195016849084649L;	
-
-		public Info(HashMap terms, int numTokens) {
-			this.terms = terms;
-			this.numTokens = numTokens;
-		}
-		
-		/**
-		 * Sorts hashed terms into ascending order, reusing memory along the
-		 * way. Note that sorting is lazily delayed until required (often it's
-		 * not required at all). If a sorted view is required then hashing +
-		 * sort + binary search is still faster and smaller than TreeMap usage
-		 * (which would be an alternative and somewhat more elegant approach,
-		 * apart from more sophisticated Tries / prefix trees).
-		 */
-		public void sortTerms() {
-			if (sortedTerms == null) sortedTerms = sort(terms);
-		}
-				
-		/** note that the frequency can be calculated as numPosition(getPositions(x)) */
-		public ArrayIntList getPositions(String term) {
-			return (ArrayIntList) terms.get(term);
-		}
-
-		/** note that the frequency can be calculated as numPosition(getPositions(x)) */
-		public ArrayIntList getPositions(int pos) {
-			return (ArrayIntList) sortedTerms[pos].getValue();
-		}
-		
-	}
-	
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Nested classes:
-	///////////////////////////////////////////////////////////////////////////////
-	/**
-	 * Efficient resizable auto-expanding list holding <code>int</code> elements;
-	 * implemented with arrays.
-	 */
-	private static final class ArrayIntList implements Serializable {
-
-		private int[] elements;
-		private int size = 0;
-		
-		private static final long serialVersionUID = 2282195016849084649L;	
-			
-		public ArrayIntList() {
-			this(10);
-		}
-
-		public ArrayIntList(int initialCapacity) {
-			elements = new int[initialCapacity];
-		}
-
-		public void add(int elem) {
-			if (size == elements.length) ensureCapacity(size + 1);
-			elements[size++] = elem;
-		}
-
-		public void add(int pos, int start, int end) {
-			if (size + 3 > elements.length) ensureCapacity(size + 3);
-			elements[size] = pos;
-			elements[size+1] = start;
-			elements[size+2] = end;
-			size += 3;
-		}
-
-		public int get(int index) {
-			if (index >= size) throwIndex(index);
-			return elements[index];
-		}
-		
-		public int size() {
-			return size;
-		}
-		
-		public int[] toArray(int stride) {
-			int[] arr = new int[size() / stride];
-			if (stride == 1)
-				System.arraycopy(elements, 0, arr, 0, size); // fast path
-			else 
-				for (int i=0, j=0; j < size; i++, j += stride) arr[i] = elements[j];
-			return arr;
-		}
-		
-		private void ensureCapacity(int minCapacity) {
-			int newCapacity = Math.max(minCapacity, (elements.length * 3) / 2 + 1);
-			int[] newElements = new int[newCapacity];
-			System.arraycopy(elements, 0, newElements, 0, size);
-			elements = newElements;
-		}
-
-		private void throwIndex(int index) {
-			throw new IndexOutOfBoundsException("index: " + index
-						+ ", size: " + size);
-		}
-		
-		/** returns the first few positions (without offsets); debug only */
-		public String toString(int stride) {
-			int s = size() / stride;
-			int len = Math.min(10, s); // avoid printing huge lists
-			StringBuffer buf = new StringBuffer(4*len);
-			buf.append("[");
-			for (int i = 0; i < len; i++) {
-				buf.append(get(i*stride));
-				if (i < len-1) buf.append(", ");
-			}
-			if (len != s) buf.append(", ..."); // and some more...
-			buf.append("]");
-			return buf.toString();
-		}		
-	}
-	
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Nested classes:
-	///////////////////////////////////////////////////////////////////////////////
-	private static final Term MATCH_ALL_TERM = new Term("", "");
-		
-	/**
-	 * Search support for Lucene framework integration; implements all methods
-	 * required by the Lucene IndexReader contracts.
-	 */
-	private final class MemoryIndexReader extends IndexReader {
-		
-		private Searcher searcher; // needed to find searcher.getSimilarity() 
-		
-		private MemoryIndexReader() {
-			super(null); // avoid as much superclass baggage as possible
-		}
-		
-		// lucene >= 1.9 or lucene-1.4.3 with patch removing "final" in superclass
-		protected void finalize() {}
-		
-		private Info getInfo(String fieldName) {
-			return (Info) fields.get(fieldName);
-		}
-		
-		private Info getInfo(int pos) {
-			return (Info) sortedFields[pos].getValue();
-		}
-		
-		public int docFreq(Term term) {
-			Info info = getInfo(term.field());
-			int freq = 0;
-			if (info != null) freq = info.getPositions(term.text()) != null ? 1 : 0;
-			if (DEBUG) System.err.println("MemoryIndexReader.docFreq: " + term + ", freq:" + freq);
-			return freq;
-		}
-	
-		public TermEnum terms() {
-			if (DEBUG) System.err.println("MemoryIndexReader.terms()");
-			return terms(MATCH_ALL_TERM);
-		}
-		
-		public TermEnum terms(Term term) {
-			if (DEBUG) System.err.println("MemoryIndexReader.terms: " + term);
-	
-			int i; // index into info.sortedTerms
-			int j; // index into sortedFields
-			
-			sortFields();
-			if (sortedFields.length == 1 && sortedFields[0].getKey() == term.field()) {
-				j = 0; // fast path
-			} else {
-				j = Arrays.binarySearch(sortedFields, term.field(), termComparator);
-			}
-			
-			if (j < 0) { // not found; choose successor
-				j = -j -1; 
-				i = 0;
-				if (j < sortedFields.length) getInfo(j).sortTerms();
-			}
-			else { // found
-				Info info = getInfo(j);
-				info.sortTerms();
-				i = Arrays.binarySearch(info.sortedTerms, term.text(), termComparator);
-				if (i < 0) { // not found; choose successor
-					i = -i -1;
-					if (i >= info.sortedTerms.length) { // move to next successor
-						j++;
-						i = 0;
-						if (j < sortedFields.length) getInfo(j).sortTerms();
-					}
-				}
-			}
-			final int ix = i;
-			final int jx = j;
-	
-			return new TermEnum() {
-	
-				private int i = ix; // index into info.sortedTerms
-				private int j = jx; // index into sortedFields
-					
-				public boolean next() {
-					if (DEBUG) System.err.println("TermEnum.next");
-					if (j >= sortedFields.length) return false;
-					Info info = getInfo(j);
-					if (++i < info.sortedTerms.length) return true;
-	
-					// move to successor
-					j++;
-					i = 0;
-					if (j >= sortedFields.length) return false;
-					getInfo(j).sortTerms();
-					return true;
-				}
-	
-				public Term term() {
-					if (DEBUG) System.err.println("TermEnum.term: " + i);
-					if (j >= sortedFields.length) return null;
-					Info info = getInfo(j);
-					if (i >= info.sortedTerms.length) return null;
-//					if (DEBUG) System.err.println("TermEnum.term: " + i + ", " + info.sortedTerms[i].getKey());
-					return createTerm(info, j, (String) info.sortedTerms[i].getKey());
-				}
-				
-				public int docFreq() {
-					if (DEBUG) System.err.println("TermEnum.docFreq");
-					if (j >= sortedFields.length) return 0;
-					Info info = getInfo(j);
-					if (i >= info.sortedTerms.length) return 0;
-					return numPositions(info.getPositions(i));
-				}
-	
-				public void close() {
-					if (DEBUG) System.err.println("TermEnum.close");
-				}
-				
-				/** Returns a new Term object, minimizing String.intern() overheads. */
-				private Term createTerm(Info info, int pos, String text) { 
-					// Assertion: sortFields has already been called before
-					Term template = info.template;
-					if (template == null) { // not yet cached?
-						String fieldName = (String) sortedFields[pos].getKey();
-						template = new Term(fieldName, "");
-						info.template = template;
-					}
-					
-					return template.createTerm(text);
-				}
-				
-			};
-		}
-	
-		public TermPositions termPositions() {
-			if (DEBUG) System.err.println("MemoryIndexReader.termPositions");
-			
-			return new TermPositions() {
-	
-				private boolean hasNext;
-				private int cursor = 0;
-				private ArrayIntList current;
-				
-				public void seek(Term term) {
-					if (DEBUG) System.err.println(".seek: " + term);
-					Info info = getInfo(term.field());
-					current = info == null ? null : info.getPositions(term.text());
-					hasNext = (current != null);
-					cursor = 0;
-				}
-	
-				public void seek(TermEnum termEnum) {
-					if (DEBUG) System.err.println(".seekEnum");
-					seek(termEnum.term());
-				}
-	
-				public int doc() {
-					if (DEBUG) System.err.println(".doc");
-					return 0;
-				}
-	
-				public int freq() {
-					int freq = current != null ? numPositions(current) : 0;
-					if (DEBUG) System.err.println(".freq: " + freq);
-					return freq;
-				}
-	
-				public boolean next() {
-					if (DEBUG) System.err.println(".next: " + current + ", oldHasNext=" + hasNext);
-					boolean next = hasNext;
-					hasNext = false;
-					return next;
-				}
-	
-				public int read(int[] docs, int[] freqs) {
-					if (DEBUG) System.err.println(".read: " + docs.length);
-					if (!hasNext) return 0;
-					hasNext = false;
-					docs[0] = 0;
-					freqs[0] = freq();
-					return 1;
-				}
-	
-				public boolean skipTo(int target) {
-					if (DEBUG) System.err.println(".skipTo: " + target);
-					return next();
-				}
-	
-				public void close() {
-					if (DEBUG) System.err.println(".close");
-				}
-				
-				public int nextPosition() { // implements TermPositions
-					int pos = current.get(cursor);
-					cursor += stride;
-					if (DEBUG) System.err.println(".nextPosition: " + pos);
-					return pos;
-				}
-			};
-		}
-	
-		public TermDocs termDocs() {
-			if (DEBUG) System.err.println("MemoryIndexReader.termDocs");
-			return termPositions();
-		}
-	
-		public TermFreqVector[] getTermFreqVectors(int docNumber) {
-			if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors");
-			TermFreqVector[] vectors = new TermFreqVector[fields.size()];
-//			if (vectors.length == 0) return null;
-			Iterator iter = fields.keySet().iterator();
-			for (int i=0; i < vectors.length; i++) {
-				String fieldName = (String) iter.next();
-				vectors[i] = getTermFreqVector(docNumber, fieldName);
-			}
-			return vectors;
-		}
-		
-		public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
-			if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
-			final Info info = getInfo(fieldName);
-			if (info == null) return null; // TODO: or return empty vector impl???
-			info.sortTerms();
-			
-			return new TermPositionVector() { 
-	
-				private final Map.Entry[] sortedTerms = info.sortedTerms;
-				
-				public String getField() {
-					return fieldName;
-				}
-	
-				public int size() {
-					return sortedTerms.length;
-				}
-	
-				public String[] getTerms() {
-					String[] terms = new String[sortedTerms.length];
-					for (int i=sortedTerms.length; --i >= 0; ) {
-						terms[i] = (String) sortedTerms[i].getKey();
-					}
-					return terms;
-				}
-	
-				public int[] getTermFrequencies() {
-					int[] freqs = new int[sortedTerms.length];
-					for (int i=sortedTerms.length; --i >= 0; ) {
-						freqs[i] = numPositions((ArrayIntList) sortedTerms[i].getValue());
-					}
-					return freqs;
-				}
-	
-				public int indexOf(String term) {
-					int i = Arrays.binarySearch(sortedTerms, term, termComparator);
-					return i >= 0 ? i : -1;
-				}
-	
-				public int[] indexesOf(String[] terms, int start, int len) {
-					int[] indexes = new int[len];
-					for (int i=0; i < len; i++) {
-						indexes[i] = indexOf(terms[start++]);
-					}
-					return indexes;
-				}
-				
-				// lucene >= 1.4.3
-				public int[] getTermPositions(int index) {
-					return ((ArrayIntList) sortedTerms[index].getValue()).toArray(stride);
-				} 
-				
-				// lucene >= 1.9 (remove this method for lucene-1.4.3)
-				public org.apache.lucene.index.TermVectorOffsetInfo[] getOffsets(int index) {
-					if (stride == 1) return null; // no offsets stored
-					
-					ArrayIntList positions = (ArrayIntList) sortedTerms[index].getValue();
-					int size = positions.size();
-					org.apache.lucene.index.TermVectorOffsetInfo[] offsets = 
-						new org.apache.lucene.index.TermVectorOffsetInfo[size / stride];
-					
-					for (int i=0, j=1; j < size; i++, j += stride) {
-						int start = positions.get(j);
-						int end = positions.get(j+1);
-						offsets[i] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end);
-					}
-					return offsets;
-				}
-
-			};
-		}
-
-		private Similarity getSimilarity() {
-			if (searcher != null) return searcher.getSimilarity();
-			return Similarity.getDefault();
-		}
-		
-		private void setSearcher(Searcher searcher) {
-			this.searcher = searcher;
-		}
-		
-		/** performance hack: cache norms to avoid repeated expensive calculations */
-		private byte[] cachedNorms;
-		private String cachedFieldName;
-		private Similarity cachedSimilarity;
-		
-		public byte[] norms(String fieldName) {
-			byte[] norms = cachedNorms;
-			Similarity sim = getSimilarity();
-			if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached?
-				Info info = getInfo(fieldName);
-				int numTokens = info != null ? info.numTokens : 0;
-				float n = sim.lengthNorm(fieldName, numTokens);
-				byte norm = Similarity.encodeNorm(n);
-				norms = new byte[] {norm};
-				
-				cachedNorms = norms;
-				cachedFieldName = fieldName;
-				cachedSimilarity = sim;
-				if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens);
-			}
-			return norms;
-		}
-	
-		public void norms(String fieldName, byte[] bytes, int offset) {
-			if (DEBUG) System.err.println("MemoryIndexReader.norms*: " + fieldName);
-			byte[] norms = norms(fieldName);
-			System.arraycopy(norms, 0, bytes, offset, norms.length);
-		}
-	
-		protected void doSetNorm(int doc, String fieldName, byte value) {
-			throw new UnsupportedOperationException();
-		}
-	
-		public int numDocs() {
-			if (DEBUG) System.err.println("MemoryIndexReader.numDocs");
-			return fields.size() > 0 ? 1 : 0;
-		}
-	
-		public int maxDoc() {
-			if (DEBUG) System.err.println("MemoryIndexReader.maxDoc");
-			return 1;
-		}
-	
-		public Document document(int n) {
-			if (DEBUG) System.err.println("MemoryIndexReader.document");
-			return new Document(); // there are no stored fields
-		}
+  /** info for each field: Map<String fieldName, Info field> */
+  private final HashMap fields = new HashMap();
+  
+  /** fields sorted ascending by fieldName; lazily computed on demand */
+  private transient Map.Entry[] sortedFields; 
+  
+  /** pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2] */
+  private final int stride;
+  
+  private static final long serialVersionUID = 2782195016849084649L;
+
+  private static final boolean DEBUG = false;
+  
+  /**
+   * Sorts term entries into ascending order; also works for
+   * Arrays.binarySearch() and Arrays.sort()
+   */
+  private static final Comparator termComparator = new Comparator() {
+    public int compare(Object o1, Object o2) {
+      if (o1 instanceof Map.Entry) o1 = ((Map.Entry) o1).getKey();
+      if (o2 instanceof Map.Entry) o2 = ((Map.Entry) o2).getKey();
+      if (o1 == o2) return 0;
+      return ((String) o1).compareTo((String) o2);
+    }
+  };
+
+  /**
+   * Constructs an empty instance.
+   */
+  public MemoryIndex() {
+    this(false);
+  }
+  
+  /**
+   * Constructs an empty instance that can optionally store the start and end
+   * character offset of each token term in the text. This can be useful for
+   * highlighting of hit locations with the Lucene highlighter package.
+   * Private until the highlighter package matures, so that this can actually
+   * be meaningfully integrated.
+   * 
+   * @param storeOffsets
+   *            whether or not to store the start and end character offset of
+   *            each token term in the text
+   */
+  private MemoryIndex(boolean storeOffsets) {
+    this.stride = storeOffsets ? 3 : 1;
+  }
+  
+  /**
+   * Convenience method; Tokenizes the given field text and adds the resulting
+   * terms to the index; Equivalent to adding a tokenized, indexed,
+   * termVectorStored, unstored, non-keyword Lucene
+   * {@link org.apache.lucene.document.Field}.
+   * 
+   * @param fieldName
+   *            a name to be associated with the text
+   * @param text
+   *            the text to tokenize and index.
+   * @param analyzer
+   *            the analyzer to use for tokenization
+   */
+  public void addField(String fieldName, String text, Analyzer analyzer) {
+    if (fieldName == null)
+      throw new IllegalArgumentException("fieldName must not be null");
+    if (text == null)
+      throw new IllegalArgumentException("text must not be null");
+    if (analyzer == null)
+      throw new IllegalArgumentException("analyzer must not be null");
+    
+    TokenStream stream;
+    if (analyzer instanceof PatternAnalyzer) {
+      stream = ((PatternAnalyzer) analyzer).tokenStream(fieldName, text);
+    } else {
+      stream = analyzer.tokenStream(fieldName, 
+          new PatternAnalyzer.FastStringReader(text));
+    }
+    addField(fieldName, stream);
+  }
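+
+  /*
+   * Illustrative usage sketch for the analyzer-based addField() above (not
+   * part of this commit; StandardAnalyzer is assumed to come from the core
+   * Lucene distribution):
+   *
+   *   MemoryIndex index = new MemoryIndex();
+   *   index.addField("title", "Lucene in-memory index", new StandardAnalyzer());
+   *   index.addField("content", "a single document held entirely in main memory",
+   *       new StandardAnalyzer());
+   *   // the index now holds one virtual document with two indexed fields
+   */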
+  
+  /**
+   * Convenience method; Creates and returns a token stream that generates a
+   * token for each keyword in the given collection, "as is", without any
+   * transforming text analysis. The resulting token stream can be fed into
+   * {@link #addField(String, TokenStream)}, perhaps wrapped into another
+   * {@link org.apache.lucene.analysis.TokenFilter}, as desired.
+   * 
+   * @param keywords
+   *            the keywords to generate tokens for
+   * @return the corresponding token stream
+   */
+  public TokenStream keywordTokenStream(final Collection keywords) {
+    // TODO: deprecate & move this method into AnalyzerUtil?
+    if (keywords == null)
+      throw new IllegalArgumentException("keywords must not be null");
+    
+    return new TokenStream() {
+      private Iterator iter = keywords.iterator();
+      private int start = 0;
+      public Token next() {
+        if (!iter.hasNext()) return null;
+        
+        Object obj = iter.next();
+        if (obj == null) 
+          throw new IllegalArgumentException("keyword must not be null");
+        
+        String term = obj.toString();
+        Token token = new Token(term, start, start + term.length());
+        start += term.length() + 1; // separate words by 1 (blank) character
+        return token;
+      }
+    };
+  }
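+
+  /*
+   * Illustrative usage sketch for keywordTokenStream() (not part of this
+   * commit): the resulting stream is fed into the TokenStream overload of
+   * addField(); the field name "keywords" is only an example.
+   *
+   *   MemoryIndex index = new MemoryIndex();
+   *   Collection keywords = Arrays.asList(new String[] {"jakarta", "apache", "lucene"});
+   *   index.addField("keywords", index.keywordTokenStream(keywords));
+   */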
+  
+  /**
+   * Iterates over the given token stream and adds the resulting terms to the index;
+   * Equivalent to adding a tokenized, indexed, termVectorStored, unstored
+   * Lucene {@link org.apache.lucene.document.Field}.
+   * Finally closes the token stream. Note that untokenized keywords can be added with this method via 
+   * {@link #keywordTokenStream(Collection)}, the Lucene contrib <code>KeywordTokenizer</code> or similar utilities.
+   * 
+   * @param fieldName
+   *            a name to be associated with the text
+   * @param stream
+   *            the token stream to retrieve tokens from.
+   */
+  public void addField(String fieldName, TokenStream stream) {
+    /*
+     * Note that this method signature avoids having a user call new
+     * o.a.l.d.Field(...) which would be much too expensive due to the
+     * String.intern() usage of that class.
+     * 
+     * More often than not, String.intern() leads to serious performance
+     * degradations rather than improvements! If you're curious why, check
+     * out the JDK's native code, see how it oscillates multiple times back
+     * and forth between Java code and native code on each intern() call,
+     * only to end up using a plain vanilla java.util.HashMap on the Java
+     * heap for its interned strings! String.equals() has a small cost
+     * compared to String.intern(), trust me. Application level interning
+     * (e.g. a HashMap per Directory/Index) typically leads to better
+     * solutions than frequent hidden low-level calls to String.intern().
+     * 
+     * Perhaps with some luck, Lucene's Field.java (and Term.java) and
+     * cousins could be fixed to not use String.intern(). Sigh :-(
+     */
+    try {
+      if (fieldName == null)
+        throw new IllegalArgumentException("fieldName must not be null");
+      if (stream == null)
+        throw new IllegalArgumentException("token stream must not be null");
+      if (fields.get(fieldName) != null)
+        throw new IllegalArgumentException("field must not be added more than once");
+      
+      HashMap terms = new HashMap();
+      int numTokens = 0;
+      int pos = -1;
+      Token token;
+      
+      while ((token = stream.next()) != null) {
+        String term = token.termText();
+        if (term.length() == 0) continue; // nothing to do
+//        if (DEBUG) System.err.println("token='" + term + "'");
+        numTokens++;
+        pos += token.getPositionIncrement();
+        
+        ArrayIntList positions = (ArrayIntList) terms.get(term);
+        if (positions == null) { // term not seen before
+          positions = new ArrayIntList(stride);
+          terms.put(term, positions);
+        }
+        if (stride == 1) {
+          positions.add(pos);
+        } else {
+          positions.add(pos, token.startOffset(), token.endOffset());
+        }
+      }
+      
+      // ensure info.numTokens > 0 invariant; needed for correct operation of terms()
+      if (numTokens > 0) {
+        fields.put(fieldName, new Info(terms, numTokens));
+        sortedFields = null;    // invalidate sorted view, if any
+      }
+    } catch (IOException e) { // can never happen
+      throw new RuntimeException(e);
+    } finally {
+      try {
+        if (stream != null) stream.close();
+      } catch (IOException e2) {
+        throw new RuntimeException(e2);
+      }
+    }
+  }
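+
+  /*
+   * Illustrative usage sketch for the TokenStream-based addField() above (not
+   * part of this commit): any token stream can be fed in, optionally wrapped
+   * into further filters; WhitespaceAnalyzer, LowerCaseFilter and StringReader
+   * are assumed to come from core Lucene and the JDK.
+   *
+   *   MemoryIndex index = new MemoryIndex();
+   *   TokenStream stream = new WhitespaceAnalyzer().tokenStream(
+   *       "content", new StringReader("Some Mixed CASE Text"));
+   *   index.addField("content", new LowerCaseFilter(stream));
+   */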
+  
+  /**
+   * Creates and returns a searcher that can be used to execute arbitrary
+   * Lucene queries and to collect the resulting query results as hits.
+   * 
+   * @return a searcher
+   */
+  public IndexSearcher createSearcher() {
+    MemoryIndexReader reader = new MemoryIndexReader();
+    IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !!
+    reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity()
+    return searcher;
+  }
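+
+  /*
+   * Illustrative usage sketch for createSearcher() (not part of this commit):
+   * the returned IndexSearcher behaves like any other Lucene searcher over a
+   * one-document index; TermQuery and Hits come from core Lucene.
+   *
+   *   IndexSearcher searcher = index.createSearcher();
+   *   Hits hits = searcher.search(new TermQuery(new Term("content", "lucene")));
+   *   float score = hits.length() > 0 ? hits.score(0) : 0.0f;
+   */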
+  
+  /**
+   * Convenience method that efficiently returns the relevance score by
+   * matching this index against the given Lucene query expression.
+   * 
+   * @param query
+   *            an arbitrary Lucene query to run against this index
+   * @return the relevance score of the matchmaking; a number in the range
+   *         [0.0 .. 1.0], with 0.0 indicating no match. The higher the number
+   *         the better the match.
+   * @see org.apache.lucene.queryParser.QueryParser#parse(String)
+   */
+  public float search(Query query) {
+    if (query == null) 
+      throw new IllegalArgumentException("query must not be null");
+    
+    Searcher searcher = createSearcher();
+    try {
+      final float[] scores = new float[1]; // inits to 0.0f (no match)
+      searcher.search(query, new HitCollector() {
+        public void collect(int doc, float score) {
+          scores[0] = score;
+        }
+      });
+      float score = scores[0];
+      return score;
+    } catch (IOException e) { // can never happen (RAMDirectory)
+      throw new RuntimeException(e);
+    } finally {
+      // searcher.close();
+      /*
+       * Note that it is harmless and important for good performance to
+       * NOT close the index reader!!! This avoids all sorts of
+       * unnecessary baggage and locking in the Lucene IndexReader
+       * superclass, all of which is completely unnecessary for this main
+       * memory index data structure without thread-safety claims.
+       * 
+       * Wishing IndexReader would be an interface...
+       * 
+       * Actually with the new tight createSearcher() API auto-closing is now
+       * made impossible, hence searcher.close() would be harmless...
+       */
+    }   
+  }
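+
+  /*
+   * Illustrative usage sketch for search() (not part of this commit):
+   * QueryParser and StandardAnalyzer are assumed to come from core Lucene;
+   * a score of 0.0f means the query did not match the indexed text.
+   *
+   *   Query query = new QueryParser("content", new StandardAnalyzer())
+   *       .parse("+lucene +memory");
+   *   float score = index.search(query);
+   *   // the higher the score, the better the match
+   */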
+  
+  /**
+   * Returns a reasonable approximation of the main memory [bytes] consumed by
+   * this instance. Useful for smart memory-sensitive caches/pools. Assumes
+   * fieldNames are interned, whereas tokenized terms are memory-overlaid. For
+   * simplicity, assumes no VM word boundary alignment of instance vars.
+   * 
+   * @return the main memory consumption
+   */
+  public int getMemorySize() {
+    // for example usage in a smart cache see nux.xom.pool.Pool
+    int HEADER = 12; // object header of any java object
+    int PTR = 4; // pointer on 32 bit VMs
+    int ARR = HEADER + 4;
+    int STR = HEADER + 3*4 + PTR + ARR; // string
+    int INTARRLIST = HEADER + 4 + PTR + ARR;
+    int HASHMAP = HEADER + 4*PTR + 4*4 + ARR;
+    
+    int size = 0;
+    size += HEADER + 2*PTR + 4; // memory index
+    if (sortedFields != null) size += ARR + PTR * sortedFields.length;
+    
+    size += HASHMAP + fields.size() * (PTR + HEADER + 3*PTR + 4); // Map.entries
+    Iterator iter = fields.entrySet().iterator();
+    while (iter.hasNext()) { // for each Field Info
+      Map.Entry entry = (Map.Entry) iter.next();      
+      Info info = (Info) entry.getValue();
+      size += HEADER + 4 + PTR + PTR + PTR; // Info instance vars
+      if (info.sortedTerms != null) size += ARR + PTR * info.sortedTerms.length;
+      
+      int len = info.terms.size();
+      size += HASHMAP + len * (PTR + HEADER + 3*PTR + 4); // Map.entries
+      Iterator iter2 = info.terms.entrySet().iterator();
+      while (--len >= 0) { // for each term
+        Map.Entry e = (Map.Entry) iter2.next();
+        size += STR - ARR; // assumes substring() memory overlay
+//        size += STR + 2 * ((String) e.getKey()).length();
+        ArrayIntList positions = (ArrayIntList) e.getValue();
+        size += INTARRLIST + 4*positions.size();
+      }
+    }
+    return size;
+  } 
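+
+  /*
+   * Illustrative usage sketch for getMemorySize() (not part of this commit):
+   * the approximate footprint can drive a simple size-bounded cache; the
+   * 1 MB threshold below is an arbitrary example value.
+   *
+   *   if (index.getMemorySize() > 1024 * 1024) {
+   *     // evict this index from the cache/pool, or refrain from caching it
+   *   }
+   */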
+
+  private int numPositions(ArrayIntList positions) {
+    return positions.size() / stride;
+  }
+  
+  /** sorts into ascending order (on demand), reusing memory along the way */
+  private void sortFields() {
+    if (sortedFields == null) sortedFields = sort(fields);
+  }
+  
+  /** returns a view of the given map's entries, sorted ascending by key */
+  private static Map.Entry[] sort(HashMap map) {
+    int size = map.size();
+    Map.Entry[] entries = new Map.Entry[size];
+    
+    Iterator iter = map.entrySet().iterator();
+    for (int i=0; i < size; i++) {
+      entries[i] = (Map.Entry) iter.next();
+    }
+    
+    if (size > 1) Arrays.sort(entries, termComparator);
+    return entries;
+  }
+  
+  /**
+   * Returns a String representation of the index data for debugging purposes.
+   * 
+   * @return the string representation
+   */
+  public String toString() {
+    StringBuffer result = new StringBuffer(256);    
+    sortFields();   
+    int sumChars = 0;
+    int sumPositions = 0;
+    int sumTerms = 0;
+    
+    for (int i=0; i < sortedFields.length; i++) {
+      Map.Entry entry = sortedFields[i];
+      String fieldName = (String) entry.getKey();
+      Info info = (Info) entry.getValue();
+      info.sortTerms();
+      result.append(fieldName + ":\n");
+      
+      int numChars = 0;
+      int numPositions = 0;
+      for (int j=0; j < info.sortedTerms.length; j++) {
+        Map.Entry e = info.sortedTerms[j];
+        String term = (String) e.getKey();
+        ArrayIntList positions = (ArrayIntList) e.getValue();
+        result.append("\t'" + term + "':" + numPositions(positions) + ":");
+        result.append(positions.toString(stride)); // ignore offsets
+        result.append("\n");
+        numPositions += numPositions(positions);
+        numChars += term.length();
+      }
+      
+      result.append("\tterms=" + info.sortedTerms.length);
+      result.append(", positions=" + numPositions);
+      result.append(", Kchars=" + (numChars/1000.0f));
+      result.append("\n");
+      sumPositions += numPositions;
+      sumChars += numChars;
+      sumTerms += info.sortedTerms.length;
+    }
+    
+    result.append("\nfields=" + sortedFields.length);
+    result.append(", terms=" + sumTerms);
+    result.append(", positions=" + sumPositions);
+    result.append(", Kchars=" + (sumChars/1000.0f));
+    return result.toString();
+  }
+  
+  
+  ///////////////////////////////////////////////////////////////////////////////
+  // Nested classes:
+  ///////////////////////////////////////////////////////////////////////////////
+  /**
+   * Index data structure for a field; Contains the tokenized term texts and
+   * their positions.
+   */
+  private static final class Info implements Serializable {
+    
+    /**
+     * Term strings and their positions for this field: Map<String
+     * termText, ArrayIntList positions>
+     */
+    private final HashMap terms; 
+    
+    /** Terms sorted ascending by term text; computed on demand */
+    private transient Map.Entry[] sortedTerms;
+    
+    /** Number of added tokens for this field */
+    private final int numTokens;
+    
+    /** Term for this field's fieldName, lazily computed on demand */
+    public transient Term template;
+
+    private static final long serialVersionUID = 2882195016849084649L;  
+
+    public Info(HashMap terms, int numTokens) {
+      this.terms = terms;
+      this.numTokens = numTokens;
+    }
+    
+    /**
+     * Sorts hashed terms into ascending order, reusing memory along the
+     * way. Note that sorting is lazily delayed until required (often it's
+     * not required at all). If a sorted view is required then hashing +
+     * sort + binary search is still faster and smaller than TreeMap usage
+     * (which would be an alternative and somewhat more elegant approach,
+     * apart from more sophisticated Tries / prefix trees).
+     */
+    public void sortTerms() {
+      if (sortedTerms == null) sortedTerms = sort(terms);
+    }
+        
+    /** note that the frequency can be calculated as numPositions(getPositions(x)) */
+    public ArrayIntList getPositions(String term) {
+      return (ArrayIntList) terms.get(term);
+    }
+
+    /** note that the frequency can be calculated as numPositions(getPositions(x)) */
+    public ArrayIntList getPositions(int pos) {
+      return (ArrayIntList) sortedTerms[pos].getValue();
+    }
+    
+  }
+  
+  
+  ///////////////////////////////////////////////////////////////////////////////
+  // Nested classes:
+  ///////////////////////////////////////////////////////////////////////////////
+  /**
+   * Efficient resizable auto-expanding list holding <code>int</code> elements;
+   * implemented with arrays.
+   */
+  private static final class ArrayIntList implements Serializable {
+
+    private int[] elements;
+    private int size = 0;
+    
+    private static final long serialVersionUID = 2282195016849084649L;  
+      
+    public ArrayIntList() {
+      this(10);
+    }
+
+    public ArrayIntList(int initialCapacity) {
+      elements = new int[initialCapacity];
+    }
+
+    public void add(int elem) {
+      if (size == elements.length) ensureCapacity(size + 1);
+      elements[size++] = elem;
+    }
+
+    public void add(int pos, int start, int end) {
+      if (size + 3 > elements.length) ensureCapacity(size + 3);
+      elements[size] = pos;
+      elements[size+1] = start;
+      elements[size+2] = end;
+      size += 3;
+    }
+
+    public int get(int index) {
+      if (index >= size) throwIndex(index);
+      return elements[index];
+    }
+    
+    public int size() {
+      return size;
+    }
+    
+    public int[] toArray(int stride) {
+      int[] arr = new int[size() / stride];
+      if (stride == 1)
+        System.arraycopy(elements, 0, arr, 0, size); // fast path
+      else 
+        for (int i=0, j=0; j < size; i++, j += stride) arr[i] = elements[j];
+      return arr;
+    }
+    
+    private void ensureCapacity(int minCapacity) {
+      int newCapacity = Math.max(minCapacity, (elements.length * 3) / 2 + 1);
+      int[] newElements = new int[newCapacity];
+      System.arraycopy(elements, 0, newElements, 0, size);
+      elements = newElements;
+    }
+
+    private void throwIndex(int index) {
+      throw new IndexOutOfBoundsException("index: " + index
+            + ", size: " + size);
+    }
+    
+    /** returns the first few positions (without offsets); debug only */
+    public String toString(int stride) {
+      int s = size() / stride;
+      int len = Math.min(10, s); // avoid printing huge lists
+      StringBuffer buf = new StringBuffer(4*len);
+      buf.append("[");
+      for (int i = 0; i < len; i++) {
+        buf.append(get(i*stride));
+        if (i < len-1) buf.append(", ");
+      }
+      if (len != s) buf.append(", ..."); // and some more...
+      buf.append("]");
+      return buf.toString();
+    }   
+  }
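+
+  /*
+   * Illustrative sketch of the packed layout used above (not part of this
+   * commit): with offsets enabled (stride == 3) each token contributes the
+   * triple [position, startOffset, endOffset]; with stride == 1 only the
+   * position is stored.
+   *
+   *   ArrayIntList list = new ArrayIntList(3);
+   *   list.add(0, 0, 5);                  // first token:  pos=0, offsets [0,5)
+   *   list.add(1, 6, 11);                 // second token: pos=1, offsets [6,11)
+   *   int[] positions = list.toArray(3);  // yields {0, 1}
+   */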
+  
+  
+  ///////////////////////////////////////////////////////////////////////////////
+  // Nested classes:
+  ///////////////////////////////////////////////////////////////////////////////
+  private static final Term MATCH_ALL_TERM = new Term("", "");
+    
+  /**
+   * Search support for Lucene framework integration; implements all methods
+   * required by the Lucene IndexReader contracts.
+   */
+  private final class MemoryIndexReader extends IndexReader {
+    
+    private Searcher searcher; // needed to find searcher.getSimilarity() 
+    
+    private MemoryIndexReader() {
+      super(null); // avoid as much superclass baggage as possible
+    }
+    
+    // lucene >= 1.9 or lucene-1.4.3 with patch removing "final" in superclass
+    protected void finalize() {}
+    
+    private Info getInfo(String fieldName) {
+      return (Info) fields.get(fieldName);
+    }
+    
+    private Info getInfo(int pos) {
+      return (Info) sortedFields[pos].getValue();
+    }
+    
+    public int docFreq(Term term) {
+      Info info = getInfo(term.field());
+      int freq = 0;
+      if (info != null) freq = info.getPositions(term.text()) != null ? 1 : 0;
+      if (DEBUG) System.err.println("MemoryIndexReader.docFreq: " + term + ", freq:" + freq);
+      return freq;
+    }
+  
+    public TermEnum terms() {
+      if (DEBUG) System.err.println("MemoryIndexReader.terms()");
+      return terms(MATCH_ALL_TERM);
+    }
+    
+    public TermEnum terms(Term term) {
+      if (DEBUG) System.err.println("MemoryIndexReader.terms: " + term);
+  
+      int i; // index into info.sortedTerms
+      int j; // index into sortedFields
+      
+      sortFields();
+      if (sortedFields.length == 1 && sortedFields[0].getKey() == term.field()) {
+        j = 0; // fast path
+      } else {
+        j = Arrays.binarySearch(sortedFields, term.field(), termComparator);
+      }
+      
+      if (j < 0) { // not found; choose successor
+        j = -j -1; 
+        i = 0;
+        if (j < sortedFields.length) getInfo(j).sortTerms();
+      }
+      else { // found
+        Info info = getInfo(j);
+        info.sortTerms();
+        i = Arrays.binarySearch(info.sortedTerms, term.text(), termComparator);
+        if (i < 0) { // not found; choose successor
+          i = -i -1;
+          if (i >= info.sortedTerms.length) { // move to next successor
+            j++;
+            i = 0;
+            if (j < sortedFields.length) getInfo(j).sortTerms();
+          }
+        }
+      }
+      final int ix = i;
+      final int jx = j;
+  
+      return new TermEnum() {
+  
+        private int i = ix; // index into info.sortedTerms
+        private int j = jx; // index into sortedFields
+          
+        public boolean next() {
+          if (DEBUG) System.err.println("TermEnum.next");
+          if (j >= sortedFields.length) return false;
+          Info info = getInfo(j);
+          if (++i < info.sortedTerms.length) return true;
+  
+          // move to successor
+          j++;
+          i = 0;
+          if (j >= sortedFields.length) return false;
+          getInfo(j).sortTerms();
+          return true;
+        }
+  
+        public Term term() {
+          if (DEBUG) System.err.println("TermEnum.term: " + i);
+          if (j >= sortedFields.length) return null;
+          Info info = getInfo(j);
+          if (i >= info.sortedTerms.length) return null;
+//          if (DEBUG) System.err.println("TermEnum.term: " + i + ", " + info.sortedTerms[i].getKey());
+          return createTerm(info, j, (String) info.sortedTerms[i].getKey());
+        }
+        
+        public int docFreq() {
+          if (DEBUG) System.err.println("TermEnum.docFreq");
+          if (j >= sortedFields.length) return 0;
+          Info info = getInfo(j);
+          if (i >= info.sortedTerms.length) return 0;
+          return numPositions(info.getPositions(i));
+        }
+  
+        public void close() {
+          if (DEBUG) System.err.println("TermEnum.close");
+        }
+        
+        /** Returns a new Term object, minimizing String.intern() overheads. */
+        private Term createTerm(Info info, int pos, String text) { 
+          // Assertion: sortFields has already been called before
+          Term template = info.template;
+          if (template == null) { // not yet cached?
+            String fieldName = (String) sortedFields[pos].getKey();
+            template = new Term(fieldName, "");
+            info.template = template;
+          }
+          
+          return template.createTerm(text);
+        }
+        
+      };
+    }
+  
+    public TermPositions termPositions() {
+      if (DEBUG) System.err.println("MemoryIndexReader.termPositions");
+      
+      return new TermPositions() {
+  
+        private boolean hasNext;
+        private int cursor = 0;
+        private ArrayIntList current;
+        
+        public void seek(Term term) {
+          if (DEBUG) System.err.println(".seek: " + term);
+          Info info = getInfo(term.field());
+          current = info == null ? null : info.getPositions(term.text());
+          hasNext = (current != null);
+          cursor = 0;
+        }
+  
+        public void seek(TermEnum termEnum) {
+          if (DEBUG) System.err.println(".seekEnum");
+          seek(termEnum.term());
+        }
+  
+        public int doc() {
+          if (DEBUG) System.err.println(".doc");
+          return 0;
+        }
+  
+        public int freq() {
+          int freq = current != null ? numPositions(current) : 0;
+          if (DEBUG) System.err.println(".freq: " + freq);
+          return freq;
+        }
+  
+        public boolean next() {
+          if (DEBUG) System.err.println(".next: " + current + ", oldHasNext=" + hasNext);
+          boolean next = hasNext;
+          hasNext = false;
+          return next;
+        }
+  
+        public int read(int[] docs, int[] freqs) {
+          if (DEBUG) System.err.println(".read: " + docs.length);
+          if (!hasNext) return 0;
+          hasNext = false;
+          docs[0] = 0;
+          freqs[0] = freq();
+          return 1;
+        }
+  
+        public boolean skipTo(int target) {
+          if (DEBUG) System.err.println(".skipTo: " + target);
+          return next();
+        }
+  
+        public void close() {
+          if (DEBUG) System.err.println(".close");
+        }
+        
+        public int nextPosition() { // implements TermPositions
+          int pos = current.get(cursor);
+          cursor += stride;
+          if (DEBUG) System.err.println(".nextPosition: " + pos);
+          return pos;
+        }
+      };
+    }
+  
+    public TermDocs termDocs() {
+      if (DEBUG) System.err.println("MemoryIndexReader.termDocs");
+      return termPositions();
+    }
+  
+    public TermFreqVector[] getTermFreqVectors(int docNumber) {
+      if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors");
+      TermFreqVector[] vectors = new TermFreqVector[fields.size()];
+//      if (vectors.length == 0) return null;
+      Iterator iter = fields.keySet().iterator();
+      for (int i=0; i < vectors.length; i++) {
+        String fieldName = (String) iter.next();
+        vectors[i] = getTermFreqVector(docNumber, fieldName);
+      }
+      return vectors;
+    }
+    
+    public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
+      if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
+      final Info info = getInfo(fieldName);
+      if (info == null) return null; // TODO: or return empty vector impl???
+      info.sortTerms();
+      
+      return new TermPositionVector() { 
+  
+        private final Map.Entry[] sortedTerms = info.sortedTerms;
+        
+        public String getField() {
+          return fieldName;
+        }
+  
+        public int size() {
+          return sortedTerms.length;
+        }
+  
+        public String[] getTerms() {
+          String[] terms = new String[sortedTerms.length];
+          for (int i=sortedTerms.length; --i >= 0; ) {
+            terms[i] = (String) sortedTerms[i].getKey();
+          }
+          return terms;
+        }
+  
+        public int[] getTermFrequencies() {
+          int[] freqs = new int[sortedTerms.length];
+          for (int i=sortedTerms.length; --i >= 0; ) {
+            freqs[i] = numPositions((ArrayIntList) sortedTerms[i].getValue());
+          }
+          return freqs;
+        }
+  
+        public int indexOf(String term) {
+          int i = Arrays.binarySearch(sortedTerms, term, termComparator);
+          return i >= 0 ? i : -1;
+        }
+  
+        public int[] indexesOf(String[] terms, int start, int len) {
+          int[] indexes = new int[len];
+          for (int i=0; i < len; i++) {
+            indexes[i] = indexOf(terms[start++]);
+          }
+          return indexes;
+        }
+        
+        // lucene >= 1.4.3
+        public int[] getTermPositions(int index) {
+          return ((ArrayIntList) sortedTerms[index].getValue()).toArray(stride);
+        } 
+        
+        // lucene >= 1.9 (remove this method for lucene-1.4.3)
+        public org.apache.lucene.index.TermVectorOffsetInfo[] getOffsets(int index) {
+          if (stride == 1) return null; // no offsets stored
+          
+          ArrayIntList positions = (ArrayIntList) sortedTerms[index].getValue();
+          int size = positions.size();
+          org.apache.lucene.index.TermVectorOffsetInfo[] offsets = 
+            new org.apache.lucene.index.TermVectorOffsetInfo[size / stride];
+          
+          for (int i=0, j=1; j < size; i++, j += stride) {
+            int start = positions.get(j);
+            int end = positions.get(j+1);
+            offsets[i] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end);
+          }
+          return offsets;
+        }
+
+      };
+    }
+
+    private Similarity getSimilarity() {
+      if (searcher != null) return searcher.getSimilarity();
+      return Similarity.getDefault();
+    }
+    
+    private void setSearcher(Searcher searcher) {
+      this.searcher = searcher;
+    }
+    
+    /** performance hack: cache norms to avoid repeated expensive calculations */
+    private byte[] cachedNorms;
+    private String cachedFieldName;
+    private Similarity cachedSimilarity;
+    
+    public byte[] norms(String fieldName) {
+      byte[] norms = cachedNorms;
+      Similarity sim = getSimilarity();
+      if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached?
+        Info info = getInfo(fieldName);
+        int numTokens = info != null ? info.numTokens : 0;
+        float n = sim.lengthNorm(fieldName, numTokens);
+        byte norm = Similarity.encodeNorm(n);
+        norms = new byte[] {norm};
+        
+        cachedNorms = norms;
+        cachedFieldName = fieldName;
+        cachedSimilarity = sim;
+        if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens);
+      }
+      return norms;
+    }
+  
+    public void norms(String fieldName, byte[] bytes, int offset) {
+      if (DEBUG) System.err.println("MemoryIndexReader.norms*: " + fieldName);
+      byte[] norms = norms(fieldName);
+      System.arraycopy(norms, 0, bytes, offset, norms.length);
+    }
+  
+    protected void doSetNorm(int doc, String fieldName, byte value) {
+      throw new UnsupportedOperationException();
+    }
+  
+    public int numDocs() {
+      if (DEBUG) System.err.println("MemoryIndexReader.numDocs");
+      return fields.size() > 0 ? 1 : 0;
+    }
+  
+    public int maxDoc() {
+      if (DEBUG) System.err.println("MemoryIndexReader.maxDoc");
+      return 1;
+    }
+  
+    public Document document(int n) {
+      if (DEBUG) System.err.println("MemoryIndexReader.document");
+      return new Document(); // there are no stored fields
+    }
 
     //When we convert to JDK 1.5 make this Set<String>
     public Document document(int n, FieldSelector fieldSelector) throws IOException {
       if (DEBUG) System.err.println("MemoryIndexReader.document");
-			return new Document(); // there are no stored fields
+      return new Document(); // there are no stored fields
     }
 
     public boolean isDeleted(int n) {
-			if (DEBUG) System.err.println("MemoryIndexReader.isDeleted");
-			return false;
-		}
-	
-		public boolean hasDeletions() {
-			if (DEBUG) System.err.println("MemoryIndexReader.hasDeletions");
-			return false;
-		}
-	
-		protected void doDelete(int docNum) {
-			throw new UnsupportedOperationException();
-		}
-	
-		protected void doUndeleteAll() {
-			throw new UnsupportedOperationException();
-		}
-	
-		protected void doCommit() {
-			if (DEBUG) System.err.println("MemoryIndexReader.doCommit");
-		}
-	
-		protected void doClose() {
-			if (DEBUG) System.err.println("MemoryIndexReader.doClose");
-		}
-		
-		// lucene >= 1.9 (remove this method for lucene-1.4.3)
-		public Collection getFieldNames(FieldOption fieldOption) {
-			if (DEBUG) System.err.println("MemoryIndexReader.getFieldNamesOption");
-			if (fieldOption == FieldOption.UNINDEXED) 
-				return Collections.EMPTY_SET;
-			if (fieldOption == FieldOption.INDEXED_NO_TERMVECTOR) 
-				return Collections.EMPTY_SET;
-			if (fieldOption == FieldOption.TERMVECTOR_WITH_OFFSET && stride == 1) 
-				return Collections.EMPTY_SET;
-			if (fieldOption == FieldOption.TERMVECTOR_WITH_POSITION_OFFSET && stride == 1) 
-				return Collections.EMPTY_SET;
-			
-			return Collections.unmodifiableSet(fields.keySet());
-		}
-	}
+      if (DEBUG) System.err.println("MemoryIndexReader.isDeleted");
+      return false;
+    }
+  
+    public boolean hasDeletions() {
+      if (DEBUG) System.err.println("MemoryIndexReader.hasDeletions");
+      return false;
+    }
+  
+    protected void doDelete(int docNum) {
+      throw new UnsupportedOperationException();
+    }
+  
+    protected void doUndeleteAll() {
+      throw new UnsupportedOperationException();
+    }
+  
+    protected void doCommit() {
+      if (DEBUG) System.err.println("MemoryIndexReader.doCommit");
+    }
+  
+    protected void doClose() {
+      if (DEBUG) System.err.println("MemoryIndexReader.doClose");
+    }
+    
+    // lucene >= 1.9 (remove this method for lucene-1.4.3)
+    public Collection getFieldNames(FieldOption fieldOption) {
+      if (DEBUG) System.err.println("MemoryIndexReader.getFieldNamesOption");
+      if (fieldOption == FieldOption.UNINDEXED) 
+        return Collections.EMPTY_SET;
+      if (fieldOption == FieldOption.INDEXED_NO_TERMVECTOR) 
+        return Collections.EMPTY_SET;
+      if (fieldOption == FieldOption.TERMVECTOR_WITH_OFFSET && stride == 1) 
+        return Collections.EMPTY_SET;
+      if (fieldOption == FieldOption.TERMVECTOR_WITH_POSITION_OFFSET && stride == 1) 
+        return Collections.EMPTY_SET;
+      
+      return Collections.unmodifiableSet(fields.keySet());
+    }
+  }
 
 }


