lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject svn commit: r413584 [2/3] - in /lucene/java/trunk/contrib/memory/src: java/org/apache/lucene/index/memory/ test/org/apache/lucene/index/memory/
Date Mon, 12 Jun 2006 05:46:17 GMT
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java Sun Jun 11 22:46:16 2006
@@ -63,397 +63,397 @@
  * @author whoschek.AT.lbl.DOT.gov
  */
 public class PatternAnalyzer extends Analyzer {
-	
-	/** <code>"\\W+"</code>; Divides text at non-letters (Character.isLetter(c)) */
-	public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
-	
-	/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
-	public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
-	
-	private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
-		"a", "about", "above", "across", "adj", "after", "afterwards",
-		"again", "against", "albeit", "all", "almost", "alone", "along",
-		"already", "also", "although", "always", "among", "amongst", "an",
-		"and", "another", "any", "anyhow", "anyone", "anything",
-		"anywhere", "are", "around", "as", "at", "be", "became", "because",
-		"become", "becomes", "becoming", "been", "before", "beforehand",
-		"behind", "being", "below", "beside", "besides", "between",
-		"beyond", "both", "but", "by", "can", "cannot", "co", "could",
-		"down", "during", "each", "eg", "either", "else", "elsewhere",
-		"enough", "etc", "even", "ever", "every", "everyone", "everything",
-		"everywhere", "except", "few", "first", "for", "former",
-		"formerly", "from", "further", "had", "has", "have", "he", "hence",
-		"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
-		"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
-		"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
-		"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
-		"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
-		"must", "my", "myself", "namely", "neither", "never",
-		"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
-		"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
-		"once one", "only", "onto", "or", "other", "others", "otherwise",
-		"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
-		"rather", "s", "same", "seem", "seemed", "seeming", "seems",
-		"several", "she", "should", "since", "so", "some", "somehow",
-		"someone", "something", "sometime", "sometimes", "somewhere",
-		"still", "such", "t", "than", "that", "the", "their", "them",
-		"themselves", "then", "thence", "there", "thereafter", "thereby",
-		"therefor", "therein", "thereupon", "these", "they", "this",
-		"those", "though", "through", "throughout", "thru", "thus", "to",
-		"together", "too", "toward", "towards", "under", "until", "up",
-		"upon", "us", "very", "via", "was", "we", "well", "were", "what",
-		"whatever", "whatsoever", "when", "whence", "whenever",
-		"whensoever", "where", "whereafter", "whereas", "whereat",
-		"whereby", "wherefrom", "wherein", "whereinto", "whereof",
-		"whereon", "whereto", "whereunto", "whereupon", "wherever",
-		"wherewith", "whether", "which", "whichever", "whichsoever",
-		"while", "whilst", "whither", "who", "whoever", "whole", "whom",
-		"whomever", "whomsoever", "whose", "whosoever", "why", "will",
-		"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
-		"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
-		"yourselves"});
-		
-	/**
-	 * A lower-casing word analyzer with English stop words (can be shared
-	 * freely across threads without harm); global per class loader.
-	 */
-	public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
-		NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
-		
-	/**
-	 * A lower-casing word analyzer with <b>extended </b> English stop words
-	 * (can be shared freely across threads without harm); global per class
-	 * loader. The stop words are borrowed from
-	 * http://thomas.loc.gov/home/stopwords.html, see
-	 * http://thomas.loc.gov/home/all.about.inquery.html
-	 */
-	public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
-		NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
-		
-	private final Pattern pattern;
-	private final boolean toLowerCase;
-	private final Set stopWords;
-	
-	/**
-	 * Constructs a new instance with the given parameters.
-	 * 
-	 * @param pattern
-	 *            a regular expression delimiting tokens
-	 * @param toLowerCase
-	 *            if <code>true</code> returns tokens after applying
-	 *            String.toLowerCase()
-	 * @param stopWords
-	 *            if non-null, ignores all tokens that are contained in the
-	 *            given stop set (after previously having applied toLowerCase()
-	 *            if applicable). For example, created via
-	 *            {@link StopFilter#makeStopSet(String[])}and/or
-	 *            {@link org.apache.lucene.analysis.WordlistLoader}as in
-	 *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
-	 *            or <a href="http://www.unine.ch/info/clef/">other stop words
-	 *            lists </a>.
-	 */
-	public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
-		if (pattern == null) 
-			throw new IllegalArgumentException("pattern must not be null");
-		
-		if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
-		else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
-		
-		if (stopWords != null && stopWords.size() == 0) stopWords = null;
-		
-		this.pattern = pattern;
-		this.toLowerCase = toLowerCase;
-		this.stopWords = stopWords;
-	}
-	
-	/**
-	 * Creates a token stream that tokenizes the given string into token terms
-	 * (aka words).
-	 * 
-	 * @param fieldName
-	 *            the name of the field to tokenize (currently ignored).
-	 * @param text
-	 *            the string to tokenize
-	 * @return a new token stream
-	 */
-	public TokenStream tokenStream(String fieldName, String text) {
-		// Ideally the Analyzer superclass should have a method with the same signature, 
-		// with a default impl that simply delegates to the StringReader flavour. 
-		if (text == null) 
-			throw new IllegalArgumentException("text must not be null");
-		
-		TokenStream stream;
-		if (pattern == NON_WORD_PATTERN) { // fast path
-			stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
-		}
-		else if (pattern == WHITESPACE_PATTERN) { // fast path
-			stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
-		}
-		else {
-			stream = new PatternTokenizer(text, pattern, toLowerCase);
-			if (stopWords != null) stream = new StopFilter(stream, stopWords);
-		}
-		
-		return stream;
-	}
-	
-	/**
-	 * Creates a token stream that tokenizes all the text in the given Reader;
-	 * This implementation forwards to <code>tokenStream(String, String)</code> and is
-	 * less efficient than <code>tokenStream(String, String)</code>.
-	 * 
-	 * @param fieldName
-	 *            the name of the field to tokenize (currently ignored).
-	 * @param reader
-	 *            the reader delivering the text
-	 * @return a new token stream
-	 */
-	public TokenStream tokenStream(String fieldName, Reader reader) {
-		if (reader instanceof FastStringReader) { // fast path
-			return tokenStream(fieldName, ((FastStringReader)reader).getString());
-		}
-		
-		try {
-			String text = toString(reader);
-			return tokenStream(fieldName, text);
-		} catch (IOException e) {
-			throw new RuntimeException(e);
-		}
-	}
-	
-	/**
-	 * Indicates whether some other object is "equal to" this one.
-	 * 
-	 * @param other
-	 *            the reference object with which to compare.
-	 * @return true if equal, false otherwise
-	 */
-	public boolean equals(Object other) {
-		if (this  == other) return true;
-		if (this  == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
-		if (other == DEFAULT_ANALYZER && this  == EXTENDED_ANALYZER) return false;
-		
-		if (other instanceof PatternAnalyzer) {
-			PatternAnalyzer p2 = (PatternAnalyzer) other;
-			return 
-				toLowerCase == p2.toLowerCase &&
-				eqPattern(pattern, p2.pattern) &&
-				eq(stopWords, p2.stopWords);
-		}
-		return false;
-	}
-	
-	/**
-	 * Returns a hash code value for the object.
-	 * 
-	 * @return the hash code.
-	 */
-	public int hashCode() {
-		if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
-		if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
-		
-		int h = 1;
-		h = 31*h + pattern.pattern().hashCode();
-		h = 31*h + pattern.flags();
-		h = 31*h + (toLowerCase ? 1231 : 1237);
-		h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
-		return h;
-	}
-	
-	/** equality where o1 and/or o2 can be null */
-	private static boolean eq(Object o1, Object o2) {
-		return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
-	}
-	
-	/** assumes p1 and p2 are not null */
-	private static boolean eqPattern(Pattern p1, Pattern p2) {
-		return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
-	}
-		
-	/**
-	 * Reads until end-of-stream and returns all read chars, finally closes the stream.
-	 * 
-	 * @param input the input stream
-	 * @throws IOException if an I/O error occurs while reading the stream
-	 */
-	private static String toString(Reader input) throws IOException {
-		try {
-			int len = 256;
-			char[] buffer = new char[len];
-			char[] output = new char[len];
-			
-			len = 0;
-			int n;
-			while ((n = input.read(buffer)) >= 0) {
-				if (len + n > output.length) { // grow capacity
-					char[] tmp = new char[Math.max(output.length << 1, len + n)];
-					System.arraycopy(output, 0, tmp, 0, len);
-					System.arraycopy(buffer, 0, tmp, len, n);
-					buffer = output; // use larger buffer for future larger bulk reads
-					output = tmp;
-				} else {
-					System.arraycopy(buffer, 0, output, len, n);
-				}
-				len += n;
-			}
+  
+  /** <code>"\\W+"</code>; Divides text at non-letters (Character.isLetter(c)) */
+  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
+  
+  /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
+  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
+  
+  private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
+    "a", "about", "above", "across", "adj", "after", "afterwards",
+    "again", "against", "albeit", "all", "almost", "alone", "along",
+    "already", "also", "although", "always", "among", "amongst", "an",
+    "and", "another", "any", "anyhow", "anyone", "anything",
+    "anywhere", "are", "around", "as", "at", "be", "became", "because",
+    "become", "becomes", "becoming", "been", "before", "beforehand",
+    "behind", "being", "below", "beside", "besides", "between",
+    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
+    "down", "during", "each", "eg", "either", "else", "elsewhere",
+    "enough", "etc", "even", "ever", "every", "everyone", "everything",
+    "everywhere", "except", "few", "first", "for", "former",
+    "formerly", "from", "further", "had", "has", "have", "he", "hence",
+    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
+    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
+    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
+    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
+    "must", "my", "myself", "namely", "neither", "never",
+    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
+    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+    "once one", "only", "onto", "or", "other", "others", "otherwise",
+    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
+    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
+    "several", "she", "should", "since", "so", "some", "somehow",
+    "someone", "something", "sometime", "sometimes", "somewhere",
+    "still", "such", "t", "than", "that", "the", "their", "them",
+    "themselves", "then", "thence", "there", "thereafter", "thereby",
+    "therefor", "therein", "thereupon", "these", "they", "this",
+    "those", "though", "through", "throughout", "thru", "thus", "to",
+    "together", "too", "toward", "towards", "under", "until", "up",
+    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
+    "whatever", "whatsoever", "when", "whence", "whenever",
+    "whensoever", "where", "whereafter", "whereas", "whereat",
+    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
+    "whereon", "whereto", "whereunto", "whereupon", "wherever",
+    "wherewith", "whether", "which", "whichever", "whichsoever",
+    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
+    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
+    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
+    "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
+    "yourselves"});
+    
+  /**
+   * A lower-casing word analyzer with English stop words (can be shared
+   * freely across threads without harm); global per class loader.
+   */
+  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
+    NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
+    
+  /**
+   * A lower-casing word analyzer with <b>extended </b> English stop words
+   * (can be shared freely across threads without harm); global per class
+   * loader. The stop words are borrowed from
+   * http://thomas.loc.gov/home/stopwords.html, see
+   * http://thomas.loc.gov/home/all.about.inquery.html
+   */
+  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
+    NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
+    
+  private final Pattern pattern;
+  private final boolean toLowerCase;
+  private final Set stopWords;
+  
+  /**
+   * Constructs a new instance with the given parameters.
+   * 
+   * @param pattern
+   *            a regular expression delimiting tokens
+   * @param toLowerCase
+   *            if <code>true</code> returns tokens after applying
+   *            String.toLowerCase()
+   * @param stopWords
+   *            if non-null, ignores all tokens that are contained in the
+   *            given stop set (after previously having applied toLowerCase()
+   *            if applicable). For example, created via
+   *            {@link StopFilter#makeStopSet(String[])} and/or
+   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
+   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
+   *            or <a href="http://www.unine.ch/info/clef/">other stop words
+   *            lists </a>.
+   */
+  public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
+    if (pattern == null) 
+      throw new IllegalArgumentException("pattern must not be null");
+    
+    if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
+    else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
+    
+    if (stopWords != null && stopWords.size() == 0) stopWords = null;
+    
+    this.pattern = pattern;
+    this.toLowerCase = toLowerCase;
+    this.stopWords = stopWords;
+  }
+  
+  /**
+   * Creates a token stream that tokenizes the given string into token terms
+   * (aka words).
+   * 
+   * @param fieldName
+   *            the name of the field to tokenize (currently ignored).
+   * @param text
+   *            the string to tokenize
+   * @return a new token stream
+   */
+  public TokenStream tokenStream(String fieldName, String text) {
+    // Ideally the Analyzer superclass should have a method with the same signature, 
+    // with a default impl that simply delegates to the StringReader flavour. 
+    if (text == null) 
+      throw new IllegalArgumentException("text must not be null");
+    
+    TokenStream stream;
+    if (pattern == NON_WORD_PATTERN) { // fast path
+      stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
+    }
+    else if (pattern == WHITESPACE_PATTERN) { // fast path
+      stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
+    }
+    else {
+      stream = new PatternTokenizer(text, pattern, toLowerCase);
+      if (stopWords != null) stream = new StopFilter(stream, stopWords);
+    }
+    
+    return stream;
+  }
+  
+  /**
+   * Creates a token stream that tokenizes all the text in the given Reader;
+   * This implementation forwards to <code>tokenStream(String, String)</code> and is
+   * less efficient than <code>tokenStream(String, String)</code>.
+   * 
+   * @param fieldName
+   *            the name of the field to tokenize (currently ignored).
+   * @param reader
+   *            the reader delivering the text
+   * @return a new token stream
+   */
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    if (reader instanceof FastStringReader) { // fast path
+      return tokenStream(fieldName, ((FastStringReader)reader).getString());
+    }
+    
+    try {
+      String text = toString(reader);
+      return tokenStream(fieldName, text);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+  
+  /**
+   * Indicates whether some other object is "equal to" this one.
+   * 
+   * @param other
+   *            the reference object with which to compare.
+   * @return true if equal, false otherwise
+   */
+  public boolean equals(Object other) {
+    if (this  == other) return true;
+    if (this  == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
+    if (other == DEFAULT_ANALYZER && this  == EXTENDED_ANALYZER) return false;
+    
+    if (other instanceof PatternAnalyzer) {
+      PatternAnalyzer p2 = (PatternAnalyzer) other;
+      return 
+        toLowerCase == p2.toLowerCase &&
+        eqPattern(pattern, p2.pattern) &&
+        eq(stopWords, p2.stopWords);
+    }
+    return false;
+  }
+  
+  /**
+   * Returns a hash code value for the object.
+   * 
+   * @return the hash code.
+   */
+  public int hashCode() {
+    if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
+    if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
+    
+    int h = 1;
+    h = 31*h + pattern.pattern().hashCode();
+    h = 31*h + pattern.flags();
+    h = 31*h + (toLowerCase ? 1231 : 1237);
+    h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
+    return h;
+  }
+  
+  /** equality where o1 and/or o2 can be null */
+  private static boolean eq(Object o1, Object o2) {
+    return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
+  }
+  
+  /** assumes p1 and p2 are not null */
+  private static boolean eqPattern(Pattern p1, Pattern p2) {
+    return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
+  }
+    
+  /**
+   * Reads until end-of-stream and returns all read chars, finally closes the stream.
+   * 
+   * @param input the input stream
+   * @throws IOException if an I/O error occurs while reading the stream
+   */
+  private static String toString(Reader input) throws IOException {
+    try {
+      int len = 256;
+      char[] buffer = new char[len];
+      char[] output = new char[len];
+      
+      len = 0;
+      int n;
+      while ((n = input.read(buffer)) >= 0) {
+        if (len + n > output.length) { // grow capacity
+          char[] tmp = new char[Math.max(output.length << 1, len + n)];
+          System.arraycopy(output, 0, tmp, 0, len);
+          System.arraycopy(buffer, 0, tmp, len, n);
+          buffer = output; // use larger buffer for future larger bulk reads
+          output = tmp;
+        } else {
+          System.arraycopy(buffer, 0, output, len, n);
+        }
+        len += n;
+      }
 
-			return new String(output, 0, output.length);
-		} finally {
-			if (input != null) input.close();
-		}
-	}
-		
-	/** somewhat oversized to minimize hash collisions */
-	private static Set makeStopSet(String[] stopWords) {
-		Set stops = new HashSet(stopWords.length * 2, 0.3f); 
-		stops.addAll(Arrays.asList(stopWords));
-		return stops;
-//		return Collections.unmodifiableSet(stops);
-	}
+      return new String(output, 0, output.length);
+    } finally {
+      if (input != null) input.close();
+    }
+  }
+    
+  /** somewhat oversized to minimize hash collisions */
+  private static Set makeStopSet(String[] stopWords) {
+    Set stops = new HashSet(stopWords.length * 2, 0.3f); 
+    stops.addAll(Arrays.asList(stopWords));
+    return stops;
+//    return Collections.unmodifiableSet(stops);
+  }
 
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Nested classes:
-	///////////////////////////////////////////////////////////////////////////////
-	/**
-	 * The work horse; performance isn't fantastic, but it's not nearly as bad
-	 * as one might think - kudos to the Sun regex developers.
-	 */
-	private static final class PatternTokenizer extends TokenStream {
-		
-		private final String str;
-		private final boolean toLowerCase;
-		private Matcher matcher;
-		private int pos = 0;
-		private static final Locale locale = Locale.getDefault();
-		
-		public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
-			this.str = str;
-			this.matcher = pattern.matcher(str);
-			this.toLowerCase = toLowerCase;
-		}
+  
+  ///////////////////////////////////////////////////////////////////////////////
+  // Nested classes:
+  ///////////////////////////////////////////////////////////////////////////////
+  /**
+   * The work horse; performance isn't fantastic, but it's not nearly as bad
+   * as one might think - kudos to the Sun regex developers.
+   */
+  private static final class PatternTokenizer extends TokenStream {
+    
+    private final String str;
+    private final boolean toLowerCase;
+    private Matcher matcher;
+    private int pos = 0;
+    private static final Locale locale = Locale.getDefault();
+    
+    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
+      this.str = str;
+      this.matcher = pattern.matcher(str);
+      this.toLowerCase = toLowerCase;
+    }
 
-		public Token next() {
-			if (matcher == null) return null;
-			
-			while (true) { // loop takes care of leading and trailing boundary cases
-				int start = pos;
-				int end;
-				boolean isMatch = matcher.find();
-				if (isMatch) {
-					end = matcher.start();
-					pos = matcher.end();
-				} else { 
-					end = str.length();
-					matcher = null; // we're finished
-				}
-				
-				if (start != end) { // non-empty match (header/trailer)
-					String text = str.substring(start, end);
-					if (toLowerCase) text = text.toLowerCase(locale);
-					return new Token(text, start, end);
-				}
-				if (!isMatch) return null;
-			}
-		}
-		
-	}	
-	
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Nested classes:
-	///////////////////////////////////////////////////////////////////////////////
-	/**
-	 * Special-case class for best performance in common cases; this class is
-	 * otherwise unnecessary.
-	 */
-	private static final class FastStringTokenizer extends TokenStream {
-		
-		private final String str;
-		private int pos;
-		private final boolean isLetter;
-		private final boolean toLowerCase;
-		private final Set stopWords;
-		private static final Locale locale = Locale.getDefault();
-		
-		public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
-			this.str = str;
-			this.isLetter = isLetter;
-			this.toLowerCase = toLowerCase;
-			this.stopWords = stopWords;
-		}
+    public Token next() {
+      if (matcher == null) return null;
+      
+      while (true) { // loop takes care of leading and trailing boundary cases
+        int start = pos;
+        int end;
+        boolean isMatch = matcher.find();
+        if (isMatch) {
+          end = matcher.start();
+          pos = matcher.end();
+        } else { 
+          end = str.length();
+          matcher = null; // we're finished
+        }
+        
+        if (start != end) { // non-empty match (header/trailer)
+          String text = str.substring(start, end);
+          if (toLowerCase) text = text.toLowerCase(locale);
+          return new Token(text, start, end);
+        }
+        if (!isMatch) return null;
+      }
+    }
+    
+  } 
+  
+  
+  ///////////////////////////////////////////////////////////////////////////////
+  // Nested classes:
+  ///////////////////////////////////////////////////////////////////////////////
+  /**
+   * Special-case class for best performance in common cases; this class is
+   * otherwise unnecessary.
+   */
+  private static final class FastStringTokenizer extends TokenStream {
+    
+    private final String str;
+    private int pos;
+    private final boolean isLetter;
+    private final boolean toLowerCase;
+    private final Set stopWords;
+    private static final Locale locale = Locale.getDefault();
+    
+    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
+      this.str = str;
+      this.isLetter = isLetter;
+      this.toLowerCase = toLowerCase;
+      this.stopWords = stopWords;
+    }
 
-		public Token next() {
-			// cache loop instance vars (performance)
-			String s = str;
-			int len = s.length();
-			int i = pos;
-			boolean letter = isLetter;
-			
-			int start = 0;
-			String text;
-			do {
-				// find beginning of token
-				text = null;
-				while (i < len && !isTokenChar(s.charAt(i), letter)) {
-					i++;
-				}
-				
-				if (i < len) { // found beginning; now find end of token
-					start = i;
-					while (i < len && isTokenChar(s.charAt(i), letter)) {
-						i++;
-					}
-					
-					text = s.substring(start, i);
-					if (toLowerCase) text = text.toLowerCase(locale);
-//					if (toLowerCase) {						
-////						use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
-////						see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
-//						text = s.substring(start, i).toLowerCase(); 
-////						char[] chars = new char[i-start];
-////						for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
-////						text = new String(chars);
-//					} else {
-//						text = s.substring(start, i);
-//					}
-				}
-			} while (text != null && isStopWord(text));
-			
-			pos = i;
-			return text != null ? new Token(text, start, i) : null;
-		}
-		
-		private boolean isTokenChar(char c, boolean isLetter) {
-			return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
-		}
-		
-		private boolean isStopWord(String text) {
-			return stopWords != null && stopWords.contains(text);
-		}
-		
-	}
+    public Token next() {
+      // cache loop instance vars (performance)
+      String s = str;
+      int len = s.length();
+      int i = pos;
+      boolean letter = isLetter;
+      
+      int start = 0;
+      String text;
+      do {
+        // find beginning of token
+        text = null;
+        while (i < len && !isTokenChar(s.charAt(i), letter)) {
+          i++;
+        }
+        
+        if (i < len) { // found beginning; now find end of token
+          start = i;
+          while (i < len && isTokenChar(s.charAt(i), letter)) {
+            i++;
+          }
+          
+          text = s.substring(start, i);
+          if (toLowerCase) text = text.toLowerCase(locale);
+//          if (toLowerCase) {            
+////            use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
+////            see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
+//            text = s.substring(start, i).toLowerCase(); 
+////            char[] chars = new char[i-start];
+////            for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
+////            text = new String(chars);
+//          } else {
+//            text = s.substring(start, i);
+//          }
+        }
+      } while (text != null && isStopWord(text));
+      
+      pos = i;
+      return text != null ? new Token(text, start, i) : null;
+    }
+    
+    private boolean isTokenChar(char c, boolean isLetter) {
+      return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
+    }
+    
+    private boolean isStopWord(String text) {
+      return stopWords != null && stopWords.contains(text);
+    }
+    
+  }
 
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Nested classes:
-	///////////////////////////////////////////////////////////////////////////////
-	/**
-	 * A StringReader that exposes it's contained string for fast direct access.
-	 * Might make sense to generalize this to CharSequence and make it public?
-	 */
-	static final class FastStringReader extends StringReader {
+  
+  ///////////////////////////////////////////////////////////////////////////////
+  // Nested classes:
+  ///////////////////////////////////////////////////////////////////////////////
+  /**
+   * A StringReader that exposes its contained string for fast direct access.
+   * Might make sense to generalize this to CharSequence and make it public?
+   */
+  static final class FastStringReader extends StringReader {
 
-		private final String s;
-		
-		FastStringReader(String s) {
-			super(s);
-			this.s = s;
-		}
-		
-		String getString() {
-			return s;
-		}
-	}
-	
+    private final String s;
+    
+    FastStringReader(String s) {
+      super(s);
+      this.s = s;
+    }
+    
+    String getString() {
+      return s;
+    }
+  }
+  
 }

Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java Sun Jun 11 22:46:16 2006
@@ -75,325 +75,325 @@
  */
 public class SynonymMap {
 
-	/** the index data; Map<String word, String[] synonyms> */
-	private final HashMap table;
-	
-	private static final String[] EMPTY = new String[0];
-	
-	private static final boolean DEBUG = false;
+  /** the index data; Map<String word, String[] synonyms> */
+  private final HashMap table;
+  
+  private static final String[] EMPTY = new String[0];
+  
+  private static final boolean DEBUG = false;
 
-	/**
-	 * Constructs an instance, loading WordNet synonym data from the given input
-	 * stream. Finally closes the stream. The words in the stream must be in
-	 * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
-	 * 
-	 * @param input
-	 *            the stream to read from (null indicates an empty synonym map)
-	 * @throws IOException
-	 *             if an error occured while reading the stream.
-	 */
-	public SynonymMap(InputStream input) throws IOException {
-		this.table = input == null ? new HashMap(0) : read(toByteArray(input));
-	}
-	
-	/**
-	 * Returns the synonym set for the given word, sorted ascending.
-	 * 
-	 * @param word
-	 *            the word to lookup (must be in lowercase).
-	 * @return the synonyms; a set of zero or more words, sorted ascending, each
-	 *         word containing lowercase characters that satisfy
-	 *         <code>Character.isLetter()</code>.
-	 */
-	public String[] getSynonyms(String word) {
-		Object syns = table.get(word);
-		if (syns == null) return EMPTY;
-		if (syns instanceof String) return new String[] {(String) syns};
-		
-		String[] synonyms = (String[]) syns;
-		String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
-		System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
-		return copy;
-	}
-	
-	/**
-	 * Returns a String representation of the index data for debugging purposes.
-	 * 
-	 * @return a String representation
-	 */
-	public String toString() {
-		StringBuffer buf = new StringBuffer();
-		Iterator iter = new TreeMap(table).keySet().iterator();
-		int count = 0;
-		int f0 = 0;
-		int f1 = 0;
-		int f2 = 0;
-		int f3 = 0;
-		
-		while (iter.hasNext()) {
-			String word = (String) iter.next();
-			buf.append(word + ":");
-			String[] synonyms = getSynonyms(word);
-			buf.append(Arrays.asList(synonyms));
-			buf.append("\n");
-			count += synonyms.length;
-			if (synonyms.length == 0) f0++;
-			if (synonyms.length == 1) f1++;
-			if (synonyms.length == 2) f2++;
-			if (synonyms.length == 3) f3++;
-		}
-		
-		buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
-		return buf.toString();
-	}
-	
-	/**
-	 * Analyzes/transforms the given word on input stream loading. This default implementation simply
-	 * lowercases the word. Override this method with a custom stemming
-	 * algorithm or similar, if desired.
-	 * 
-	 * @param word
-	 *            the word to analyze
-	 * @return the same word, or a different word (or null to indicate that the
-	 *         word should be ignored)
-	 */
-	protected String analyze(String word) {
-		return word.toLowerCase();
-	}
+  /**
+   * Constructs an instance, loading WordNet synonym data from the given input
+   * stream. Finally closes the stream. The words in the stream must be in
+   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
+   * 
+   * @param input
+   *            the stream to read from (null indicates an empty synonym map)
+   * @throws IOException
+   *             if an error occured while reading the stream.
+   */
+  public SynonymMap(InputStream input) throws IOException {
+    this.table = input == null ? new HashMap(0) : read(toByteArray(input));
+  }
+  
+  /**
+   * Returns the synonym set for the given word, sorted ascending.
+   * 
+   * @param word
+   *            the word to lookup (must be in lowercase).
+   * @return the synonyms; a set of zero or more words, sorted ascending, each
+   *         word containing lowercase characters that satisfy
+   *         <code>Character.isLetter()</code>.
+   */
+  public String[] getSynonyms(String word) {
+    Object syns = table.get(word);
+    if (syns == null) return EMPTY;
+    if (syns instanceof String) return new String[] {(String) syns};
+    
+    String[] synonyms = (String[]) syns;
+    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
+    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
+    return copy;
+  }
+  
+  /**
+   * Returns a String representation of the index data for debugging purposes.
+   * 
+   * @return a String representation
+   */
+  public String toString() {
+    StringBuffer buf = new StringBuffer();
+    Iterator iter = new TreeMap(table).keySet().iterator();
+    int count = 0;
+    int f0 = 0;
+    int f1 = 0;
+    int f2 = 0;
+    int f3 = 0;
+    
+    while (iter.hasNext()) {
+      String word = (String) iter.next();
+      buf.append(word + ":");
+      String[] synonyms = getSynonyms(word);
+      buf.append(Arrays.asList(synonyms));
+      buf.append("\n");
+      count += synonyms.length;
+      if (synonyms.length == 0) f0++;
+      if (synonyms.length == 1) f1++;
+      if (synonyms.length == 2) f2++;
+      if (synonyms.length == 3) f3++;
+    }
+    
+    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
+    return buf.toString();
+  }
+  
+  /**
+   * Analyzes/transforms the given word on input stream loading. This default implementation simply
+   * lowercases the word. Override this method with a custom stemming
+   * algorithm or similar, if desired.
+   * 
+   * @param word
+   *            the word to analyze
+   * @return the same word, or a different word (or null to indicate that the
+   *         word should be ignored)
+   */
+  protected String analyze(String word) {
+    return word.toLowerCase();
+  }
 
-	private static boolean isValid(String str) {
-		for (int i=str.length(); --i >= 0; ) {
-			if (!Character.isLetter(str.charAt(i))) return false;
-		}
-		return true;
-	}
+  private static boolean isValid(String str) {
+    for (int i=str.length(); --i >= 0; ) {
+      if (!Character.isLetter(str.charAt(i))) return false;
+    }
+    return true;
+  }
 
-	private HashMap read(byte[] data) {
-		int WORDS  = (int) (76401 / 0.7); // presizing
-		int GROUPS = (int) (88022 / 0.7); // presizing
-		HashMap word2Groups = new HashMap(WORDS);  // Map<String word, int[] groups>
-		HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
-		HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
+  private HashMap read(byte[] data) {
+    int WORDS  = (int) (76401 / 0.7); // presizing
+    int GROUPS = (int) (88022 / 0.7); // presizing
+    HashMap word2Groups = new HashMap(WORDS);  // Map<String word, int[] groups>
+    HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
+    HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
 
-		Charset charset = Charset.forName("UTF-8");
-		int lastNum = -1;
-		Integer lastGroup = null;
-		int len = data.length;
-		int i=0;
-		
-		while (i < len) { // until EOF
-			/* Part A: Parse a line */
-			
-			// scan to beginning of group
-			while (i < len && data[i] != '(') i++;
-			if (i >= len) break; // EOF
-			i++;
-			
-			// parse group
-			int num = 0;
-			while (i < len && data[i] != ',') {
-				num = 10*num + (data[i] - 48);
-				i++;
-			}
-			i++;
-//			if (DEBUG) System.err.println("num="+ num);
-			
-			// scan to beginning of word
-			while (i < len && data[i] != '\'') i++;
-			i++;
-	
-			// scan to end of word
-			int start = i;
-			do {
-				while (i < len && data[i] != '\'') i++;
-				i++;
-			} while (i < len && data[i] != ','); // word must end with "',"
-			
-			if (i >= len) break; // EOF
-			String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
-//			String word = new String(data, 0, start, i-start-1); // ASCII
-			
-			/*
-			 * Part B: ignore phrases (with spaces and hyphens) and
-			 * non-alphabetic words, and let user customize word (e.g. do some
-			 * stemming)
-			 */
-			if (!isValid(word)) continue; // ignore
-			word = analyze(word);
-			if (word == null || word.length() == 0) continue; // ignore
-			
-			
-			/* Part C: Add (group,word) to tables */
-			
-			// ensure compact string representation, minimizing memory overhead
-			String w = (String) internedWords.get(word);
-			if (w == null) {
-				word = new String(word); // ensure compact string
-				internedWords.put(word, word);
-			} else {
-				word = w;
-			}
-			
-			Integer group = lastGroup;
-			if (num != lastNum) {
-				group = new Integer(num);
-				lastGroup = group;
-				lastNum = num;
-			}
-			
-			// add word --> group
-			ArrayList groups = (ArrayList) word2Groups.get(word);
-			if (groups == null) {
-				groups = new ArrayList(1);
-				word2Groups.put(word, groups);
-			}
-			groups.add(group);
+    Charset charset = Charset.forName("UTF-8");
+    int lastNum = -1;
+    Integer lastGroup = null;
+    int len = data.length;
+    int i=0;
+    
+    while (i < len) { // until EOF
+      /* Part A: Parse a line */
+      
+      // scan to beginning of group
+      while (i < len && data[i] != '(') i++;
+      if (i >= len) break; // EOF
+      i++;
+      
+      // parse group
+      int num = 0;
+      while (i < len && data[i] != ',') {
+        num = 10*num + (data[i] - 48);
+        i++;
+      }
+      i++;
+//      if (DEBUG) System.err.println("num="+ num);
+      
+      // scan to beginning of word
+      while (i < len && data[i] != '\'') i++;
+      i++;
+  
+      // scan to end of word
+      int start = i;
+      do {
+        while (i < len && data[i] != '\'') i++;
+        i++;
+      } while (i < len && data[i] != ','); // word must end with "',"
+      
+      if (i >= len) break; // EOF
+      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
+//      String word = new String(data, 0, start, i-start-1); // ASCII
+      
+      /*
+       * Part B: ignore phrases (with spaces and hyphens) and
+       * non-alphabetic words, and let user customize word (e.g. do some
+       * stemming)
+       */
+      if (!isValid(word)) continue; // ignore
+      word = analyze(word);
+      if (word == null || word.length() == 0) continue; // ignore
+      
+      
+      /* Part C: Add (group,word) to tables */
+      
+      // ensure compact string representation, minimizing memory overhead
+      String w = (String) internedWords.get(word);
+      if (w == null) {
+        word = new String(word); // ensure compact string
+        internedWords.put(word, word);
+      } else {
+        word = w;
+      }
+      
+      Integer group = lastGroup;
+      if (num != lastNum) {
+        group = new Integer(num);
+        lastGroup = group;
+        lastNum = num;
+      }
+      
+      // add word --> group
+      ArrayList groups = (ArrayList) word2Groups.get(word);
+      if (groups == null) {
+        groups = new ArrayList(1);
+        word2Groups.put(word, groups);
+      }
+      groups.add(group);
 
-			// add group --> word
-			ArrayList words = (ArrayList) group2Words.get(group);
-			if (words == null) {
-				words = new ArrayList(1);
-				group2Words.put(group, words);
-			} 
-			words.add(word);
-		}
-		
-		
-		/* Part D: compute index data structure */
-		HashMap word2Syns = createIndex(word2Groups, group2Words);		
-				
-		/* Part E: minimize memory consumption by a factor 3 (or so) */
-//		if (true) return word2Syns;
-		word2Groups = null; // help gc
-		group2Words = null; // help gc		
-		return optimize(word2Syns, internedWords);
-	}
-	
-	private HashMap createIndex(Map word2Groups, Map group2Words) {
-		HashMap word2Syns = new HashMap();
-		Iterator iter = word2Groups.entrySet().iterator();
-		
-		while (iter.hasNext()) { // for each word
-			Map.Entry entry = (Map.Entry) iter.next();
-			ArrayList group = (ArrayList) entry.getValue();			
-			String word = (String) entry.getKey();
-			
-//			HashSet synonyms = new HashSet();
-			TreeSet synonyms = new TreeSet();
-			for (int i=group.size(); --i >= 0; ) { // for each groupID of word
-				ArrayList words = (ArrayList) group2Words.get(group.get(i));
-				for (int j=words.size(); --j >= 0; ) { // add all words				
-					Object synonym = words.get(j); // note that w and word are interned
-					if (synonym != word) { // a word is implicitly it's own synonym
-						synonyms.add(synonym);
-					}
-				}
-			}
+      // add group --> word
+      ArrayList words = (ArrayList) group2Words.get(group);
+      if (words == null) {
+        words = new ArrayList(1);
+        group2Words.put(group, words);
+      } 
+      words.add(word);
+    }
+    
+    
+    /* Part D: compute index data structure */
+    HashMap word2Syns = createIndex(word2Groups, group2Words);    
+        
+    /* Part E: minimize memory consumption by a factor 3 (or so) */
+//    if (true) return word2Syns;
+    word2Groups = null; // help gc
+    group2Words = null; // help gc    
+    return optimize(word2Syns, internedWords);
+  }
+  
+  private HashMap createIndex(Map word2Groups, Map group2Words) {
+    HashMap word2Syns = new HashMap();
+    Iterator iter = word2Groups.entrySet().iterator();
+    
+    while (iter.hasNext()) { // for each word
+      Map.Entry entry = (Map.Entry) iter.next();
+      ArrayList group = (ArrayList) entry.getValue();     
+      String word = (String) entry.getKey();
+      
+//      HashSet synonyms = new HashSet();
+      TreeSet synonyms = new TreeSet();
+      for (int i=group.size(); --i >= 0; ) { // for each groupID of word
+        ArrayList words = (ArrayList) group2Words.get(group.get(i));
+        for (int j=words.size(); --j >= 0; ) { // add all words       
+          Object synonym = words.get(j); // note that w and word are interned
+          if (synonym != word) { // a word is implicitly it's own synonym
+            synonyms.add(synonym);
+          }
+        }
+      }
 
-			int size = synonyms.size();
-			if (size > 0) {
-				String[] syns = new String[size];
-				if (size == 1)  
-					syns[0] = (String) synonyms.first();
-				else
-					synonyms.toArray(syns);
-//				if (syns.length > 1) Arrays.sort(syns);
-//				if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
-				word2Syns.put(word, syns);
-			}
-		}
-	
-		return word2Syns;
-	}
+      int size = synonyms.size();
+      if (size > 0) {
+        String[] syns = new String[size];
+        if (size == 1)  
+          syns[0] = (String) synonyms.first();
+        else
+          synonyms.toArray(syns);
+//        if (syns.length > 1) Arrays.sort(syns);
+//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
+        word2Syns.put(word, syns);
+      }
+    }
+  
+    return word2Syns;
+  }
 
-	private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
-		if (DEBUG) {
-			System.err.println("before gc");
-			for (int i=0; i < 10; i++) System.gc();
-			System.err.println("after gc");
-		}
-		
-		// collect entries
-		int len = 0;
-		int size = word2Syns.size();
-		String[][] allSynonyms = new String[size][];
-		String[] words = new String[size];
-		Iterator iter = word2Syns.entrySet().iterator();
-		for (int j=0; j < size; j++) {
-			Map.Entry entry = (Map.Entry) iter.next();
-			allSynonyms[j] = (String[]) entry.getValue(); 
-			words[j] = (String) entry.getKey();
-			len += words[j].length();
-		}
-		
-		// assemble large string containing all words
-		StringBuffer buf = new StringBuffer(len);
-		for (int j=0; j < size; j++) buf.append(words[j]);
-		String allWords = new String(buf.toString()); // ensure compact string across JDK versions
-		buf = null;
-		
-		// intern words at app level via memory-overlaid substrings
-		for (int p=0, j=0; j < size; j++) {
-			String word = words[j];
-			internedWords.put(word, allWords.substring(p, p + word.length()));
-			p += word.length();
-		}
-		
-		// replace words with interned words
-		for (int j=0; j < size; j++) {
-			String[] syns = allSynonyms[j];
-			for (int k=syns.length; --k >= 0; ) {
-				syns[k] = (String) internedWords.get(syns[k]);
-			}
-			Object replacement = syns;
-			if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
-			word2Syns.remove(words[j]);
-			word2Syns.put(internedWords.get(words[j]), replacement);
-		}
-		
-		if (DEBUG) {
-			words = null;
-			allSynonyms = null;
-			internedWords = null;
-			allWords = null;
-			System.err.println("before gc");
-			for (int i=0; i < 10; i++) System.gc();
-			System.err.println("after gc");
-		}
-		return word2Syns;
-	}
-	
-	// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
-	private static byte[] toByteArray(InputStream input) throws IOException {
-		try {
-			// safe and fast even if input.available() behaves weird or buggy
-			int len = Math.max(256, input.available());
-			byte[] buffer = new byte[len];
-			byte[] output = new byte[len];
-			
-			len = 0;
-			int n;
-			while ((n = input.read(buffer)) >= 0) {
-				if (len + n > output.length) { // grow capacity
-					byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
-					System.arraycopy(output, 0, tmp, 0, len);
-					System.arraycopy(buffer, 0, tmp, len, n);
-					buffer = output; // use larger buffer for future larger bulk reads
-					output = tmp;
-				} else {
-					System.arraycopy(buffer, 0, output, len, n);
-				}
-				len += n;
-			}
+  private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
+    if (DEBUG) {
+      System.err.println("before gc");
+      for (int i=0; i < 10; i++) System.gc();
+      System.err.println("after gc");
+    }
+    
+    // collect entries
+    int len = 0;
+    int size = word2Syns.size();
+    String[][] allSynonyms = new String[size][];
+    String[] words = new String[size];
+    Iterator iter = word2Syns.entrySet().iterator();
+    for (int j=0; j < size; j++) {
+      Map.Entry entry = (Map.Entry) iter.next();
+      allSynonyms[j] = (String[]) entry.getValue(); 
+      words[j] = (String) entry.getKey();
+      len += words[j].length();
+    }
+    
+    // assemble large string containing all words
+    StringBuffer buf = new StringBuffer(len);
+    for (int j=0; j < size; j++) buf.append(words[j]);
+    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
+    buf = null;
+    
+    // intern words at app level via memory-overlaid substrings
+    for (int p=0, j=0; j < size; j++) {
+      String word = words[j];
+      internedWords.put(word, allWords.substring(p, p + word.length()));
+      p += word.length();
+    }
+    
+    // replace words with interned words
+    for (int j=0; j < size; j++) {
+      String[] syns = allSynonyms[j];
+      for (int k=syns.length; --k >= 0; ) {
+        syns[k] = (String) internedWords.get(syns[k]);
+      }
+      Object replacement = syns;
+      if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
+      word2Syns.remove(words[j]);
+      word2Syns.put(internedWords.get(words[j]), replacement);
+    }
+    
+    if (DEBUG) {
+      words = null;
+      allSynonyms = null;
+      internedWords = null;
+      allWords = null;
+      System.err.println("before gc");
+      for (int i=0; i < 10; i++) System.gc();
+      System.err.println("after gc");
+    }
+    return word2Syns;
+  }
+  
+  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
+  private static byte[] toByteArray(InputStream input) throws IOException {
+    try {
+      // safe and fast even if input.available() behaves weird or buggy
+      int len = Math.max(256, input.available());
+      byte[] buffer = new byte[len];
+      byte[] output = new byte[len];
+      
+      len = 0;
+      int n;
+      while ((n = input.read(buffer)) >= 0) {
+        if (len + n > output.length) { // grow capacity
+          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
+          System.arraycopy(output, 0, tmp, 0, len);
+          System.arraycopy(buffer, 0, tmp, len, n);
+          buffer = output; // use larger buffer for future larger bulk reads
+          output = tmp;
+        } else {
+          System.arraycopy(buffer, 0, output, len, n);
+        }
+        len += n;
+      }
 
-			if (len == output.length) return output;
-			buffer = null; // help gc
-			buffer = new byte[len];
-			System.arraycopy(output, 0, buffer, 0, len);
-			return buffer;
-		} finally {
-			if (input != null) input.close();
-		}
-	}
-	
+      if (len == output.length) return output;
+      buffer = null; // help gc
+      buffer = new byte[len];
+      System.arraycopy(output, 0, buffer, 0, len);
+      return buffer;
+    } finally {
+      if (input != null) input.close();
+    }
+  }
+  
 }

Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java Sun Jun 11 22:46:16 2006
@@ -30,105 +30,105 @@
  * @author whoschek.AT.lbl.DOT.gov
  */
 public class SynonymTokenFilter extends TokenFilter {
-		
-	/** The Token.type used to indicate a synonym to higher level filters. */
-	public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
+    
+  /** The Token.type used to indicate a synonym to higher level filters. */
+  public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
 
-	private final SynonymMap synonyms;
-	private final int maxSynonyms;
-	
-	private String[] stack = null;
-	private int index = 0;
-	private Token current = null;
-	private int todo = 0;
-	
-	/**
-	 * Creates an instance for the given underlying stream and synonym table.
-	 * 
-	 * @param input
-	 *            the underlying child token stream
-	 * @param synonyms
-	 *            the map used to extract synonyms for terms
-	 * @param maxSynonyms
-	 *            the maximum number of synonym tokens to return per underlying
-	 *            token word (a value of Integer.MAX_VALUE indicates unlimited)
-	 */
-	public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
-		super(input);
-		if (input == null)
-			throw new IllegalArgumentException("input must not be null");
-		if (synonyms == null)
-			throw new IllegalArgumentException("synonyms must not be null");
-		if (maxSynonyms < 0) 
-			throw new IllegalArgumentException("maxSynonyms must not be negative");
-		
-		this.synonyms = synonyms;
-		this.maxSynonyms = maxSynonyms;
-	}
-	
-	/** Returns the next token in the stream, or null at EOS. */
-	public Token next() throws IOException {
-		Token token;
-		while (todo > 0 && index < stack.length) { // pop from stack
-			token = createToken(stack[index++], current);
-			if (token != null) {
-				todo--;
-				return token;
-			}
-		}
-		
-		token = input.next();
-		if (token == null) return null; // EOS; iterator exhausted
-		
-		stack = synonyms.getSynonyms(token.termText()); // push onto stack
-		if (stack.length > maxSynonyms) randomize(stack);
-		index = 0;
-		current = token;
-		todo = maxSynonyms;
-		return token;
-	}
-	
-	/**
-	 * Creates and returns a token for the given synonym of the current input
-	 * token; Override for custom (stateless or stateful) behaviour, if desired.
-	 * 
-	 * @param synonym 
-	 *            a synonym for the current token's term
-	 * @param current
-	 *            the current token from the underlying child stream
-	 * @return a new token, or null to indicate that the given synonym should be
-	 *         ignored
-	 */
-	protected Token createToken(String synonym, Token current) {
-		Token token = new Token(
-			synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
-		token.setPositionIncrement(0);
-		return token;
-	}
-	
-	/**
-	 * Randomize synonyms to later sample a subset. Uses constant random seed
-	 * for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
-	 * number generator with medium statistical quality (multiplicative
-	 * congruential method), producing integers in the range [Integer.MIN_VALUE,
-	 * Integer.MAX_VALUE].
-	 */
-	private static void randomize(Object[] arr) {
-		int seed = 1234567; // constant
-		int randomState = 4*seed + 1;
-//		Random random = new Random(seed); // unnecessary overhead
-		int len = arr.length;
-		for (int i=0; i < len-1; i++) {
-			randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
-			int r = randomState % (len-i);
-			if (r < 0) r = -r; // e.g. -9 % 2 == -1
-//			int r = random.nextInt(len-i);
-			
-			// swap arr[i, i+r]
-			Object tmp = arr[i];
-			arr[i] = arr[i + r];
-			arr[i + r] = tmp;
-		}		
-	}
-	
+  private final SynonymMap synonyms;
+  private final int maxSynonyms;
+  
+  private String[] stack = null;
+  private int index = 0;
+  private Token current = null;
+  private int todo = 0;
+  
+  /**
+   * Creates an instance for the given underlying stream and synonym table.
+   * 
+   * @param input
+   *            the underlying child token stream
+   * @param synonyms
+   *            the map used to extract synonyms for terms
+   * @param maxSynonyms
+   *            the maximum number of synonym tokens to return per underlying
+   *            token word (a value of Integer.MAX_VALUE indicates unlimited)
+   */
+  public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
+    super(input);
+    if (input == null)
+      throw new IllegalArgumentException("input must not be null");
+    if (synonyms == null)
+      throw new IllegalArgumentException("synonyms must not be null");
+    if (maxSynonyms < 0) 
+      throw new IllegalArgumentException("maxSynonyms must not be negative");
+    
+    this.synonyms = synonyms;
+    this.maxSynonyms = maxSynonyms;
+  }
+  
+  /** Returns the next token in the stream, or null at EOS. */
+  public Token next() throws IOException {
+    Token token;
+    while (todo > 0 && index < stack.length) { // pop from stack
+      token = createToken(stack[index++], current);
+      if (token != null) {
+        todo--;
+        return token;
+      }
+    }
+    
+    token = input.next();
+    if (token == null) return null; // EOS; iterator exhausted
+    
+    stack = synonyms.getSynonyms(token.termText()); // push onto stack
+    if (stack.length > maxSynonyms) randomize(stack);
+    index = 0;
+    current = token;
+    todo = maxSynonyms;
+    return token;
+  }
+  
+  /**
+   * Creates and returns a token for the given synonym of the current input
+   * token; Override for custom (stateless or stateful) behaviour, if desired.
+   * 
+   * @param synonym 
+   *            a synonym for the current token's term
+   * @param current
+   *            the current token from the underlying child stream
+   * @return a new token, or null to indicate that the given synonym should be
+   *         ignored
+   */
+  protected Token createToken(String synonym, Token current) {
+    Token token = new Token(
+      synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
+    token.setPositionIncrement(0);
+    return token;
+  }
+  
+  /**
+   * Randomize synonyms to later sample a subset. Uses constant random seed
+   * for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
+   * number generator with medium statistical quality (multiplicative
+   * congruential method), producing integers in the range [Integer.MIN_VALUE,
+   * Integer.MAX_VALUE].
+   */
+  private static void randomize(Object[] arr) {
+    int seed = 1234567; // constant
+    int randomState = 4*seed + 1;
+//    Random random = new Random(seed); // unnecessary overhead
+    int len = arr.length;
+    for (int i=0; i < len-1; i++) {
+      randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
+      int r = randomState % (len-i);
+      if (r < 0) r = -r; // e.g. -9 % 2 == -1
+//      int r = random.nextInt(len-i);
+      
+      // swap arr[i, i+r]
+      Object tmp = arr[i];
+      arr[i] = arr[i + r];
+      arr[i + r] = tmp;
+    }   
+  }
+  
 }

Modified: lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java?rev=413584&r1=413583&r2=413584&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (original)
+++ lucene/java/trunk/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java Sun Jun 11 22:46:16 2006
@@ -197,319 +197,319 @@
 @author whoschek.AT.lbl.DOT.gov
 */
 public class MemoryIndexTest extends TestCase {
-	
-	private Analyzer analyzer;
-	private boolean fastMode = false;
-	
-	private static final String FIELD_NAME = "content";
+  
+  private Analyzer analyzer;
+  private boolean fastMode = false;
+  
+  private static final String FIELD_NAME = "content";
 
-	/** Runs the tests and/or benchmark */
-	public static void main(String[] args) throws Throwable {
-		new MemoryIndexTest().run(args);		
-	}
+  /** Runs the tests and/or benchmark */
+  public static void main(String[] args) throws Throwable {
+    new MemoryIndexTest().run(args);    
+  }
 
-//	public void setUp() {	}
-//	public void tearDown() {}
-	
-	public void testMany() throws Throwable {
-		String[] files = listFiles(new String[] {
-			"*.txt", "*.html", "*.xml", "xdocs/*.xml", 
-			"src/java/test/org/apache/lucene/queryParser/*.java",
-			"src/java/org/apache/lucene/index/memory/*.java",
-		});
-		System.out.println("files = " + java.util.Arrays.asList(files));
-		String[] xargs = new String[] {
-			"1", "1", "memram", 
-			"@src/test/org/apache/lucene/index/memory/testqueries.txt",
-		};
-		String[] args = new String[xargs.length + files.length];
-		System.arraycopy(xargs, 0, args, 0, xargs.length);
-		System.arraycopy(files, 0, args, xargs.length, files.length);
-		run(args);
-	}
-	
-	private void run(String[] args) throws Throwable {
-		int k = -1;
-		
-		int iters = 1;
-		if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
-		
-		int runs = 1;
-		if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
-		
-		String cmd = "memram";
-		if (args.length > ++k) cmd = args[k];
-		boolean useMemIndex = cmd.indexOf("mem") >= 0;
-		boolean useRAMIndex = cmd.indexOf("ram") >= 0;
-		
-		String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
-		if (args.length > ++k) {
-			String arg = args[k];
-			if (arg.startsWith("@")) 
-				queries = readLines(new File(arg.substring(1)));
-			else
-				queries = new String[] { arg };
-		}
-		
-		File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
-		if (args.length > ++k) {
-			files = new File[args.length - k];
-			for (int i=k; i < args.length; i++) {
-				files[i-k] = new File(args[i]);
-			}
-		}
-		
-		boolean toLowerCase = true;
-//		boolean toLowerCase = false;
-//		Set stopWords = null;
-		Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
-		
-		Analyzer[] analyzers = new Analyzer[] { 
-				new SimpleAnalyzer(),
-				new StopAnalyzer(),
-				new StandardAnalyzer(),
-				PatternAnalyzer.DEFAULT_ANALYZER,
-//				new WhitespaceAnalyzer(),
-//				new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
-//				new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),				
-//				new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
-		};
-		
-		for (int iter=0; iter < iters; iter++) {
-			System.out.println("\n########### iteration=" + iter);
-			long start = System.currentTimeMillis();						
-			long bytes = 0;
-			
-			for (int anal=0; anal < analyzers.length; anal++) {
-				this.analyzer = analyzers[anal];
-				
-				for (int i=0; i < files.length; i++) {
-					File file = files[i];
-					if (!file.exists() || file.isDirectory()) continue; // ignore
-					bytes += file.length();
-					String text = toString(new FileInputStream(file), null);
-					Document doc = createDocument(text);
-					System.out.println("\n*********** FILE=" + file);
-					
-					for (int q=0; q < queries.length; q++) {
-						try {
-							Query query = parseQuery(queries[q]);
-							
-							for (int run=0; run < runs; run++) {
-								float score1 = 0.0f; float score2 = 0.0f;
-								if (useMemIndex) score1 = query(createMemoryIndex(doc), query); 
-								if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
-								if (useMemIndex && useRAMIndex) {
-									System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
-									if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
-										throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
-									}
-								}
-							}
-						} catch (Throwable t) {
-							if (t instanceof OutOfMemoryError) t.printStackTrace();
-							System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
-							throw t;
-						}
-					}
-				}
-			}
-			long end = System.currentTimeMillis();
-			System.out.println("\nsecs = " + ((end-start)/1000.0f));
-			System.out.println("queries/sec= " + 
-				(1.0f * runs * queries.length * analyzers.length * files.length 
-						/ ((end-start)/1000.0f)));
-			float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
-			System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
-		}
-		
-		if (useMemIndex && useRAMIndex) 
-			System.out.println("No bug found. done.");
-		else 
-			System.out.println("Done benchmarking (without checking correctness).");
-	}
-	
-	// returns file line by line, ignoring empty lines and comments
-	private String[] readLines(File file) throws Exception {
-		BufferedReader reader = new BufferedReader(new InputStreamReader(
-				new FileInputStream(file))); 
-		ArrayList lines = new ArrayList();
-		String line;	
-		while ((line = reader.readLine()) != null) {
-			String t = line.trim(); 
-			if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
-				lines.add(line);
-			}
-		}
-		reader.close();
-		
-		String[] result = new String[lines.size()];
-		lines.toArray(result);
-		return result;
-	}
-	
-	private Document createDocument(String content) {
-		Document doc = new Document();
-		doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
-		return doc;
-	}
-	
-	private MemoryIndex createMemoryIndex(Document doc) {
-		MemoryIndex index = new MemoryIndex();
-		Enumeration iter = doc.fields();
-		while (iter.hasMoreElements()) {
-			Field field = (Field) iter.nextElement();
-			index.addField(field.name(), field.stringValue(), analyzer);
-		}
-		return index;
-	}
-	
-	private RAMDirectory createRAMIndex(Document doc) {
-		RAMDirectory dir = new RAMDirectory();		
-		IndexWriter writer = null;
-		try {
-			writer = new IndexWriter(dir, analyzer, true);
-			writer.setMaxFieldLength(Integer.MAX_VALUE);
-			writer.addDocument(doc);
-			writer.optimize();
-			return dir;
-		} catch (IOException e) { // should never happen (RAMDirectory)
-			throw new RuntimeException(e);
-		} finally {
-			try {
-				if (writer != null) writer.close();
-			} catch (IOException e) { // should never happen (RAMDirectory)
-				throw new RuntimeException(e);
-			}
-		}
-	}
-		
-	private float query(Object index, Query query) {
-//		System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
-		Searcher searcher = null;
-		try {
-			if (index instanceof Directory)
-				searcher = new IndexSearcher((Directory)index);
-			else 
-				searcher = ((MemoryIndex) index).createSearcher();
+//  public void setUp() {  }
+//  public void tearDown() {}
+  
+  /**
+   * Runs a single iteration with a single run per query in "memram" mode,
+   * using the queries listed in testqueries.txt against every file that
+   * matches the patterns below.
+   */
+  public void testMany() throws Throwable {
+    String[] patterns = new String[] {
+      "*.txt", "*.html", "*.xml", "xdocs/*.xml", 
+      "src/java/test/org/apache/lucene/queryParser/*.java",
+      "src/java/org/apache/lucene/index/memory/*.java",
+    };
+    String[] files = listFiles(patterns);
+    System.out.println("files = " + java.util.Arrays.asList(files));
+
+    // leading options in the order run() expects: iters, runs, cmd, queries
+    String[] options = new String[] {
+      "1", "1", "memram", 
+      "@src/test/org/apache/lucene/index/memory/testqueries.txt",
+    };
+
+    // args = options followed by all matching file names
+    String[] args = new String[options.length + files.length];
+    for (int i=0; i < options.length; i++) {
+      args[i] = options[i];
+    }
+    for (int i=0; i < files.length; i++) {
+      args[options.length + i] = files[i];
+    }
+    run(args);
+  }
+  
+  /**
+   * Runs the correctness check and/or benchmark. All arguments are
+   * positional and optional:
+   * <ul>
+   * <li>args[0]: number of iterations (default 1, at least 1)</li>
+   * <li>args[1]: number of runs per query (default 1, at least 1)</li>
+   * <li>args[2]: command; the MemoryIndex is used if it contains "mem",
+   *     the RAMDirectory index if it contains "ram" (default "memram")</li>
+   * <li>args[3]: a single query expression, or "@file" to read the queries
+   *     from that file via readLines (default: a small built-in list)</li>
+   * <li>args[4..]: files to index (default CHANGES.txt and LICENSE.txt)</li>
+   * </ul>
+   * When both index kinds are enabled, the two scores of each query are
+   * compared and an IllegalStateException is thrown if they differ or fall
+   * outside [0, 1].
+   */
+  private void run(String[] args) throws Throwable {
+    // k is pre-incremented before each positional argument is examined
+    int k = -1;
+    
+    int iters = 1;
+    if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
+    
+    int runs = 1;
+    if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
+    
+    String cmd = "memram";
+    if (args.length > ++k) cmd = args[k];
+    boolean useMemIndex = cmd.indexOf("mem") >= 0;
+    boolean useRAMIndex = cmd.indexOf("ram") >= 0;
+    
+    String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
+    if (args.length > ++k) {
+      String arg = args[k];
+      if (arg.startsWith("@")) 
+        queries = readLines(new File(arg.substring(1)));
+      else
+        queries = new String[] { arg };
+    }
+    
+    File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
+    if (args.length > ++k) {
+      // all remaining arguments are file names
+      files = new File[args.length - k];
+      for (int i=k; i < args.length; i++) {
+        files[i-k] = new File(args[i]);
+      }
+    }
+    
+    // NOTE(review): toLowerCase and stopWords are only referenced by the
+    // commented-out analyzers below; they are otherwise unused in this method
+    boolean toLowerCase = true;
+//    boolean toLowerCase = false;
+//    Set stopWords = null;
+    Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+    
+    Analyzer[] analyzers = new Analyzer[] { 
+        new SimpleAnalyzer(),
+        new StopAnalyzer(),
+        new StandardAnalyzer(),
+        PatternAnalyzer.DEFAULT_ANALYZER,
+//        new WhitespaceAnalyzer(),
+//        new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
+//        new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),        
+//        new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
+    };
+    
+    for (int iter=0; iter < iters; iter++) {
+      System.out.println("\n########### iteration=" + iter);
+      long start = System.currentTimeMillis();            
+      long bytes = 0;
+      
+      for (int anal=0; anal < analyzers.length; anal++) {
+        // the helper methods read the analyzer from this field
+        this.analyzer = analyzers[anal];
+        
+        for (int i=0; i < files.length; i++) {
+          File file = files[i];
+          if (!file.exists() || file.isDirectory()) continue; // ignore
+          bytes += file.length();
+          String text = toString(new FileInputStream(file), null);
+          Document doc = createDocument(text);
+          System.out.println("\n*********** FILE=" + file);
+          
+          for (int q=0; q < queries.length; q++) {
+            try {
+              Query query = parseQuery(queries[q]);
+              
+              for (int run=0; run < runs; run++) {
+                // a fresh index is built for every run, so index creation is part of the measured time
+                float score1 = 0.0f; float score2 = 0.0f;
+                if (useMemIndex) score1 = query(createMemoryIndex(doc), query); 
+                if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
+                if (useMemIndex && useRAMIndex) {
+                  System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
+                  // both implementations must produce exactly the same score, within [0, 1]
+                  if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
+                    throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
+                  }
+                }
+              }
+            } catch (Throwable t) {
+              // report which query/file/analyzer failed, then rethrow unchanged
+              if (t instanceof OutOfMemoryError) t.printStackTrace();
+              System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
+              throw t;
+            }
+          }
+        }
+      }
+      long end = System.currentTimeMillis();
+      System.out.println("\nsecs = " + ((end-start)/1000.0f));
+      // NOTE(review): files.length also counts files that were skipped above
+      // (missing or directories), so queries/sec may be overstated
+      System.out.println("queries/sec= " + 
+        (1.0f * runs * queries.length * analyzers.length * files.length 
+            / ((end-start)/1000.0f)));
+      float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
+      System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
+    }
+    
+    if (useMemIndex && useRAMIndex) 
+      System.out.println("No bug found. done.");
+    else 
+      System.out.println("Done benchmarking (without checking correctness).");
+  }
+  
+  /**
+   * Returns the given file line by line, ignoring empty lines and comments.
+   * A line is skipped if its trimmed form is empty or starts with '#' or
+   * "//". Lines that are kept are returned untrimmed. The file is decoded
+   * with the platform default charset.
+   * 
+   * @param file
+   *            the file to read
+   * @return the remaining lines, in file order
+   * @throws Exception
+   *             if the file cannot be opened or read
+   */
+  private String[] readLines(File file) throws Exception {
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+        new FileInputStream(file)));
+    ArrayList lines = new ArrayList();
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String t = line.trim();
+        if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
+          lines.add(line);
+        }
+      }
+    } finally {
+      // release the file handle even if reading fails half way through
+      reader.close();
+    }
+    
+    String[] result = new String[lines.size()];
+    lines.toArray(result);
+    return result;
+  }
+  
+  /**
+   * Wraps the given text into a document with a single FIELD_NAME field
+   * (not stored, tokenized, term vector with positions).
+   */
+  private Document createDocument(String content) {
+    Document result = new Document();
+    Field contentField = new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS);
+    result.add(contentField);
+    return result;
+  }
+  
+  /**
+   * Builds a MemoryIndex holding every field of the given document,
+   * analyzed with the current analyzer.
+   */
+  private MemoryIndex createMemoryIndex(Document doc) {
+    MemoryIndex memoryIndex = new MemoryIndex();
+    for (Enumeration fields = doc.fields(); fields.hasMoreElements(); ) {
+      Field f = (Field) fields.nextElement();
+      memoryIndex.addField(f.name(), f.stringValue(), analyzer);
+    }
+    return memoryIndex;
+  }
+  
+  /**
+   * Builds a RAMDirectory index containing just the given document,
+   * analyzed with the current analyzer. The writer is always closed
+   * before returning; IOExceptions are rethrown as RuntimeExceptions.
+   */
+  private RAMDirectory createRAMIndex(Document doc) {
+    RAMDirectory ramDir = new RAMDirectory();
+    IndexWriter indexWriter = null;
+    try {
+      indexWriter = new IndexWriter(ramDir, analyzer, true);
+      indexWriter.setMaxFieldLength(Integer.MAX_VALUE);
+      indexWriter.addDocument(doc);
+      indexWriter.optimize();
+    } catch (IOException e) { // should never happen (RAMDirectory)
+      throw new RuntimeException(e);
+    } finally {
+      if (indexWriter != null) {
+        try {
+          indexWriter.close();
+        } catch (IOException e) { // should never happen (RAMDirectory)
+          throw new RuntimeException(e);
+        }
+      }
+    }
+    return ramDir;
+  }
+    
+  private float query(Object index, Query query) {
+//    System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
+    Searcher searcher = null;
+    try {
+      if (index instanceof Directory)
+        searcher = new IndexSearcher((Directory)index);
+      else 
+        searcher = ((MemoryIndex) index).createSearcher();
 
-			final float[] scores = new float[1]; // inits to 0.0f
-			searcher.search(query, new HitCollector() {
-				public void collect(int doc, float score) {
-					scores[0] = score;
-				}
-			});
-			float score = scores[0];
-//			Hits hits = searcher.search(query);
-//			float score = hits.length() > 0 ? hits.score(0) : 0.0f;
-			return score;
-		} catch (IOException e) { // should never happen (RAMDirectory)
-			throw new RuntimeException(e);
-		} finally {
-			try {
-				if (searcher != null) searcher.close();
-			} catch (IOException e) { // should never happen (RAMDirectory)
-				throw new RuntimeException(e);
-			}
-		}
-	}
-	
-	private int getMemorySize(Object index) {
-		if (index instanceof Directory) {
-			try {
-				Directory dir = (Directory) index;
-				int size = 0;
-				String[] fileNames = dir.list();
-				for (int i=0; i < fileNames.length; i++) {
-					size += dir.fileLength(fileNames[i]);
-				}
-				return size;
-			}
-			catch (IOException e) { // can never happen (RAMDirectory)
-				throw new RuntimeException(e);
-			}
-		}
-		else {
-			return ((MemoryIndex) index).getMemorySize();
-		}
-	}
-	
-	private Query parseQuery(String expression) throws ParseException {
-		QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
-//		parser.setPhraseSlop(0);
-		return parser.parse(expression);
-	}
-	
-	/** returns all files matching the given file name patterns (quick n'dirty) */
-	static String[] listFiles(String[] fileNames) {
-		LinkedHashSet allFiles = new LinkedHashSet();
-		for (int i=0; i < fileNames.length; i++) {
-			int k;
-			if ((k = fileNames[i].indexOf("*")) < 0) {
-				allFiles.add(fileNames[i]);
-			} else {
-				String prefix = fileNames[i].substring(0, k);
-				if (prefix.length() == 0) prefix = ".";
-				final String suffix = fileNames[i].substring(k+1);
-				File[] files = new File(prefix).listFiles(new FilenameFilter() {
-					public boolean accept(File dir, String name) {
-						return name.endsWith(suffix);
-					}
-				});
-				if (files != null) {
-					for (int j=0; j < files.length; j++) {
-						allFiles.add(files[j].getPath());
-					}
-				}
-			}			
-		}
-		
-		String[] result = new String[allFiles.size()];
-		allFiles.toArray(result);
-		return result;
-	}
-	
-	// trick to detect default platform charset
-	private static final Charset DEFAULT_PLATFORM_CHARSET = 
-		Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());	
-	
-	// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
-	private static String toString(InputStream input, Charset charset) throws IOException {
-		if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;			
-		byte[] data = toByteArray(input);
-		return charset.decode(ByteBuffer.wrap(data)).toString();
-	}
-	
-	private static byte[] toByteArray(InputStream input) throws IOException {
-		try {
-			// safe and fast even if input.available() behaves weird or buggy
-			int len = Math.max(256, input.available());
-			byte[] buffer = new byte[len];
-			byte[] output = new byte[len];
-			
-			len = 0;
-			int n;
-			while ((n = input.read(buffer)) >= 0) {
-				if (len + n > output.length) { // grow capacity
-					byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
-					System.arraycopy(output, 0, tmp, 0, len);
-					System.arraycopy(buffer, 0, tmp, len, n);
-					buffer = output; // use larger buffer for future larger bulk reads
-					output = tmp;
-				} else {
-					System.arraycopy(buffer, 0, output, len, n);
-				}
-				len += n;
-			}
+      final float[] scores = new float[1]; // inits to 0.0f
+      searcher.search(query, new HitCollector() {
+        public void collect(int doc, float score) {
+          scores[0] = score;
+        }
+      });
+      float score = scores[0];
+//      Hits hits = searcher.search(query);
+//      float score = hits.length() > 0 ? hits.score(0) : 0.0f;
+      return score;
+    } catch (IOException e) { // should never happen (RAMDirectory)
+      throw new RuntimeException(e);
+    } finally {
+      try {
+        if (searcher != null) searcher.close();
+      } catch (IOException e) { // should never happen (RAMDirectory)
+        throw new RuntimeException(e);
+      }
+    }
+  }
+  
+  /**
+   * Returns the size in bytes of the given index. For a Directory this is
+   * the sum of the lengths of all files it lists; otherwise the index is
+   * expected to be a MemoryIndex and its own getMemorySize() is used.
+   * 
+   * @param index
+   *            a Directory or a MemoryIndex
+   * @return the size in bytes, capped at Integer.MAX_VALUE
+   */
+  private int getMemorySize(Object index) {
+    if (index instanceof Directory) {
+      try {
+        Directory dir = (Directory) index;
+        // accumulate in a long: "int += long" narrows silently and could wrap around
+        long size = 0;
+        String[] fileNames = dir.list();
+        for (int i=0; i < fileNames.length; i++) {
+          size += dir.fileLength(fileNames[i]);
+        }
+        return (int) Math.min(size, Integer.MAX_VALUE);
+      }
+      catch (IOException e) { // can never happen (RAMDirectory)
+        throw new RuntimeException(e);
+      }
+    }
+    else {
+      return ((MemoryIndex) index).getMemorySize();
+    }
+  }
+  
+  /** Parses the given query expression against FIELD_NAME using the current analyzer. */
+  private Query parseQuery(String expression) throws ParseException {
+    return new QueryParser(FIELD_NAME, analyzer).parse(expression);
+  }
+  
+  /**
+   * Returns all files matching the given file name patterns (quick n'dirty).
+   * A name without "*" is returned as is. Otherwise the text before the
+   * first "*" is taken as the directory ("." if empty) and the text after
+   * it as the suffix that file names in that directory must end with.
+   * Duplicates are dropped; first-seen order is kept.
+   */
+  static String[] listFiles(String[] fileNames) {
+    LinkedHashSet matches = new LinkedHashSet();
+    for (int i=0; i < fileNames.length; i++) {
+      String pattern = fileNames[i];
+      int star = pattern.indexOf("*");
+      if (star < 0) { // plain file name, no wildcard
+        matches.add(pattern);
+        continue;
+      }
+
+      String dirName = (star == 0) ? "." : pattern.substring(0, star);
+      final String suffix = pattern.substring(star+1);
+      File[] found = new File(dirName).listFiles(new FilenameFilter() {
+        public boolean accept(File dir, String name) {
+          return name.endsWith(suffix);
+        }
+      });
+      if (found == null) continue; // not a readable directory
+
+      for (int j=0; j < found.length; j++) {
+        matches.add(found[j].getPath());
+      }
+    }
+
+    return (String[]) matches.toArray(new String[matches.size()]);
+  }
+  
+  // trick to detect default platform charset: an InputStreamReader created
+  // without an explicit charset reports the encoding it picked via
+  // getEncoding(), and that name is resolved to a Charset once at class
+  // initialization
+  private static final Charset DEFAULT_PLATFORM_CHARSET = 
+    Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());  
+  
+  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
+  /**
+   * Reads the given stream fully (toByteArray closes it) and decodes the
+   * bytes with the given charset, or with the platform default charset if
+   * charset is null.
+   */
+  private static String toString(InputStream input, Charset charset) throws IOException {
+    Charset cs = (charset != null) ? charset : DEFAULT_PLATFORM_CHARSET;
+    ByteBuffer bytes = ByteBuffer.wrap(toByteArray(input));
+    return cs.decode(bytes).toString();
+  }
+  
+  private static byte[] toByteArray(InputStream input) throws IOException {
+    try {
+      // safe and fast even if input.available() behaves weird or buggy
+      int len = Math.max(256, input.available());
+      byte[] buffer = new byte[len];
+      byte[] output = new byte[len];
+      
+      len = 0;
+      int n;
+      while ((n = input.read(buffer)) >= 0) {
+        if (len + n > output.length) { // grow capacity
+          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
+          System.arraycopy(output, 0, tmp, 0, len);
+          System.arraycopy(buffer, 0, tmp, len, n);
+          buffer = output; // use larger buffer for future larger bulk reads
+          output = tmp;
+        } else {
+          System.arraycopy(buffer, 0, output, len, n);
+        }
+        len += n;
+      }
 
-			if (len == output.length) return output;
-			buffer = null; // help gc
-			buffer = new byte[len];
-			System.arraycopy(output, 0, buffer, 0, len);
-			return buffer;
-		} finally {
-			if (input != null) input.close();
-		}
-	}
-	
+      if (len == output.length) return output;
+      buffer = null; // help gc
+      buffer = new byte[len];
+      System.arraycopy(output, 0, buffer, 0, len);
+      return buffer;
+    } finally {
+      if (input != null) input.close();
+    }
+  }
+  
 }



Mime
View raw message