Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 32411 invoked from network); 22 May 2008 06:25:18 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 22 May 2008 06:25:18 -0000 Received: (qmail 12264 invoked by uid 500); 22 May 2008 06:25:19 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 12238 invoked by uid 500); 22 May 2008 06:25:19 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 12229 invoked by uid 99); 22 May 2008 06:25:19 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 21 May 2008 23:25:19 -0700 X-ASF-Spam-Status: No, hits=-1997.6 required=10.0 tests=ALL_TRUSTED,FR_ALMOST_VIAG2 X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 22 May 2008 06:24:41 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 842612388A0A; Wed, 21 May 2008 23:24:57 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r659016 - in /lucene/java/trunk: CHANGES.txt contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java Date: Thu, 22 May 2008 06:24:56 -0000 To: java-commits@lucene.apache.org From: otis@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20080522062457.842612388A0A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: otis Date: Wed May 21 23:24:55 2008 New Revision: 659016 URL: http://svn.apache.org/viewvc?rev=659016&view=rev Log: LUCENE-1183: Optimized TRStringDistance class (in contrib/spell) that uses less memory than the previous version Modified: lucene/java/trunk/CHANGES.txt lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java Modified: lucene/java/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=659016&r1=659015&r2=659016&view=diff ============================================================================== --- lucene/java/trunk/CHANGES.txt (original) +++ lucene/java/trunk/CHANGES.txt Wed May 21 23:24:55 2008 @@ -174,6 +174,9 @@ runtime type checking for possible speedup of binaryValue(). (Eks Dev via Mike McCandless) + 5. LUCENE-1183: Optimized TRStringDistance class (in contrib/spell) that uses + less memory than the previous version. (Cédrik LIME via Otis Gospodnetic) + Documentation 1. LUCENE-1236: Added some clarifying remarks to EdgeNGram*.java (Hiroaki Kawai via Grant Ingersoll) Modified: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java?rev=659016&r1=659015&r2=659016&view=diff ============================================================================== --- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java (original) +++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java Wed May 21 23:24:55 2008 @@ -1,6 +1,5 @@ package org.apache.lucene.search.spell; - /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,13 +18,16 @@ */ /** - * Edit distance class + * Edit distance class. + * Note: this class is not thread-safe. */ final class TRStringDistance { final char[] sa; final int n; - final int[][][] cache=new int[30][][]; + int p[]; //'previous' cost array, horizontally + int d[]; // cost array, horizontally + int _d[]; //placeholder to assist in swapping p and d /** @@ -33,101 +35,73 @@ * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster. */ public TRStringDistance (String target) { - sa=target.toCharArray(); - n=sa.length; + sa = target.toCharArray(); + n = sa.length; + p = new int[n+1]; //'previous' cost array, horizontally + d = new int[n+1]; // cost array, horizontally } //***************************** - // Compute Levenshtein distance - //***************************** - public final int getDistance (String other) { - int d[][]; // matrix - int cost; // cost - - // Step 1 - final char[] ta=other.toCharArray(); - final int m=ta.length; - if (n==0) { - return m; - } - if (m==0) { - return n; - } - - if (m>=cache.length) { - d=form(n, m); - } - else if (cache[m]!=null) { - d=cache[m]; - } - else { - d=cache[m]=form(n, m); - - // Step 3 - - } - for (int i=1; i<=n; i++) { - final char s_i=sa[i-1]; - - // Step 4 - - for (int j=1; j<=m; j++) { - final char t_j=ta[j-1]; - - // Step 5 - - if (s_i==t_j) { // same - cost=0; - } - else { // not a match - cost=1; - - // Step 6 - - } - d[i][j]=min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost); - - } - - } + // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String) + //***************************** + public final int getDistance (String other) { + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length()+1 by t.length()+1, + we maintain two single-dimensional arrays of length s.length()+1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + final int m = other.length(); + + if (n == 0) { + return m; + } else if (m == 0) { + return n; + } - // Step 7 - return d[n][m]; + // indexes into strings s and t + int i; // iterates through s + int j; // iterates through t - } + char t_j; // jth character of t + int cost; // cost - /** - * - */ - private static int[][] form (int n, int m) { - int[][] d=new int[n+1][m+1]; - // Step 2 - - for (int i=0; i<=n; i++) { - d[i][0]=i; - - } - for (int j=0; j<=m; j++) { - d[0][j]=j; + for (i = 0; i<=n; i++) { + p[i] = i; } - return d; - } + for (j = 1; j<=m; j++) { + t_j = other.charAt(j-1); + d[0] = j; + + for (i=1; i<=n; i++) { + cost = sa[i-1]==t_j ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } - //**************************** - // Get minimum of three values - //**************************** - private static int min3 (int a, int b, int c) { - int mi=a; - if (b