From commits-return-25786-apmail-commons-commits-archive=commons.apache.org@commons.apache.org Wed Mar 7 21:03:27 2012 Return-Path: X-Original-To: apmail-commons-commits-archive@minotaur.apache.org Delivered-To: apmail-commons-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id B20379CC5 for ; Wed, 7 Mar 2012 21:03:27 +0000 (UTC) Received: (qmail 60330 invoked by uid 500); 7 Mar 2012 21:03:27 -0000 Delivered-To: apmail-commons-commits-archive@commons.apache.org Received: (qmail 60228 invoked by uid 500); 7 Mar 2012 21:03:26 -0000 Mailing-List: contact commits-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list commits@commons.apache.org Received: (qmail 60219 invoked by uid 99); 7 Mar 2012 21:03:26 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 07 Mar 2012 21:03:26 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 07 Mar 2012 21:03:22 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 27D7C2388860 for ; Wed, 7 Mar 2012 21:03:00 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1298118 - in /commons/proper/codec/trunk/src: changes/ main/java/org/apache/commons/codec/language/bm/ test/java/org/apache/commons/codec/language/bm/ Date: Wed, 07 Mar 2012 21:02:59 -0000 To: commits@commons.apache.org From: ggregory@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20120307210300.27D7C2388860@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: ggregory Date: Wed Mar 7 21:02:59 
2012 New Revision: 1298118 URL: http://svn.apache.org/viewvc?rev=1298118&view=rev Log: [CODEC-132] BeiderMorseEncoder OOM issues Modified: commons/proper/codec/trunk/src/changes/changes.xml commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java Modified: commons/proper/codec/trunk/src/changes/changes.xml URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1298118&r1=1298117&r2=1298118&view=diff ============================================================================== --- commons/proper/codec/trunk/src/changes/changes.xml (original) +++ commons/proper/codec/trunk/src/changes/changes.xml Wed Mar 7 21:02:59 2012 @@ -26,6 +26,9 @@ org.apache.commons.codec.net.URLCodec charset field final. --> + + BeiderMorseEncoder OOM issues + QuotedPrintableCodec does not support soft line break per the 'quoted-printable' example on Wikipedia Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1298118&r1=1298117&r2=1298118&view=diff ============================================================================== --- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original) +++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Wed Mar 7 21:02:59 2012 @@ -100,7 +100,7 @@ public class BeiderMorseEncoder implemen /** * Gets the name type currently in operation. 
- * + * * @return the NameType currently being used */ public NameType getNameType() { @@ -109,7 +109,7 @@ public class BeiderMorseEncoder implemen /** * Gets the rule type currently in operation. - * + * * @return the RuleType currently being used */ public RuleType getRuleType() { @@ -118,7 +118,7 @@ public class BeiderMorseEncoder implemen /** * Discovers if multiple possible encodings are concatenated. - * + * * @return true if multiple encodings are concatenated, false if just the first one is returned */ public boolean isConcat() { @@ -127,33 +127,55 @@ public class BeiderMorseEncoder implemen /** * Sets how multiple possible phonetic encodings are combined. - * + * * @param concat * true if multiple encodings are to be combined with a '|', false if just the first one is to be considered */ public void setConcat(boolean concat) { - this.engine = new PhoneticEngine(this.engine.getNameType(), this.engine.getRuleType(), concat); + this.engine = new PhoneticEngine(this.engine.getNameType(), + this.engine.getRuleType(), + concat, + this.engine.getMaxPhonemes()); } /** * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phoentic encodings optimized for Ashkenazi or * Sephardic Jewish family names. - * + * * @param nameType * the NameType in use */ public void setNameType(NameType nameType) { - this.engine = new PhoneticEngine(nameType, this.engine.getRuleType(), this.engine.isConcat()); + this.engine = new PhoneticEngine(nameType, + this.engine.getRuleType(), + this.engine.isConcat(), + this.engine.getMaxPhonemes()); } /** * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered. 
- * + * * @param ruleType * {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches */ public void setRuleType(RuleType ruleType) { - this.engine = new PhoneticEngine(this.engine.getNameType(), ruleType, this.engine.isConcat()); + this.engine = new PhoneticEngine(this.engine.getNameType(), + ruleType, + this.engine.isConcat(), + this.engine.getMaxPhonemes()); + } + + /** + * Sets the number of maximum of phonemes that shall be considered by the engine. + * + * @param maxPhonemes + * the maximum number of phonemes returned by the engine + */ + public void setMaxPhonemes(int maxPhonemes) { + this.engine = new PhoneticEngine(this.engine.getNameType(), + this.engine.getRuleType(), + this.engine.isConcat(), + maxPhonemes); } } Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1298118&r1=1298117&r2=1298118&view=diff ============================================================================== --- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original) +++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Wed Mar 7 21:02:59 2012 @@ -101,17 +101,22 @@ public class PhoneticEngine { * incompatible. 
* * @param phonemeExpr the expression to apply + * @param maxPhonemes the maximum number of phonemes to build up * @return a new phoneme builder containing the results of phonemeExpr applied to each phoneme * in turn */ - public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) { + public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr, int maxPhonemes) { Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>(); - for (Rule.Phoneme left : this.phonemes) { + EXPR: for (Rule.Phoneme left : this.phonemes) { for (Rule.Phoneme right : phonemeExpr.getPhonemes()) { Rule.Phoneme join = left.join(right); if (!join.getLanguages().isEmpty()) { - newPhonemes.add(join); + if (newPhonemes.size() < maxPhonemes) { + newPhonemes.add(join); + } else { + break EXPR; + } } } } @@ -168,9 +173,11 @@ public class PhoneticEngine { private PhonemeBuilder phonemeBuilder; private int i; + private int maxPhonemes; private boolean found; - public RulesApplication(List<Rule> finalRules, CharSequence input, PhonemeBuilder phonemeBuilder, int i) { + public RulesApplication(List<Rule> finalRules, CharSequence input, + PhonemeBuilder phonemeBuilder, int i, int maxPhonemes) { if (finalRules == null) { throw new NullPointerException("The finalRules argument must not be null"); } @@ -178,6 +185,7 @@ public class PhoneticEngine { this.phonemeBuilder = phonemeBuilder; this.input = input; this.i = i; + this.maxPhonemes = maxPhonemes; } public int getI() { @@ -208,7 +216,7 @@ public class PhoneticEngine { continue RULES; } - this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme()); + this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes); this.found = true; break RULES; } @@ -289,6 +297,8 @@ public class PhoneticEngine { return sb.toString(); } + private static final int DEFAULT_MAX_PHONEMES = 20; + private final Lang lang; private final NameType nameType; @@ -297,9 +307,11 @@ public class PhoneticEngine { private final boolean concat; + private final int maxPhonemes; + /** * Generates a new,
fully-configured phonetic engine. - * + * * @param nameType * the type of names it will use * @param ruleType @@ -308,6 +320,22 @@ public class PhoneticEngine { * if it will concatenate multiple encodings */ public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat) { + this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES); + } + + /** + * Generates a new, fully-configured phonetic engine. + * + * @param nameType + * the type of names it will use + * @param ruleType + * the type of rules it will apply + * @param concat + * if it will concatenate multiple encodings + * @param maxPhonemes + * the maximum number of phonemes that will be handled + */ + public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat, int maxPhonemes) { if (ruleType == RuleType.RULES) { throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES); } @@ -315,6 +343,7 @@ public class PhoneticEngine { this.ruleType = ruleType; this.concat = concat; this.lang = Lang.instance(nameType); + this.maxPhonemes = maxPhonemes; } /** @@ -341,7 +370,8 @@ public class PhoneticEngine { // System.err.println("Expanding: " + phonemeText); for (int i = 0; i < phonemeText.length();) { - RulesApplication rulesApplication = new RulesApplication(finalRules, phonemeText, subBuilder, i).invoke(); + RulesApplication rulesApplication = + new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke(); boolean found = rulesApplication.isFound(); subBuilder = rulesApplication.getPhonemeBuilder(); @@ -459,7 +489,8 @@ public class PhoneticEngine { // loop over each char in the input - we will handle the increment manually CharSequence inputCache = cacheSubSequence(input); for (int i = 0; i < inputCache.length();) { - RulesApplication rulesApplication = new RulesApplication(rules, inputCache, phonemeBuilder, i).invoke(); + RulesApplication rulesApplication = + new RulesApplication(rules, inputCache, phonemeBuilder, i, maxPhonemes).invoke(); i = 
rulesApplication.getI(); phonemeBuilder = rulesApplication.getPhonemeBuilder(); // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString()); @@ -508,4 +539,13 @@ public class PhoneticEngine { public boolean isConcat() { return this.concat; } + + /** + * Gets the maximum number of phonemes the engine will calculate for a given input. + * + * @return the maximum number of phonemes + */ + public int getMaxPhonemes() { + return this.maxPhonemes; + } } Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java?rev=1298118&r1=1298117&r2=1298118&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java (original) +++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java Wed Mar 7 21:02:59 2012 @@ -19,6 +19,7 @@ package org.apache.commons.codec.languag import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoder; @@ -60,7 +61,7 @@ public class BeiderMorseEncoderTest exte public void testAllChars() throws EncoderException { BeiderMorseEncoder bmpm = createGenericApproxEncoder(); for (char c = Character.MIN_VALUE; c < Character.MAX_VALUE; c++) { - bmpm.encode("" + c); + bmpm.encode(Character.toString(c)); } } @@ -68,7 +69,7 @@ public class BeiderMorseEncoderTest exte public void testAsciiEncodeNotEmpty1Letter() throws EncoderException { BeiderMorseEncoder bmpm = createGenericApproxEncoder(); for (char c = 'a'; c <= 'z'; c++) { - final String value = "" + c; + final String value = Character.toString(c); final 
String valueU = value.toUpperCase(); assertNotEmpty(bmpm, value); assertNotEmpty(bmpm, valueU); @@ -138,6 +139,24 @@ public class BeiderMorseEncoderTest exte } @Test + public void testOOM() throws EncoderException { + String phrase = "200697900'-->�aadaabcf\"aedfbff?>cae" + + "cfaaa>&lang&fc;aadeaf?>>&bdquo< cc =\"abff\" />" + + "