Return-Path: X-Original-To: apmail-commons-notifications-archive@minotaur.apache.org Delivered-To: apmail-commons-notifications-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 17A64172DC for ; Sun, 26 Apr 2015 10:18:34 +0000 (UTC) Received: (qmail 52687 invoked by uid 500); 26 Apr 2015 10:18:34 -0000 Delivered-To: apmail-commons-notifications-archive@commons.apache.org Received: (qmail 52660 invoked by uid 500); 26 Apr 2015 10:18:34 -0000 Mailing-List: contact notifications-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list notifications@commons.apache.org Received: (qmail 52467 invoked by uid 99); 26 Apr 2015 10:18:33 -0000 Received: from eris.apache.org (HELO hades.apache.org) (140.211.11.105) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 26 Apr 2015 10:18:33 +0000 Received: from hades.apache.org (localhost [127.0.0.1]) by hades.apache.org (ASF Mail Server at hades.apache.org) with ESMTP id B291DAC1436 for ; Sun, 26 Apr 2015 10:18:33 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r949214 [35/35] - in /websites/production/commons/content/sandbox/commons-text: ./ apidocs/ apidocs/org/apache/commons/text/diff/ apidocs/org/apache/commons/text/diff/class-use/ apidocs/org/apache/commons/text/names/ apidocs/org/apache/comm... 
Date: Sun, 26 Apr 2015 10:18:28 -0000 To: notifications@commons.apache.org From: britter@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20150426101833.B291DAC1436@hades.apache.org> Added: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/RegexTokenizer.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/RegexTokenizer.html (added) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/RegexTokenizer.html Sun Apr 26 10:18:25 2015 @@ -0,0 +1,63 @@ + + + +RegexTokenizer xref + + + +
+1   /*
+2    * Licensed to the Apache Software Foundation (ASF) under one or more
+3    * contributor license agreements.  See the NOTICE file distributed with
+4    * this work for additional information regarding copyright ownership.
+5    * The ASF licenses this file to You under the Apache License, Version 2.0
+6    * (the "License"); you may not use this file except in compliance with
+7    * the License.  You may obtain a copy of the License at
+8    *
+9    *      http://www.apache.org/licenses/LICENSE-2.0
+10   *
+11   * Unless required by applicable law or agreed to in writing, software
+12   * distributed under the License is distributed on an "AS IS" BASIS,
+13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+14   * See the License for the specific language governing permissions and
+15   * limitations under the License.
+16   */
+17  package org.apache.commons.text.similarity;
+18  
+19  import java.util.ArrayList;
+20  import java.util.List;
+21  import java.util.regex.Matcher;
+22  import java.util.regex.Pattern;
+23  
+24  /**
+25   * A simple word tokenizer that utilizes regex to find words. It applies a regex
+26   * {@code}(\w)+{@code} over the input text to extract words from a given character
+27   * sequence.
+28   */
+29  class RegexTokenizer implements Tokenizer<CharSequence> {
+30  
+31      /**
+32       * {@inheritDoc}
+33       *
+34       * @throws IllegalArgumentException if the input text is blank
+35       */
+36      @Override
+37      public CharSequence[] tokenize(CharSequence text) {
+38          if (text == null || text.toString().trim().equals("")) {
+39              throw new IllegalArgumentException("Invalid text");
+40          }
+41          Pattern pattern = Pattern.compile("(\\w)+");
+42          Matcher matcher = pattern.matcher(text.toString());
+43          List<String> tokens = new ArrayList<String>();
+44          while (matcher.find()) {
+45              tokens.add(matcher.group(0));
+46          }
+47          return tokens.toArray(new String[0]);
+48      }
+49  
+50  }
+
+
+ + + \ No newline at end of file Propchange: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/RegexTokenizer.html ------------------------------------------------------------------------------ svn:eol-style = native Added: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/Tokenizer.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/Tokenizer.html (added) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/Tokenizer.html Sun Apr 26 10:18:25 2015 @@ -0,0 +1,47 @@ + + + +Tokenizer xref + + + +
+1   /*
+2    * Licensed to the Apache Software Foundation (ASF) under one or more
+3    * contributor license agreements.  See the NOTICE file distributed with
+4    * this work for additional information regarding copyright ownership.
+5    * The ASF licenses this file to You under the Apache License, Version 2.0
+6    * (the "License"); you may not use this file except in compliance with
+7    * the License.  You may obtain a copy of the License at
+8    *
+9    *      http://www.apache.org/licenses/LICENSE-2.0
+10   *
+11   * Unless required by applicable law or agreed to in writing, software
+12   * distributed under the License is distributed on an "AS IS" BASIS,
+13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+14   * See the License for the specific language governing permissions and
+15   * limitations under the License.
+16   */
+17  package org.apache.commons.text.similarity;
+18  
/**
 * Contract for components that turn text into tokens. Implementations produce
 * an array whose element type is the type parameter.
 *
 * @param <T> the type of the tokens produced
 */
interface Tokenizer<T> {

    /**
     * Produces the tokens for the supplied text.
     *
     * @param text the input text to tokenize
     * @return the tokens as an array
     */
    T[] tokenize(CharSequence text);

}
+
+
+ + + \ No newline at end of file Propchange: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/Tokenizer.html ------------------------------------------------------------------------------ svn:eol-style = native Modified: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-frame.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-frame.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-frame.html Sun Apr 26 10:18:25 2015 @@ -22,6 +22,9 @@ CosineSimilarity
  • + Counter +
  • +
  • EditDistance
  • @@ -39,6 +42,12 @@
  • LevenshteinDistance
  • +
  • + RegexTokenizer +
  • +
  • + Tokenizer +
  • Modified: websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-summary.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-summary.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/org/apache/commons/text/similarity/package-summary.html Sun Apr 26 10:18:25 2015 @@ -47,6 +47,11 @@ + Counter + + + + EditDistance @@ -75,6 +80,16 @@ LevenshteinDistance + + + RegexTokenizer + + + + + Tokenizer + + Modified: websites/production/commons/content/sandbox/commons-text/xref/overview-frame.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/overview-frame.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/overview-frame.html Sun Apr 26 10:18:25 2015 @@ -24,11 +24,7 @@
  • org.apache.commons.text.similarity
  • -
  • - org.apache.commons.text.similarity.internal -
  • - Modified: websites/production/commons/content/sandbox/commons-text/xref/overview-summary.html ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/overview-summary.html (original) +++ websites/production/commons/content/sandbox/commons-text/xref/overview-summary.html Sun Apr 26 10:18:25 2015 @@ -48,11 +48,6 @@ org.apache.commons.text.similarity - - - org.apache.commons.text.similarity.internal - - Modified: websites/production/commons/content/sandbox/commons-text/xref/stylesheet.css ============================================================================== --- websites/production/commons/content/sandbox/commons-text/xref/stylesheet.css (original) +++ websites/production/commons/content/sandbox/commons-text/xref/stylesheet.css Sun Apr 26 10:18:25 2015 @@ -111,4 +111,4 @@ hr { .jxr_keyword { color: #000; -} +} \ No newline at end of file