commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ki...@apache.org
Subject [1/7] [text] salutations: matches salutations. Miss, Dr, ..
Date Sun, 20 Nov 2016 06:57:40 GMT
Repository: commons-text
Updated Branches:
  refs/heads/master ebb2a9223 -> 6fd10f89a


salutations: matches salutations. Miss, Dr, ..


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/1c640335
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/1c640335
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/1c640335

Branch: refs/heads/master
Commit: 1c6403353bad890d0338d7f2d50274c7d79e4a3b
Parents: a0178d0
Author: Tom MacKenzie <tom.s.mackenzie@gmail.com>
Authored: Mon Jun 20 11:13:42 2016 -0500
Committer: Tom MacKenzie <tom.s.mackenzie@gmail.com>
Committed: Mon Jun 20 11:13:42 2016 -0500

----------------------------------------------------------------------
 .gitignore                                      |  1 +
 .../commons/text/names/HumanNameParser.java     | 11 +++-
 .../org/apache/commons/text/names/Name.java     |  9 ++-
 .../commons/text/names/HumanNameParserTest.java |  6 +-
 .../org/apache/commons/text/names/testNames.txt | 65 ++++++++++----------
 5 files changed, 57 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index c8130e7..7eaf4b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 # Include only files generated during build, and avoid IDE specific files 
 target/
 site-content
+*.iml
 

http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/main/java/org/apache/commons/text/names/HumanNameParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/names/HumanNameParser.java b/src/main/java/org/apache/commons/text/names/HumanNameParser.java
index d713e9f..6780194 100644
--- a/src/main/java/org/apache/commons/text/names/HumanNameParser.java
+++ b/src/main/java/org/apache/commons/text/names/HumanNameParser.java
@@ -100,6 +100,7 @@ import org.apache.commons.lang3.StringUtils;
  */
 public final class HumanNameParser {
 
+    private final List<String> salutations;
     private final List<String> suffixes;
     private final List<String> prefixes;
 
@@ -108,6 +109,10 @@ public final class HumanNameParser {
      */
     public HumanNameParser() {
         // TODO make this configurable
+        this.salutations = Arrays.asList(
+                "Mr",  "Mrs", "Ms", "Miss", "Dr",
+                "Mr.",  "Mrs.", "Ms.", "Miss.", "Dr."
+        );
         this.suffixes = Arrays.asList(
                 "esq", "esquire", "jr",
                 "sr", "2", "ii", "iii", "iv");
@@ -131,6 +136,7 @@ public final class HumanNameParser {
 
         NameString nameString = new NameString(name);
         // TODO compile regexes only once when the parser is created
+        String salutations = StringUtils.join(this.salutations, " |") + "";
         String suffixes = StringUtils.join(this.suffixes, "\\.*|") + "\\.*";
         String prefixes = StringUtils.join(this.prefixes, " |") + " ";
 
@@ -138,6 +144,7 @@ public final class HumanNameParser {
         // but you can select a particular parenthesized submatch to be returned.
         // Also, note that each regex requres that the preceding ones have been run, and matches chopped out.
         // names that starts or end w/ an apostrophe break this
+        String salutationRegex = "^(("+salutations+"))";
         String nicknamesRegex = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) ";
         String suffixRegex = "(?i),* *((" + suffixes + ")$)";
         String lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$";
@@ -145,6 +152,8 @@ public final class HumanNameParser {
         String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))";
         String firstRegex = "(?i)^([^ ]+)";
 
+        String salutation = nameString.chopWithRegex(salutationRegex, 1);
+
         // get nickname, if there is one
         String nickname = nameString.chopWithRegex(nicknamesRegex, 2);
 
@@ -169,7 +178,7 @@ public final class HumanNameParser {
         // if anything's left, that's the middle name
         String middle = nameString.getWrappedString();
         
-        return new Name(leadingInit, first, nickname, middle, last, suffix);
+        return new Name(leadingInit, salutation, first, nickname, middle, last, suffix);
     }
 
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/main/java/org/apache/commons/text/names/Name.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/names/Name.java b/src/main/java/org/apache/commons/text/names/Name.java
index 7e32de4..ef3d36a 100644
--- a/src/main/java/org/apache/commons/text/names/Name.java
+++ b/src/main/java/org/apache/commons/text/names/Name.java
@@ -26,14 +26,16 @@ import java.util.Objects;
 public final class Name {
 
     private final String leadingInitial;
+    private final String salutation;
     private final String firstName;
     private final String nickName;
     private final String middleName;
     private final String lastName;
     private final String suffix;
 
-    Name(String leadingInitial, String firstName, String nickName, String middleName, String lastName, String suffix) {
+    Name(String leadingInitial, String salutation, String firstName, String nickName, String middleName, String lastName, String suffix) {
         this.leadingInitial = leadingInitial;
+        this.salutation = salutation;
         this.firstName = firstName;
         this.nickName = nickName;
         this.middleName = middleName;
@@ -52,6 +54,11 @@ public final class Name {
         return leadingInitial;
     }
 
+
+    public String getSalutation() {
+        return salutation;
+    }
+
     /**
      * Gets the first name.
      *

http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java b/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java
index f6c9ba6..22c96cc 100644
--- a/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java
+++ b/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java
@@ -74,6 +74,10 @@ public class HumanNameParserTest {
         Name result = nameParser.parse(record.get(Columns.Name));
 
         long recordNum = record.getRecordNumber();
+
+        assertThat("Wrong LeadingInit in record " + recordNum,
+                result.getSalutation(), equalTo(record.get(Columns.Salutation)));
+
         assertThat("Wrong LeadingInit in record " + recordNum,
                 result.getLeadingInitial(), equalTo(record.get(Columns.LeadingInit)));
 
@@ -94,6 +98,6 @@ public class HumanNameParserTest {
     }
 
     private enum Columns {
-        Name,LeadingInit,FirstName,NickName,MiddleName,LastName,Suffix
+        Name,Salutation,LeadingInit,FirstName,NickName,MiddleName,LastName,Suffix
     }
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/test/resources/org/apache/commons/text/names/testNames.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/org/apache/commons/text/names/testNames.txt b/src/test/resources/org/apache/commons/text/names/testNames.txt
index 8e32bf1..2cd2b4d 100644
--- a/src/test/resources/org/apache/commons/text/names/testNames.txt
+++ b/src/test/resources/org/apache/commons/text/names/testNames.txt
@@ -1,32 +1,33 @@
-Name|LeadingInit|FirstName|NickName|MiddleName|LastName|Suffix
-Björn O'Malley||Björn|||O'Malley|
-Bin Lin||Bin|||Lin|
-Linda Jones||Linda|||Jones|
-Jason H. Priem||Jason||H.|Priem|
-Björn O'Malley-Muñoz||Björn|||O'Malley-Muñoz|
-Björn C. O'Malley||Björn||C.|O'Malley|
-Björn "Bill" O'Malley||Björn|Bill||O'Malley|
-Björn ("Bill") O'Malley||Björn|Bill||O'Malley|
-Björn ("Wild Bill") O'Malley||Björn|Wild Bill||O'Malley|
-Björn (Bill) O'Malley||Björn|Bill||O'Malley|
-Björn 'Bill' O'Malley||Björn|Bill||O'Malley|
-Björn C O'Malley||Björn||C|O'Malley|
-Björn C. R. O'Malley||Björn||C. R.|O'Malley|
-Björn Charles O'Malley||Björn||Charles|O'Malley|
-Björn Charles R. O'Malley||Björn||Charles R.|O'Malley|
-Björn van O'Malley||Björn|||van O'Malley|
-Björn Charles van der O'Malley||Björn||Charles|van der O'Malley|
-Björn Charles O'Malley y Muñoz||Björn||Charles|O'Malley y Muñoz|
-Björn O'Malley, Jr.||Björn|||O'Malley|Jr.
-Björn O'Malley Jr||Björn|||O'Malley|Jr
-B O'Malley||B|||O'Malley|
-William Carlos Williams||William||Carlos|Williams|
-C. Björn Roger O'Malley|C.|Björn||Roger|O'Malley|
-B. C. O'Malley||B.||C.|O'Malley|
-B C O'Malley||B||C|O'Malley|
-B.J. Thomas||B.J.|||Thomas|
-O'Malley, Björn||Björn|||O'Malley|
-O'Malley, Björn Jr||Björn|||O'Malley|Jr
-O'Malley, C. Björn|C.|Björn|||O'Malley|
-O'Malley, C. Björn III|C.|Björn|||O'Malley|III
-O'Malley y Muñoz, C. Björn Roger III|C.|Björn||Roger|O'Malley y Muñoz|III
\ No newline at end of file
+Name|Salutation|LeadingInit|FirstName|NickName|MiddleName|LastName|Suffix
+Björn O'Malley|||Björn|||O'Malley|
+Bin Lin|||Bin|||Lin|
+Linda Jones|||Linda|||Jones|
+Jason H. Priem|||Jason||H.|Priem|
+Björn O'Malley-Muñoz|||Björn|||O'Malley-Muñoz|
+Björn C. O'Malley|||Björn||C.|O'Malley|
+Björn "Bill" O'Malley|||Björn|Bill||O'Malley|
+Björn ("Bill") O'Malley|||Björn|Bill||O'Malley|
+Björn ("Wild Bill") O'Malley|||Björn|Wild Bill||O'Malley|
+Björn (Bill) O'Malley|||Björn|Bill||O'Malley|
+Björn 'Bill' O'Malley|||Björn|Bill||O'Malley|
+Björn C O'Malley|||Björn||C|O'Malley|
+Björn C. R. O'Malley|||Björn||C. R.|O'Malley|
+Björn Charles O'Malley|||Björn||Charles|O'Malley|
+Björn Charles R. O'Malley|||Björn||Charles R.|O'Malley|
+Björn van O'Malley|||Björn|||van O'Malley|
+Björn Charles van der O'Malley|||Björn||Charles|van der O'Malley|
+Björn Charles O'Malley y Muñoz|||Björn||Charles|O'Malley y Muñoz|
+Björn O'Malley, Jr.|||Björn|||O'Malley|Jr.
+Björn O'Malley Jr|||Björn|||O'Malley|Jr
+B O'Malley|||B|||O'Malley|
+William Carlos Williams|||William||Carlos|Williams|
+C. Björn Roger O'Malley||C.|Björn||Roger|O'Malley|
+B. C. O'Malley|||B.||C.|O'Malley|
+B C O'Malley|||B||C|O'Malley|
+B.J. Thomas|||B.J.|||Thomas|
+O'Malley, Björn|||Björn|||O'Malley|
+O'Malley, Björn Jr|||Björn|||O'Malley|Jr
+O'Malley, C. Björn||C.|Björn|||O'Malley|
+O'Malley, C. Björn III||C.|Björn|||O'Malley|III
+O'Malley y Muñoz, C. Björn Roger III||C.|Björn||Roger|O'Malley y Muñoz|III
+Dr. Gaius Baltar|Dr.||Gaius|||Baltar|
\ No newline at end of file


Mime
View raw message