opennlp-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ma...@apache.org
Subject svn commit: r1609600 [2/2] - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: ./ indexing/ scoring/
Date Fri, 11 Jul 2014 01:04:59 GMT
Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
(added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
Fri Jul 11 01:04:58 2014
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ * @author mgiaconia
+ */
+public class USGSProcessor {
+
+  public static void main(String[] args) {
+    try {
+      Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),
GazetteerIndexer.GazType.USGS);
+      process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"),
null, null);
+    } catch (Exception ex) {
+      Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
+    }
+  }
+
+  public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile,
IndexWriter w) throws Exception {
+    Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS);
+    readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData);
+    writeCountryContextFile(outputCountryContextfile, provData);
+  }
+
+  public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType
type, Map<String, AdminBoundary> lookupMap) throws Exception {
+
+    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+    List<String> fields = new ArrayList<>();
+    int counter = 0;
+    System.out.println("reading gazetteer data from USGS file...........");
+    String line = "";
+    while ((line = reader.readLine()) != null) {
+
+      String[] values = line.split(type.getSeparator());
+      if (counter == 0) {
+        for (String columnName : values) {
+          fields.add(columnName.replace("»¿", "").trim());
+        }
+
+      } else {
+        Document doc = new Document();
+        for (int i = 0; i < fields.size() - 1; i++) {
+          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+        }
+        String placeName = values[1];
+        String lat = values[9];
+        String lon = values[10];
+        String dsg = values[2];
+        String id = values[0];
+
+        String ccode = values[6];
+        String admincode = values[3];
+        AdminBoundary get = lookupMap.get(admincode + "." + ccode);
+        String countyname = "";
+        String countyCode = get.getCountyCode();
+        if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
+          countyname =  get.getCountyName();
+        }
+        if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {
+          countyCode = get.getCountyCode();
+        }
+        String hierarchy = get.getCountryName() + ", " + get.getProvinceName() +", "+ countyname
+ ", " + placeName;
+
+        doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+        doc.add(new TextField("placename", placeName, Field.Store.YES));
+        doc.add(new TextField("latitude", lat, Field.Store.YES));
+        doc.add(new TextField("longitude", lon, Field.Store.YES));
+        doc.add(new TextField("loctype", dsg, Field.Store.YES));
+        doc.add(new TextField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(),
Field.Store.YES));
+        doc.add(new TextField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));
+        doc.add(new TextField("countycode", (get.getCountryCode() + "." + get.getProvCode()
+ "." + countyCode).toLowerCase(), Field.Store.YES));
+
+        doc.add(new TextField("locid", id, Field.Store.YES));
+        doc.add(new TextField("gazsource", "usgs", Field.Store.YES));
+        w.addDocument(doc);
+      }
+      counter++;
+      if (counter % 100000 == 0) {
+        w.commit();
+        System.out.println(counter + " .........USGS entries committed to index..............");
+      }
+
+    }
+    w.commit();
+    System.out.println("Completed indexing USGS gaz!");
+  }
+
+  private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType
type) {
+ System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+    BufferedReader reader;
+
+    try {
+
+      reader = new BufferedReader(new FileReader(govUnitsFile));
+      int i = 0;
+      String line = "";
+      String[] fields = null;
+      while ((line = reader.readLine()) != null) {
+
+        String[] values = line.split(type.getSeparator());
+        if (i == 0) {
+          fields = values;
+          i++;
+          continue;
+        }
+        i++;
+        // System.out.println(i);
+        String countyCode = values[2];
+        String countyName = values[3];
+        String stateCode = values[5];
+        String stateName = values[6];
+        String countryCode = values[7];
+        String countryName = values[8];
+        AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode,
stateName, countyCode, countyName);
+        outmap.put(stateCode + "." + countyCode, adminBoundary);
+        //  System.out.println(adminBoundary);
+
+      }
+      reader.close();
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+  System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());
+
+    return outmap;
+
+  }
+
+  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary>
adms) {
+    // FileWriter writer = null;
+    try (FileWriter writer = new FileWriter(outfile, true)) {
+
+      for (String admkey : adms.keySet()) {
+        AdminBoundary adm = adms.get(admkey);
+        if (adm == null) {
+          continue;
+        }
+        String province = adm.getProvinceName();
+        String country = adm.getCountryName();
+        /**
+         * this is the standard format of the country context file... Geonames
+         * data will have an empty string for the county
+         */
+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode()
+ "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\n";
+        writer.write(line);
+      ///  System.out.println(line);
+
+      }
+      writer.close();
+    } catch (IOException ex) {
+      Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);
+    }
+    System.out.println("successfully wrote USGS entries to country oontext file");
+  }
+}

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
(from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
(original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
Fri Jul 11 01:04:58 2014
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
 
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -22,6 +22,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.BaseLink;
 import opennlp.tools.entitylinker.LinkedSpan;
@@ -29,20 +30,20 @@ import opennlp.tools.util.Span;
 
 /**
  * Scores toponyms based on their proximity to a country mention. Based on the
- * heuristic that typonymn mentions are more likely close to their parent
+ * heuristic that toponym mentions are more likely close to their parent
  * country mentions. For instance, if the toponym Berlin is mentioned near an
  * indicator of Germany, it is more likely to be Berlin Germany than Berlin
- * Connecticut.
+ * Connecticut (if Connecticut is mentioned further down in the article).
  *
  *
  */
-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
+public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext>
{
 
   private Map<String, Set<String>> nameCodesMap;
   String dominantCode = "";
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
 
     score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(),
docText, sentenceSpans, 1000);
 

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
(from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
(original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
Fri Jul 11 01:04:58 2014
@@ -13,43 +13,50 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
 
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.BaseLink;
 import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.ngram.NGramGenerator;
 import opennlp.tools.util.Span;
 
 /**
  *
  * Generates scores based on string comparisons (Levenshtein distance and Dice coefficient)
  */
-public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
+public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext>
{
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
     for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
       for (BaseLink link : linkedSpan.getLinkedEntries()) {
-        Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace("
", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
-        link.getScoreMap().put("dice", dice);
-        Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace("
", ""), link.getItemName().toLowerCase().replace(" ", ""));
-        link.getScoreMap().put("levenshtein", ld);
+        if (link instanceof GazetteerEntry) {
+          GazetteerEntry entry = (GazetteerEntry) link;
+          String hierarchy = entry.getHierarchy();
+          if (hierarchy != null) {
+            Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(),
2);
+            link.getScoreMap().put("hierarchydicecoef", dice);
+            Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(),
hierarchy.toLowerCase().toLowerCase());
+            link.getScoreMap().put("hierarchylevenshtein", ld);
+          }
+        }
       }
     }
 
-
   }
 
   /**
    * Generates a score based on an overlap of nGrams between two strings using
    * the DiceCoefficient technique.
    *
-   * @param s1     first string
-   * @param s2     second string
+   * @param s1 first string
+   * @param s2 second string
    * @param nGrams number of chars in each gram
    * @return
    */
@@ -57,8 +64,22 @@ public class FuzzyStringMatchScorer impl
     if (s1.equals("") || s1.equals("")) {
       return 0d;
     }
-    List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
-    List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
+    List<String> s1Grams = new ArrayList<>();
+    List<String> s2Grams = new ArrayList<>();
+    String[] split1 = s1.split("[ ,]");
+    for (String token : split1) {
+      if (token.trim().equals("")) {
+        continue;
+      }
+      s1Grams.add(token);
+    }
+    String[] split2 = s2.split("[ ,]");
+    for (String token : split2) {
+      if (token.trim().equals("")) {
+        continue;
+      }
+      s2Grams.add(token);
+    }
 
     Set<String> overlap = new HashSet<String>(s1Grams);
     overlap.retainAll(s2Grams);

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
(from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
(original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
Fri Jul 11 01:04:58 2014
@@ -13,11 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
 
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.BaseLink;
 import opennlp.tools.entitylinker.LinkedSpan;
@@ -29,13 +31,13 @@ import opennlp.tools.util.Span;
  * outliers by finding those points that are not near the majority
  *
  */
-public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
+public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext>
{
 
   private final PointClustering CLUSTERER = new PointClustering();
   private int PRECISION = 3;
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties,  AdminBoundaryContext additionalContext) {
      //Map<Double, Double> latLongs = new HashMap<Double, Double>();
     List<GazetteerEntry> allGazEntries = new ArrayList<>();
 

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
(from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
(original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
Fri Jul 11 01:04:58 2014
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
 
 import java.util.List;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
@@ -23,6 +23,7 @@ import opennlp.tools.util.Span;
 /**
  * Structure for scoring linked entities. The Map logically represents a pair :
  * "Score type" to the "actual Score."
+ * @param <T> a generic for providing additional context
  */
 public interface LinkedEntityScorer<T> {
 
@@ -32,6 +33,7 @@ public interface LinkedEntityScorer<T> {
  * @param linkedSpans the spans that have been linked to some external source and have all
the data they need to be scored
  * @param docText the full text of the document.
  * @param sentenceSpans the sentence spans that correspond to the document text
+   * @param properties the entitylinker properties config file
  * @param additionalContext any additional data required to perform the scoring operation
  * @return void
  */

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
(from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java&r1=1594067&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
(original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
Fri Jul 11 01:04:58 2014
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
 
 import java.io.File;
 import java.io.FileNotFoundException;
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
@@ -33,7 +34,7 @@ import org.apache.log4j.Logger;
  *
  * Utilizes a doccat model to score toponyms based on surrounding context
  */
-public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
+public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> {
 
   private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);
   DocumentCategorizerME documentCategorizerME;
@@ -42,7 +43,7 @@ public class ModelBasedScorer implements
   boolean modelexists = false;
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
     try {
       if (doccatModel == null) {
         String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath",
"");

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
(from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java&r1=1594067&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
(original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
Fri Jul 11 01:04:58 2014
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
 
 import com.spatial4j.core.context.SpatialContext;
 import com.spatial4j.core.io.GeohashUtils;
@@ -22,6 +22,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
 
 /**
  *
@@ -114,36 +115,6 @@ public class PointClustering {
     return point;
   }
 
-  /**
-   * Hashes a lat long based on adding 90 or 180 and then interlarding lat lon
-   * chars. reduces a set of points to a sortable set
-   *
-   * @param lat
-   * @param lon
-   * @return
-   */
-  public String simpleGeohash(Double lat, Double lon) {
-    String geoHash = "";
-    lat = lat + 90;
-    lon = lon + 180;
-    String latString = String.valueOf(lat).replace(".", "");
-    String lonString = String.valueOf(lon).replace(".", "");
-    int length = latString.length() > lonString.length() ? lonString.length() : latString.length();
-    while (length < 12) {
-      latString += "0";
-      lonString += "0";
-      length++;
-    }
-    latString = latString.substring(0, 10);
-    lonString = lonString.substring(0, 10);
-    char[] latChars = latString.toCharArray();
-    char[] lonChars = lonString.toCharArray();
-
-    for (int i = 0; i < latChars.length; i++) {
-      geoHash += String.valueOf(latChars[i]) + String.valueOf(lonChars[i]);
-    }
-    return geoHash;
-  }
 
   private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
     Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) +
0;

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
(from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
(original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
Fri Jul 11 01:04:58 2014
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
 
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -22,29 +22,39 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
 import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.LinkedSpan;
 import opennlp.tools.util.Span;
 
 /**
- * Scores toponyms based on their proximity to a country mention. Based on the
- * heuristic that typonymn mentions are more likely close to their parent
- * country mentions. For instance, if the toponym Berlin is mentioned near an
- * indicator of Germany, it is more likely to be Berlin Germany than Berlin
- * Connecticut.
+ * Scores toponyms based on their proximity to a province mention. Based on the
+ * heuristic that toponym mentions are more likely close to their parent
+ * province mentions. For instance, if the toponym Berlin is mentioned near an
+ * indicator of Connecticut, it is more likely to be Berlin Connecticut than
+ * Berlin Germany (if Germany did not exist in, or is mentioned further down in,
+ * the article).
  *
  *
  */
-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
+public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundaryContext>
{
 
   private Map<String, Set<String>> nameCodesMap;
   String dominantCode = "";
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, CountryContext additionalContext) {
-
-    score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(),
docText, sentenceSpans, 1000);
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+    if (!additionalContext.getProvHits().isEmpty()) {
+      score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(),
docText, sentenceSpans, 1000);
+    } else {
+      for (LinkedSpan<BaseLink> span : linkedSpans) {
+        for (BaseLink link : span.getLinkedEntries()) {
+          link.getScoreMap().put("provincecontext", Double.NaN);
+        }
+      }
+    }
 
   }
 
@@ -53,20 +63,19 @@ public class CountryProximityScorer impl
    * matches. Currently the scoring indicates the probability that the toponym
    * is correct based on the country context in the document
    *
-   * @param linkedData     the linked spans, holds the Namefinder results, and
-   *                       the list of BaseLink for each
-   * @param countryHits    all the country mentions in the document
-   * @param nameCodesMap   maps a country indicator name to a country code. Used
-   *                       to determine if the namefinder found the same exact
-   *                       toponym the country context did. If so the score is
-   *                       boosted due to the high probability that the
-   *                       NameFinder actually "rediscovered" a country
-   * @param docText        the full text of the document...not used in this
-   *                       default implementation
-   * @param sentences      the sentences that correspond to the doc text.
+   * @param linkedData the linked spans, holds the Namefinder results, and the
+   * list of BaseLink for each
+   * @param countryHits all the country mentions in the document
+   * @param nameCodesMap maps a province indicator name to a province code. Used
+   * to determine if the namefinder found the same exact toponym the country
+   * context did. If so the score is boosted due to the high probability that
+   * the NameFinder actually "rediscovered" a country
+   * @param docText the full text of the document...not used in this default
+   * implementation
+   * @param sentences the sentences that correspond to the doc text.
    * @param maxAllowedDist a constant that is used to determine which country
-   *                       mentions, based on proximity within the text, should
-   *                       be used to score the Named Entity.
+   * mentions, based on proximity within the text, should be used to score the
+   * Named Entity.
    * @return
    */
   public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>>
countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences,
Integer maxAllowedDist) {
@@ -149,34 +158,35 @@ public class CountryProximityScorer impl
     Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
     for (BaseLink link : span.getLinkedEntries()) {
       //getItemParentId is the country code
-      String spanCountryCode = link.getItemParentID();
+    GazetteerEntry entry = (GazetteerEntry)link;
+      String spanCountryCode = entry.getProvinceCode();
       if (scoreMap.containsKey(spanCountryCode)) {
 
         score = scoreMap.get(spanCountryCode);
         ///does the name extracted match a country name?
-        if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
+        if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {
           //if so, is it the correct country code for that name?
-          if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID()))
{
+          if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode()))
{
             //boost the score because it is likely that this is the location in the text,
so add 0.75 to the score, capping it at 1
             //TODO: make this smarter, and utilize province/state info in the future to be
even more specific
             score = (score + .75) > 1.0 ? 1d : (score + .75);
 
-            if (link.getItemParentID().equals(dominantCode)) {
+            if (entry.getProvinceCode().equals(dominantCode)) {
               score = (score + .25) > 1.0 ? 1d : (score + .25);
             }
           }
         }
       }
-      link.getScoreMap().put("countrycontext", score);
+      link.getScoreMap().put("provincecontext", score);
     }
     return span;
   }
 
   /**
-   * takes a map of distances from the toponym to each country mention and generates
-   * a map of scores for each country code. The map is then correlated to the
-   * code of the BaseLink parentid for retrieval. Then the
-   * score is added to the overall list.
+   * takes a map of distances from the toponym to each country mention and
+   * generates a map of scores for each country code. The map is then correlated
+   * to the code of the BaseLink parentid for retrieval. Then the score is added
+   * to the overall list.
    *
    * @param distanceMap
    * @param sentences
@@ -211,7 +221,6 @@ public class CountryProximityScorer impl
         normalizedDistances.add(reverse);
       }
 
-
       List<Double> doubles = new ArrayList<Double>(normalizedDistances);
       scoreMap.put(key, slidingDistanceAverage(doubles));
     }
@@ -257,8 +266,8 @@ public class CountryProximityScorer impl
    * range. Used to normalize distances in this class.
    *
    * @param valueToNormalize the value to place within the new range
-   * @param minimum          the min of the set to be transposed
-   * @param maximum          the max of the set to be transposed
+   * @param minimum the min of the set to be transposed
+   * @param maximum the max of the set to be transposed
    * @return
    */
   private Double normalize(int valueToNormalize, int minimum, int maximum) {



Mime
View raw message