ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1663771 - /ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/data/analysis/Utils.java
Date Tue, 03 Mar 2015 19:55:30 GMT
Author: dligach
Date: Tue Mar  3 19:55:29 2015
New Revision: 1663771

URL: http://svn.apache.org/r1663771
Log:
added code to read word embeddings from file

Modified:
    ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/data/analysis/Utils.java

Modified: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/data/analysis/Utils.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/data/analysis/Utils.java?rev=1663771&r1=1663770&r2=1663771&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/data/analysis/Utils.java (original)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/data/analysis/Utils.java Tue Mar  3 19:55:29 2015
@@ -19,8 +19,11 @@
 package org.apache.ctakes.relationextractor.data.analysis;
 
 import java.io.File;
+import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.ctakes.core.cr.XMIReader;
 import org.apache.ctakes.typesystem.type.syntax.WordToken;
@@ -30,11 +33,17 @@ import org.apache.uima.fit.util.JCasUtil
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import com.google.common.io.LineProcessor;
+
 /**
  * Various useful classes and methods.
  */
 public class Utils {
   
+  public static final String embeddingPath = "/Users/dima/Boston/Vectors/Python/sharp-arg-head-word-vectors.txt";
+  
   /**
    * Instantiate an XMI collection reader.
    */
@@ -69,4 +78,42 @@ public class Utils {
     WordToken lastToken = tokens.get(tokens.size() - 1);
     return lastToken.getCoveredText();
   }
+  
+  /**
+   * Read word embeddings from file.
+   */
+  public static class Callback implements LineProcessor <Map<String, List<Float>>> {
+    
+    private Map<String, List<Float>> wordToVector;
+    
+    public Callback() {
+      wordToVector = new HashMap<>();
+    }
+    
+    public boolean processLine(String line) throws IOException {
+      
+      String[] elements = line.split(" "); // e.g. skin -0.024690 0.108761 0.038441 -0.088759 ...
+      List<Float> vector = new ArrayList<>();
+      
+      for(int dimension = 1; dimension < elements.length; dimension++) {
+        vector.add(Float.parseFloat(elements[dimension]));
+      }
+      
+      wordToVector.put(elements[0], vector);
+      return true;
+    }
+    
+    public Map<String, List<Float>> getResult() {
+      
+      return wordToVector;
+    }
+  }
+  
+  public static void main(String[] args) throws IOException {
+    
+    File word2vec = new File(embeddingPath);
+    Map<String, List<Float>> data = Files.readLines(word2vec, Charsets.UTF_8, new Callback());
+    System.out.println(data.get("skin"));
+    System.out.println(data.get("oov"));
+  }
 }



Mime
View raw message