Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id CB921200B33 for ; Wed, 29 Jun 2016 22:05:45 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id CA102160A57; Wed, 29 Jun 2016 20:05:45 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id BD8F7160A3C for ; Wed, 29 Jun 2016 22:05:44 +0200 (CEST) Received: (qmail 96219 invoked by uid 500); 29 Jun 2016 20:05:43 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 96210 invoked by uid 99); 29 Jun 2016 20:05:43 -0000 Received: from pnap-us-west-generic-nat.apache.org (HELO spamd1-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 29 Jun 2016 20:05:43 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd1-us-west.apache.org (ASF Mail Server at spamd1-us-west.apache.org) with ESMTP id 794EDC3B53 for ; Wed, 29 Jun 2016 20:05:43 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd1-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: 0.513 X-Spam-Level: X-Spam-Status: No, score=0.513 tagged_above=-999 required=6.31 tests=[KAM_ASCII_DIVIDERS=0.8, KAM_LAZY_DOMAIN_SECURITY=1, RP_MATCHES_RCVD=-1.287] autolearn=disabled Received: from mx1-lw-eu.apache.org ([10.40.0.8]) by localhost (spamd1-us-west.apache.org [10.40.0.7]) (amavisd-new, port 10024) with ESMTP id FtbElaklTTZG for ; Wed, 29 Jun 2016 20:05:41 +0000 (UTC) Received: from mailrelay1-us-west.apache.org (mailrelay1-us-west.apache.org [209.188.14.139]) by mx1-lw-eu.apache.org (ASF Mail Server at mx1-lw-eu.apache.org) with ESMTP id 2BC195F1E3 for ; Wed, 29 Jun 2016 20:05:41 +0000 (UTC) Received: from svn01-us-west.apache.org (svn.apache.org [10.41.0.6]) by mailrelay1-us-west.apache.org (ASF Mail Server at mailrelay1-us-west.apache.org) with ESMTP id C9601E01AB for ; Wed, 29 Jun 2016 20:05:39 +0000 (UTC) Received: from svn01-us-west.apache.org (localhost [127.0.0.1]) by svn01-us-west.apache.org (ASF Mail Server at svn01-us-west.apache.org) with ESMTP id 906DC3A019C for ; Wed, 29 Jun 2016 20:05:38 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1750710 - in /ctakes/trunk/ctakes-core: ./ src/main/java/org/apache/ctakes/core/cleartk/ Date: Wed, 29 Jun 2016 20:05:37 -0000 To: commits@ctakes.apache.org From: tmill@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20160629200538.906DC3A019C@svn01-us-west.apache.org> archived-at: Wed, 29 Jun 2016 20:05:46 -0000 Author: tmill Date: Wed Jun 29 20:05:37 2016 New Revision: 1750710 URL: http://svn.apache.org/viewvc?rev=1750710&view=rev Log: Added some cleartk-derived feature extractors for working with embeddings/neural networks. Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java Modified: ctakes/trunk/ctakes-core/pom.xml Modified: ctakes/trunk/ctakes-core/pom.xml URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/pom.xml?rev=1750710&r1=1750709&r2=1750710&view=diff ============================================================================== --- ctakes/trunk/ctakes-core/pom.xml (original) +++ ctakes/trunk/ctakes-core/pom.xml Wed Jun 29 20:05:37 2016 @@ -109,5 +109,9 @@ org.apache.uima uimafit-core + + org.cleartk + cleartk-ml + Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java?rev=1750710&view=auto ============================================================================== --- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java (added) +++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java Wed Jun 29 20:05:37 2016 @@ -0,0 +1,79 @@ +package org.apache.ctakes.core.cleartk; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.core.resource.FileLocator; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.utils.distsem.WordEmbeddings; +import org.apache.ctakes.utils.distsem.WordVector; +import org.apache.ctakes.utils.distsem.WordVectorReader; +import org.apache.uima.jcas.JCas; +import org.cleartk.ml.feature.extractor.CleartkExtractorException; +import org.cleartk.ml.feature.extractor.NamedFeatureExtractor1; +import org.cleartk.ml.Feature; + + +public class ContinuousTextExtractor implements NamedFeatureExtractor1 { + public enum OovStrategy {OOV_FEATURE, EMPTY_VECTOR, MEAN_VECTOR} + + private int dims; + private WordEmbeddings words = null; + private OovStrategy oovStrategy = null; + + public ContinuousTextExtractor(String vecFile) throws + CleartkExtractorException { + this(vecFile, OovStrategy.OOV_FEATURE); + } + + public ContinuousTextExtractor(String vecFile, OovStrategy oovStrategy) throws + CleartkExtractorException { + super(); + try { + words = + WordVectorReader.getEmbeddings(FileLocator.getAsStream(vecFile)); + } catch (IOException e) { + e.printStackTrace(); + throw new CleartkExtractorException(e); + } + this.oovStrategy = oovStrategy; + } + @Override + public List extract(JCas view, BaseToken token) throws + CleartkExtractorException { + List feats = new ArrayList<>(); + + String wordText = token.getCoveredText(); + WordVector vec = null; + if(words.containsKey(wordText)){ + vec = words.getVector(wordText); + }else if(words.containsKey(wordText.toLowerCase())){ + vec = words.getVector(wordText.toLowerCase()); + }else{ + if(this.oovStrategy == OovStrategy.OOV_FEATURE){ + feats.add(new Feature(getFeatureName(), "OOV")); + return feats; + }else if(this.oovStrategy == OovStrategy.EMPTY_VECTOR){ + vec = new WordVector("_empty_", new double[words.getDimensionality()]); + }else if(this.oovStrategy == OovStrategy.MEAN_VECTOR){ + vec = words.getMeanVector(); + } + } + + for(int i = 0; i < vec.size(); i++){ + feats.add(new Feature(getFeatureName() + "_" + i, vec.getValue(i))); + } + return feats; + } + + public int getEmbeddingsDimensionality(){ + return words.getDimensionality(); + } + + @Override + public String getFeatureName() { + return "ContinuousText"; + } + +} Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java?rev=1750710&view=auto ============================================================================== --- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java (added) +++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java Wed Jun 29 20:05:37 2016 @@ -0,0 +1,59 @@ +package org.apache.ctakes.core.cleartk; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.ml.Feature; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Following; +import org.cleartk.ml.feature.extractor.CleartkExtractorException; +import org.cleartk.ml.feature.extractor.FeatureExtractor1; + +public class FollowingWithPadding extends Following { + + public int dims; + + public FollowingWithPadding(int end, int dims) { + super(end); + this.dims = dims; + } + + @Override + public List extract(JCas jCas, + Annotation focusAnnotation, Bounds bounds, + Class annotationClass, FeatureExtractor1 extractor) + throws CleartkExtractorException { + LinkedList rawFeats = new LinkedList<>(super.extract(jCas, focusAnnotation, bounds, annotationClass, extractor)); + List processedFeats = new ArrayList<>(); + + for(Feature feat : rawFeats){ + if(feat.getValue().toString().startsWith("OOB")){ + // add one feature for each dimension and set it to 0. + for(int j = 0; j < this.dims; j++){ + processedFeats.add(new Feature(feat.getName() + "_" + j, 0.0)); + } + }else{ + processedFeats.add(feat); + } + } + return processedFeats; + } + + /* + @Override + protected List select(JCas jCas, + Annotation focusAnnotation, Class annotationClass, int count) { + List validList = super.select(jCas, focusAnnotation, annotationClass, count); + + // Pad the end of the list with repeats of the last element + while(validList.size() < count){ + validList.add(validList.get(validList.size()-1)); + } + + return validList; + } + */ +} Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java?rev=1750710&view=auto ============================================================================== --- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java (added) +++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java Wed Jun 29 20:05:37 2016 @@ -0,0 +1,74 @@ +package org.apache.ctakes.core.cleartk; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.ml.Feature; +import org.cleartk.ml.feature.extractor.CleartkExtractor; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Context; +import org.cleartk.ml.feature.extractor.CleartkExtractorException; +import org.cleartk.ml.feature.extractor.FeatureExtractor1; + +public class MaxContext implements CleartkExtractor.Context { + + private Context[] contexts; + + private String name; + + /** + * Constructs a {@link Context} which converts the features extracted by the argument contexts + * into a bag of features where all features have the same name. + * + * @param contexts + * The contexts which should be combined into a bag. + */ + public MaxContext(Context... contexts) { + this.contexts = contexts; + String[] names = new String[contexts.length + 1]; + names[0] = "Max"; + for (int i = 1; i < names.length; ++i) { + names[i] = contexts[i - 1].getName(); + } + this.name = Feature.createName(names); + } + + public String getName() { + return this.name; + } + + public List extract(JCas jCas, + Annotation focusAnnotation, Bounds bounds, + Class annotationClass, FeatureExtractor1 extractor) + throws CleartkExtractorException { + HashMap runningTotals = new HashMap<>(); + + for (Context context : this.contexts) { + for (Feature feature : context.extract( + jCas, + focusAnnotation, + bounds, + annotationClass, + extractor)) { + try{ + double val = Double.parseDouble(feature.getValue().toString()); + if(!runningTotals.containsKey(feature.getName())){ + runningTotals.put(feature.getName(), 0.0); + } + runningTotals.put(feature.getName(), Double.max(runningTotals.get(feature.getName()), val)); + }catch(Exception e){ + // just ignore this feature? + } + } + } + List features = new ArrayList<>(); + for(String key : runningTotals.keySet()){ + features.add(new Feature(this.name + "_" + key, runningTotals.get(key))); + } + return features; + } + +} Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java?rev=1750710&view=auto ============================================================================== --- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java (added) +++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java Wed Jun 29 20:05:37 2016 @@ -0,0 +1,74 @@ +package org.apache.ctakes.core.cleartk; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.ml.Feature; +import org.cleartk.ml.feature.extractor.CleartkExtractor; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Context; +import org.cleartk.ml.feature.extractor.CleartkExtractorException; +import org.cleartk.ml.feature.extractor.FeatureExtractor1; + +public class MinContext implements CleartkExtractor.Context { + + private Context[] contexts; + + private String name; + + /** + * Constructs a {@link Context} which converts the features extracted by the argument contexts + * into a bag of features where all features have the same name. + * + * @param contexts + * The contexts which should be combined into a bag. + */ + public MinContext(Context... contexts) { + this.contexts = contexts; + String[] names = new String[contexts.length + 1]; + names[0] = "Min"; + for (int i = 1; i < names.length; ++i) { + names[i] = contexts[i - 1].getName(); + } + this.name = Feature.createName(names); + } + + public String getName() { + return this.name; + } + + public List extract(JCas jCas, + Annotation focusAnnotation, Bounds bounds, + Class annotationClass, FeatureExtractor1 extractor) + throws CleartkExtractorException { + HashMap runningTotals = new HashMap<>(); + + for (Context context : this.contexts) { + for (Feature feature : context.extract( + jCas, + focusAnnotation, + bounds, + annotationClass, + extractor)) { + try{ + double val = Double.parseDouble(feature.getValue().toString()); + if(!runningTotals.containsKey(feature.getName())){ + runningTotals.put(feature.getName(), 0.0); + } + runningTotals.put(feature.getName(), Double.min(runningTotals.get(feature.getName()), val)); + }catch(Exception e){ + // just ignore this feature? + } + } + } + List features = new ArrayList<>(); + for(String key : runningTotals.keySet()){ + features.add(new Feature(this.name + "_" + key, runningTotals.get(key))); + } + return features; + } + +} Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java?rev=1750710&view=auto ============================================================================== --- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java (added) +++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java Wed Jun 29 20:05:37 2016 @@ -0,0 +1,44 @@ +package org.apache.ctakes.core.cleartk; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.ml.Feature; +import org.cleartk.ml.feature.extractor.CleartkExtractorException; +import org.cleartk.ml.feature.extractor.FeatureExtractor1; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding; + +public class PrecedingWithPadding extends Preceding { + + public int dims; + + public PrecedingWithPadding(int end, int dims){ + super(0, end); + this.dims = dims; + } + + @Override + public List extract(JCas jCas, + Annotation focusAnnotation, Bounds bounds, + Class annotationClass, FeatureExtractor1 extractor) + throws CleartkExtractorException { + LinkedList rawFeats = new LinkedList<>(super.extract(jCas, focusAnnotation, bounds, annotationClass, extractor)); + List processedFeats = new ArrayList<>(); + + for(Feature feat : rawFeats){ + if(feat.getValue().toString().startsWith("OOB")){ + // add one feature for each dimension and set it to 0. + for(int j = 0; j < this.dims; j++){ + processedFeats.add(new Feature(feat.getName() + "_" + j, 0.0)); + } + }else{ + processedFeats.add(feat); + } + } + return processedFeats; + } +} Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java?rev=1750710&view=auto ============================================================================== --- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java (added) +++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java Wed Jun 29 20:05:37 2016 @@ -0,0 +1,75 @@ +package org.apache.ctakes.core.cleartk; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.ml.Feature; +import org.cleartk.ml.feature.extractor.CleartkExtractor; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds; +import org.cleartk.ml.feature.extractor.CleartkExtractor.Context; +import org.cleartk.ml.feature.extractor.CleartkExtractorException; +import org.cleartk.ml.feature.extractor.FeatureExtractor1; + +public class SumContext implements CleartkExtractor.Context { + + private Context[] contexts; + + private String name; + + /** + * Constructs a {@link Context} which converts the features extracted by the argument contexts + * into a bag of features where all features have the same name. + * + * @param contexts + * The contexts which should be combined into a bag. + */ + public SumContext(Context... contexts) { + this.contexts = contexts; + String[] names = new String[contexts.length + 1]; + names[0] = "Sum"; + for (int i = 1; i < names.length; ++i) { + names[i] = contexts[i - 1].getName(); + } + this.name = Feature.createName(names); + } + + public String getName() { + return this.name; + } + + public List extract(JCas jCas, + Annotation focusAnnotation, Bounds bounds, + Class annotationClass, FeatureExtractor1 extractor) + throws CleartkExtractorException { + LinkedHashMap runningTotals = new LinkedHashMap<>(); + + for (Context context : this.contexts) { + for (Feature feature : context.extract( + jCas, + focusAnnotation, + bounds, + annotationClass, + extractor)) { + try{ + double val = Double.parseDouble(feature.getValue().toString()); + if(!runningTotals.containsKey(feature.getName())){ + runningTotals.put(feature.getName(), 0.0); + } + runningTotals.put(feature.getName(), runningTotals.get(feature.getName()) + val); + }catch(Exception e){ + // just ignore this feature? + } + } + } + List features = new ArrayList<>(); + for(String key : runningTotals.keySet()){ + features.add(new Feature(this.name + "_" + key, runningTotals.get(key))); + } + return features; + } + +}