ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1666500 - in /ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features: ./ cluster/ salience/
Date Fri, 13 Mar 2015 16:22:11 GMT
Author: tmill
Date: Fri Mar 13 16:22:11 2015
New Revision: 1666500

URL: http://svn.apache.org/r1666500
Log:
New features and modified features ported from temporal.

Added:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterPartOfSpeechFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java
Modified:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,44 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Emits the lower-cased covered text of each argument's nominal head node as a feature.
+ */
+public class CorefSyntaxFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  /**
+   * Builds up to two features, "Arg1Head" and "Arg2Head"; a feature is
+   * omitted when no head node is found for that argument.
+   *
+   * @param jCas the CAS view passed to the head lookup
+   * @param arg1 first argument of the candidate pair
+   * @param arg2 second argument of the candidate pair
+   * @return the head-word features, possibly empty
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    addHeadFeature(feats, "Arg1Head", DependencyUtility.getNominalHeadNode(jCas, arg1));
+    addHeadFeature(feats, "Arg2Head", DependencyUtility.getNominalHeadNode(jCas, arg2));
+    return feats;
+  }
+
+  // Appends name=lower-cased head text; does nothing when the head lookup returned null.
+  private static void addHeadFeature(List<Feature> feats, String name, ConllDependencyNode head){
+    if(head != null){
+      feats.add(new Feature(name, head.getCoveredText().toLowerCase()));
+    }
+  }
+
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java?rev=1666500&r1=1666499&r2=1666500&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -18,10 +18,10 @@ import org.apache.ctakes.utils.distsem.W
 import org.apache.ctakes.utils.distsem.WordVectorReader;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
-import org.cleartk.classifier.Feature;
-import org.uimafit.util.JCasUtil;
+import org.cleartk.ml.Feature;
+import org.apache.uima.fit.util.JCasUtil;
 
-public class DistSemFeatureExtractor implements RelationFeaturesExtractor {
+public class DistSemFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
 
   // default value is 0.5 (rather than 0.0) because we don't want to assume OOV words are dissimilar
   public static final double DEFAULT_SIM = 0.5;  

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java?rev=1666500&r1=1666499&r2=1666500&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -10,15 +10,15 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
-import org.cleartk.classifier.Feature;
-import org.uimafit.util.JCasUtil;
+import org.cleartk.ml.Feature;
+import org.apache.uima.fit.util.JCasUtil;
 
-public class DistanceFeatureExtractor implements RelationFeaturesExtractor {
+public class DistanceFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
 
 	@Override
 	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
 			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
-		List<Feature> feats = new ArrayList<Feature>();
+		List<Feature> feats = new ArrayList<>();
 		feats.add(new Feature("TOK_DIST",
 				  JCasUtil.selectCovered(jCas, BaseToken.class, arg1.getBegin(), arg2.getEnd()).size() / (double)CorefConst.TOKEN_DIST));
 		feats.add(new Feature("SENT_DIST",

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,69 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Features describing whether the antecedent and/or anaphor sit in a
+ * "header" paragraph, where a header is a Paragraph covering exactly one Sentence.
+ */
+public class SectionFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  /**
+   * Always emits "AnteInHeader" and "AnaInHeader"; additionally emits
+   * "AnteHeaderHeadsAna" when the antecedent is in a header paragraph and the
+   * anaphor lies in the paragraph immediately after the antecedent's.
+   *
+   * @param jcas view whose Paragraph and Sentence annotations are consulted
+   * @param ante candidate antecedent
+   * @param ana candidate anaphor
+   * @return the section features for this pair
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, IdentifiedAnnotation ante,
+      IdentifiedAnnotation ana) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    boolean anteInHeader = false;
+    boolean anaInHeader = false;
+    int antePar = -1;
+    int anaPar = -1;
+
+    List<Paragraph> pars = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+    for(int i = 0; i < pars.size(); i++){
+      Paragraph par = pars.get(i);
+      // stop scanning once paragraphs begin past the end of the anaphor
+      if(par.getBegin() > ana.getEnd()){
+        break;
+      }
+      boolean holdsAnte = ante.getBegin() >= par.getBegin() && ante.getEnd() <= par.getEnd();
+      boolean holdsAna = ana.getBegin() >= par.getBegin() && ana.getEnd() <= par.getEnd();
+      if(holdsAnte){
+        antePar = i;
+      }
+      if(holdsAna){
+        anaPar = i;
+      }
+      // only pay for the sentence lookup when this paragraph holds one of the mentions
+      if((holdsAnte || holdsAna) && JCasUtil.selectCovered(jcas, Sentence.class, par).size() == 1){
+        anteInHeader |= holdsAnte;
+        anaInHeader |= holdsAna;
+      }
+    }
+
+    feats.add(new Feature("AnteInHeader", anteInHeader));
+    feats.add(new Feature("AnaInHeader", anaInHeader));
+    if(anteInHeader && antePar+1 == anaPar){
+      feats.add(new Feature("AnteHeaderHeadsAna", true));
+    }
+    return feats;
+  }
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java?rev=1666500&r1=1666499&r2=1666500&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -9,18 +9,21 @@ import org.apache.ctakes.relationextract
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
-import org.cleartk.classifier.Feature;
-import org.uimafit.util.JCasUtil;
+import org.cleartk.ml.Feature;
 
 public class StringMatchingFeatureExtractor implements
-		RelationFeaturesExtractor {
+		RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
 
 	@Override
 	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
 			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
-		List<Feature> feats = new ArrayList<Feature>();
+		List<Feature> feats = new ArrayList<>();
+		
+		// don't extract sim features if one of the markables is a pronoun
+		if(isPronoun(arg1) || isPronoun(arg2)) return feats;
 		
 		String s1 = arg1.getCoveredText();
 		String s2 = arg2.getCoveredText();
@@ -53,8 +56,8 @@ public class StringMatchingFeatureExtrac
 	public static boolean endMatch (String a, String b) {
 		int ia = a.lastIndexOf(" ");
 		int ib = b.lastIndexOf(" ");
-		String aa = a.substring(ia==-1?(a.length()>5?a.length()-5:0):ia);
-		String bb = b.substring(ib==-1?(b.length()>5?b.length()-5:0):ib);
+		String aa = a.substring(ia==-1?(a.length()>5?a.length()-5:0):ia+1);
+		String bb = b.substring(ib==-1?(b.length()>5?b.length()-5:0):ib+1);
 		return aa.equalsIgnoreCase(bb);
 	}
 
@@ -89,10 +92,26 @@ public class StringMatchingFeatureExtrac
 	}
 	
 	public static Set<String> contentWords(Annotation a1){
-		Set<String> words = new HashSet<String>();
+		Set<String> words = new HashSet<>();
 		for(BaseToken tok : JCasUtil.selectCovered(BaseToken.class, a1)){
 			words.add(tok.getCoveredText().toLowerCase());
 		}
 		return words;
 	}
+	
+	/**
+	 * Reports whether a markable consists of exactly one token whose part-of-speech
+	 * tag starts with "PRP" or equals "DT".
+	 *
+	 * @param a1 annotation whose covered BaseTokens are inspected
+	 * @return false when the token count is not one or the tag is null
+	 */
+	public static boolean isPronoun(IdentifiedAnnotation a1){
+	  List<BaseToken> tokens = JCasUtil.selectCovered(BaseToken.class, a1);
+	  if(tokens.size() != 1){
+	    return false;
+	  }
+	  String pos = tokens.get(0).getPartOfSpeech();
+	  return pos != null && (pos.startsWith("PRP") || pos.equals("DT"));
+	}
 }

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,72 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Pair features derived from the DocTimeRel of the event covered by each argument's head node.
+ */
+public class TemporalFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  // placeholder reported when no DocTimeRel is found for an argument
+  private static final String NO_DTR = "NA";
+
+  /**
+   * Emits one feature per argument named "Arg1DTR_" / "Arg2DTR_" plus the DocTimeRel value,
+   * and "DTR_Match" when both values are equal and not the "NA" placeholder.
+   *
+   * @param jCas view used for the head and event lookups
+   * @param arg1 first argument of the candidate pair
+   * @param arg2 second argument of the candidate pair
+   * @return two or three boolean-valued features
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+
+    String a1dtr = getDocTimeRelForArg(jCas, arg1);
+    String a2dtr = getDocTimeRelForArg(jCas, arg2);
+    feats.add(new Feature("Arg1DTR_" + a1dtr, true));
+    feats.add(new Feature("Arg2DTR_" + a2dtr, true));
+
+    if(!NO_DTR.equals(a1dtr) && a1dtr.equals(a2dtr)){
+      feats.add(new Feature("DTR_Match", true));
+    }
+    return feats;
+  }
+
+  /**
+   * Returns the DocTimeRel of an EventMention covered by the argument's nominal head node.
+   * When several qualify the last one iterated wins; "NA" is returned when there is no
+   * head node or no qualifying event carries a non-null DocTimeRel.
+   */
+  private static String getDocTimeRelForArg(JCas jCas, IdentifiedAnnotation arg){
+    String dtr = NO_DTR;
+
+    ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg);
+    if(node == null){
+      return dtr;
+    }
+    for(EventMention event : JCasUtil.selectCovered(jCas, EventMention.class, node)){
+      // exact simple-name test: instances of EventMention subclasses are skipped
+      if(!event.getClass().getSimpleName().equals("EventMention")){
+        continue;
+      }
+      if(event.getEvent() != null && event.getEvent().getProperties() != null && event.getEvent().getProperties().getDocTimeRel() != null){
+        dtr = event.getEvent().getProperties().getDocTimeRel();
+      }
+    }
+    return dtr;
+  }
+}
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java?rev=1666500&r1=1666499&r2=1666500&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -3,20 +3,24 @@ package org.apache.ctakes.coreference.ae
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
-import org.cleartk.classifier.Feature;
-import org.uimafit.util.JCasUtil;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
 
-public class TokenFeatureExtractor implements RelationFeaturesExtractor {
+public class TokenFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
 
 	@Override
 	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
 			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
-		List<Feature> feats = new ArrayList<Feature>();
+		List<Feature> feats = new ArrayList<>();
 		
 		String s1 = arg1.getCoveredText().toLowerCase();
 		String s2 = arg2.getCoveredText().toLowerCase();
@@ -29,19 +33,53 @@ public class TokenFeatureExtractor imple
 		feats.add(new Feature("TOKEN_DEF1", isDefinite(s1)));
 		feats.add(new Feature("TOKEN_DEF2", isDefinite(s2)));
 		feats.add(new Feature("TOKEN_NUMAGREE",
-				numberSingular(arg1) == numberSingular(arg2)));
-		
+				numberSingular(jCas, arg1, s1) == numberSingular(jCas, arg2, s2)));
+
 		String gen1 = getGender(s1);
 		String gen2 = getGender(s2);
 		feats.add(new Feature("TOKEN_GEN1", gen1));
 		feats.add(new Feature("TOKEN_GEN2", gen2));
 		feats.add(new Feature("TOKEN_GENAGREE", gen1.equals(gen2)));
 		
-		String p1 = getPerson(s1);
-		String p2 = getPerson(s2);
-		feats.add(new Feature("TOKEN_PERSON1", p1));
-		feats.add(new Feature("TOKEN_PERSON2", p2));
-		feats.add(new Feature("TOKEN_PERSONAGREE", p1.equals(p2)));
+//		String p1 = getPerson(s1);
+//		String p2 = getPerson(s2);
+//		feats.add(new Feature("TOKEN_PERSON1", p1));
+//		feats.add(new Feature("TOKEN_PERSON2", p2));
+//		feats.add(new Feature("TOKEN_PERSONAGREE", p1.equals(p2)));
+//		feats.add(new Feature("TOKEN_PERSONPAIR", p1+"-"+p2));
+//		feats.add(new Feature("IS_TITLE1", isTitle(s1)));
+//		feats.add(new Feature("IS_TITLE2", isTitle(s2)));
+		
+//		feats.add(new Feature("IS_DOCTOR1", s1.startsWith("dr.")));
+//		feats.add(new Feature("IS_DOCTOR2", s2.startsWith("dr.")));
+//		feats.add(new Feature("BOTH_DOCTOR", s1.startsWith("dr.") && s2.startsWith("dr.")));
+		
+//		boolean a1IsHuman = false;
+//		boolean a2IsHuman = false;
+		
+		// if has some person (1st, 2nd, 3rd) or gender (masc., fem), is doctor
+//		a1IsHuman |= (!p1.equals("NONE"));
+//		a1IsHuman |= (!gen1.equals("NEUTER"));
+//		a1IsHuman |= (isTitle(s1));
+//		    
+//    a2IsHuman |= (!p2.equals("NONE"));
+//    a2IsHuman |= (!gen2.equals("NEUTER"));
+//    a2IsHuman |= (isTitle(s2));
+//		
+//		feats.add(new Feature("IS_HUMAN1", a1IsHuman));
+//		feats.add(new Feature("IS_HUMAN2", a2IsHuman));
+//		feats.add(new Feature("BOTH_HUMAN", a1IsHuman && a2IsHuman));
+//		feats.add(new Feature("NEITHER_HUMAN", !a1IsHuman && !a2IsHuman));
+		
+		// is it a section header?
+		List<BaseToken> nextToks = JCasUtil.selectFollowing(jCas, BaseToken.class, arg1, 1);
+		if(nextToks.size() > 0 && nextToks.get(0) instanceof NewlineToken){
+		  feats.add(new Feature("IS_HEADER1", true));
+		}
+		nextToks = JCasUtil.selectFollowing(jCas, BaseToken.class, arg2, 1);
+		if(nextToks.size() > 0 && nextToks.get(0) instanceof NewlineToken){
+		  feats.add(new Feature("IS_HEADER2", true));
+		}
 		return feats;
 	}
 	
@@ -49,9 +87,10 @@ public class TokenFeatureExtractor imple
 		if (s.startsWith("this") ||
 				s.startsWith("that") ||
 				s.startsWith("these") ||
-				s.startsWith("those"))
+				s.startsWith("those")){
 				return true;
-		else return false;
+		}
+		return false;
 	}
 	
 	public static boolean isDefinite (String s) {
@@ -60,34 +99,39 @@ public class TokenFeatureExtractor imple
 
 	// FYI - old code used treebanknode types and found head using head rules filled in by the parser
 	// not sure if there is an appreciable difference...
-	public static boolean numberSingular(IdentifiedAnnotation arg){
-		List<BaseToken> tokens = new ArrayList<BaseToken>(JCasUtil.selectCovered(BaseToken.class, arg));
-		for (int i = tokens.size()-1; i >=0; i--){
-			BaseToken t = tokens.get(i);
-			String pos = t.getPartOfSpeech();
-			if ("NN".equals(pos) || "NNP".equals(pos)){
-				return true;
-			}else if ("NNS".equals(pos) || "NNPS".equals(pos)){
-				return false;
-			}else if(t.getCoveredText().toLowerCase().equals("we")){
-			  return true;
-			}
-		}
+	public static boolean numberSingular(JCas jcas, Annotation arg, String s1){
+	  ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, arg);
+//		List<BaseToken> tokens = new ArrayList<>(JCasUtil.selectCovered(BaseToken.class, arg));
+//		for (int i = tokens.size()-1; i >=0; i--){
+//			BaseToken t = tokens.get(i);
+//			String pos = t.getPartOfSpeech();
+	  if(head != null && head.getPostag() != null){
+	    String pos = head.getPostag();
+	    if ("NN".equals(pos) || "NNP".equals(pos)){
+	      return true;
+	    }else if ("NNS".equals(pos) || "NNPS".equals(pos)){
+	      return false;
+	    }else if(s1.equals("we") || s1.equals("they")){
+	      return false;
+	    }
+	  }
+//		}
 		return true;
 	}
 	
 	public static String getGender(String s1){
-	  if(s1.equals("he") || s1.equals("his") || s1.startsWith("mr.")) return "MALE";
+	  if(s1.equals("he") || s1.equals("his") || s1.equals("him") || s1.startsWith("mr.")) return "MALE";
 	  else if(s1.equals("she") || s1.equals("her") || s1.startsWith("mrs.") || s1.startsWith("ms.")) return "FEMALE";
 	  else return "NEUTER";
 	}
 	
 	public static String getPerson(String s1){
-	  if(s1.equals("i")) return "FIRST";
+	  if(s1.equals("i") || s1.equals("my")) return "FIRST";
 	  else if(s1.equals("he") || s1.equals("she") || s1.equals("his") || s1.equals("her") || s1.equals("hers")){
 	    return "THIRD";
 	  }else if(s1.equals("you") || s1.equals("your")) return "SECOND";
-	  else if(s1.equals("we")) return "FIRST_PLURAL";
+	  else if(s1.equals("we") || s1.equals("our")) return "FIRST_PLURAL";
+	  else if(s1.equals("they") || s1.equals("their")) return "THIRD_PLURAL";
 	  else return "NONE";
 	}
 	
@@ -95,4 +139,17 @@ public class TokenFeatureExtractor imple
 	  if(s1.equals("i")) return true;
 	  return false;
 	}
+	
+	/**
+	 * Tests whether the text begins with one of the titles "dr.", "mr.", "mrs." or "ms.".
+	 * The comparison is case-sensitive.
+	 */
+	public static boolean isTitle(String s1){
+	  for(String title : new String[]{"dr.", "mr.", "mrs.", "ms."}){
+	    if(s1.startsWith(title)){
+	      return true;
+	    }
+	  }
+	  return false;
+	}
 }

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java?rev=1666500&r1=1666499&r2=1666500&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -1,43 +1,131 @@
 package org.apache.ctakes.coreference.ae.features;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
-import org.cleartk.classifier.Feature;
-import org.uimafit.util.JCasUtil;
+import org.cleartk.ml.Feature;
+import org.cleartk.util.ViewUriUtil;
 
-public class UMLSFeatureExtractor implements RelationFeaturesExtractor {
+public class UMLSFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
 
+  String docId = null;
+  Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> coveringMap = null;
+  
 	@Override
 	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
 			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
-		List<Feature> feats = new ArrayList<Feature>();
+		List<Feature> feats = new ArrayList<>();
+		
+		if(docId == null || !getDocId(jCas).equals(docId)){
+		  docId = getDocId(jCas);
+		  coveringMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+		}
+		
 		
 		if(arg1 instanceof Markable && arg2 instanceof Markable){
+//		  feats.add(new Feature("AntecedentSalience", arg1.getConfidence()));
+//		  feats.add(new Feature("AnaphorSalience", arg2.getConfidence()));
+		  
 		  // get the head of each markable
 		  ConllDependencyNode head1 = DependencyUtility.getNominalHeadNode(jCas, arg1);
 		  ConllDependencyNode head2 = DependencyUtility.getNominalHeadNode(jCas, arg2);
+		  List<IdentifiedAnnotation> rmList = new ArrayList<>();
 		  
 		  if(head1 != null && head2 != null){
-		    List<IdentifiedAnnotation> ents1 = JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());
-		    List<IdentifiedAnnotation> ents2 = JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
-
+		    List<IdentifiedAnnotation> ents1 = new ArrayList<>(coveringMap.get(head1)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());'
+		    for(IdentifiedAnnotation ann : ents1){
+		      if(!(ann instanceof EntityMention || ann instanceof EventMention)){
+		        rmList.add(ann);
+		      }
+		    }
+		    for(IdentifiedAnnotation toRm : rmList){
+		      ents1.remove(toRm);
+		    }
+		    rmList.clear();
+		    List<IdentifiedAnnotation> ents2 = new ArrayList<>(coveringMap.get(head2)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
+		    for(IdentifiedAnnotation ann : ents2){
+		      if(!(ann instanceof EntityMention || ann instanceof EventMention)){
+		        rmList.add(ann);
+		      }
+		    }
+		    for(IdentifiedAnnotation toRm : rmList){
+		      ents2.remove(toRm);
+		    }
+		    
+		    if(ents1.size() == 0 && ents2.size() > 0){
+		      feats.add(new Feature("Arg1NoCui_Arg2Cui", true));
+		    }else if(ents1.size() > 0 && ents2.size() == 0){
+		      feats.add(new Feature("Arg1Cui_Arg2NoCui", true));		      
+		    }else if(ents1.size() == 0 && ents2.size() == 0){
+		      feats.add(new Feature("Arg1Arg2NoCui", true));
+		    }else{
+		      feats.add(new Feature("Arg1Arg2BothCui", true));
+		    }
+		    
+		    if((ents1.size() == 0 & ents2.size() > 0) ||
+		        (ents1.size() > 0 && ents2.size() == 0)){
+		      feats.add(new Feature("Arg1OrArg2NoCui", true));
+		    }
+		    
 		    for(IdentifiedAnnotation ent1 : ents1){
+	        HashSet<String> a1Tuis = new HashSet<>(); 
+		      String a1SemType = ent1.getClass().getSimpleName();
+		      feats.add(new Feature("Arg1SemType" + a1SemType, true));
+		      FSArray cons1 = ent1.getOntologyConceptArr();
+		      if(cons1 != null){
+		        for(int i = 0; i < cons1.size(); i++){
+		          if(cons1.get(i) instanceof UmlsConcept){
+		            a1Tuis.add(((UmlsConcept)cons1.get(i)).getTui());
+		          }
+		        }
+		      }
 		      for(IdentifiedAnnotation ent2 : ents2){
+		        HashSet<String> a2Tuis = new HashSet<>();
+		        String a2SemType = ent2.getClass().getSimpleName();
+	          feats.add(new Feature("Arg2SemType" + a2SemType, true));
 		        if(alias(ent1, ent2)){
 		          feats.add(new Feature("UMLS_ALIAS", true));
 		          break;
 		        }
+		        feats.add(new Feature("Arg1Arg2SemType" + a1SemType + "_" + a2SemType, true));
+		        
+		        FSArray cons2 = ent2.getOntologyConceptArr();
+		        if(cons2 != null){
+		          for(int i = 0; i < cons2.size(); i++){
+		            if(cons2.get(i) instanceof UmlsConcept){
+		              a2Tuis.add(((UmlsConcept)cons2.get(i)).getTui());
+		            }
+		          }
+		        }
+		        for(String tui1 : a1Tuis){
+		          feats.add(new Feature("Arg1Tui_" +  tui1, true));
+		          for(String tui2 : a2Tuis){
+		            feats.add(new Feature("Arg1Tui_" + tui1 + "_Arg2Tui_ " + tui2, true));
+		            if(tui1.equals(tui2)){
+		              feats.add(new Feature("Arg1Arg2TuiMatch", true));
+		            }
+		          }
+		        }
+		        for(String tui2 : a2Tuis){
+		          feats.add(new Feature("Arg2Tui_" + tui2, true));
+		        }
 		      }
 		    }
 		  }
@@ -45,11 +133,32 @@ public class UMLSFeatureExtractor implem
 		return feats;
 	}
 
-	public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){  
+	/**
+	 * Resolves an identifier for the current document: the value of
+	 * DocumentIDAnnotationUtil.getDocumentID when non-null, otherwise the string form of the view URI.
+	 *
+	 * @return the identifier, or null when neither lookup yields one
+	 */
+	private static String getDocId(JCas jcas) throws AnalysisEngineProcessException {
+	  String fromAnnotation = DocumentIDAnnotationUtil.getDocumentID(jcas);
+	  if(fromAnnotation != null){
+	    return fromAnnotation;
+	  }
+	  try{
+	    if(jcas.getView(ViewUriUtil.URI) != null){
+	      return ViewUriUtil.getURI(jcas).toString();
+	    }
+	  }catch(Exception ignored){
+	    // deliberate best effort: any failure while reading the URI view means "no id"
+	  }
+	  return null;
+	}
+
+  public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){  
 	  if(a1 != null && a2 != null){
 	    FSArray fsa = a1.getOntologyConceptArr();
 	    if(fsa != null){
-	      HashSet<String> cuis = new HashSet<String>();
+	      HashSet<String> cuis = new HashSet<>();
 	      for(int i = 0; i < fsa.size(); i++){
 	        if(fsa.get(i) instanceof UmlsConcept){
 	          cuis.add(((UmlsConcept)fsa.get(i)).getCui());

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,73 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.getGender;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isDefinite;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isDemonstrative;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.numberSingular;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Describes a candidate mention (demonstrative, definite, gender, number) and flags, per property,
+ * whether at least one cluster member ending at or before the mention's start has the same value.
+ * Mention-level features are named MC_MENTION_*, agreement features MC_AGREE_*.
+ */
+public class MentionClusterAgreementFeaturesExtractor implements RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> {
+
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    final String mentionText = mention.getCoveredText().toLowerCase();
+    final boolean mentionDem = isDemonstrative(mentionText);
+    final boolean mentionDef = isDefinite(mentionText);
+    final String mentionGender = getGender(mentionText);
+    final boolean mentionSingular = numberSingular(jCas, mention, mentionText);
+
+    // properties of the candidate mention itself
+    final List<Feature> out = new ArrayList<>();
+    out.add(new Feature("MC_MENTION_DEM", mentionDem));
+    out.add(new Feature("MC_MENTION_DEF", mentionDef));
+    out.add(new Feature("MC_MENTION_GENDER", mentionGender));
+    out.add(new Feature("MC_MENTION_NUMBER", mentionSingular));
+
+    // each flag latches on the first agreeing member; once set, its check is no longer evaluated
+    boolean demAgrees = false;
+    boolean defAgrees = false;
+    boolean genderAgrees = false;
+    boolean numberAgrees = false;
+
+    for(IdentifiedAnnotation member : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
+      if(member == null){
+        // report the unexpected null member and move on
+        System.err.println("Something that shouldn't happen has happened");
+        continue;
+      }
+      if(mention.getBegin() < member.getEnd()){
+        // during training a cluster can contain members that are actually
+        // subsequent to the candidate mention -- leave those out
+        continue;
+      }
+
+      final String memberText = member.getCoveredText().toLowerCase();
+      demAgrees = demAgrees || (isDemonstrative(memberText) == mentionDem);
+      defAgrees = defAgrees || (isDefinite(memberText) == mentionDef);
+      genderAgrees = genderAgrees || getGender(memberText).equals(mentionGender);
+      numberAgrees = numberAgrees || (numberSingular(jCas, member, memberText) == mentionSingular);
+    }
+
+    out.add(new Feature("MC_AGREE_DEM", demAgrees));
+    out.add(new Feature("MC_AGREE_DEF", defAgrees));
+    out.add(new Feature("MC_AGREE_GEN", genderAgrees));
+    out.add(new Feature("MC_AGREE_NUM", numberAgrees));
+
+    return out;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterPartOfSpeechFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterPartOfSpeechFeaturesExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterPartOfSpeechFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterPartOfSpeechFeaturesExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,22 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+public class MentionClusterPartOfSpeechFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> {
+
+  // Not implemented yet: returns an empty, mutable list (never null) so callers can use the result directly.
+  public List<Feature> extract(JCas jCas, CollectionTextRelation arg1,
+      IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+    // TODO implement part-of-speech features for the cluster/mention pair
+    return new java.util.ArrayList<Feature>();
+  }
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,56 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.*;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Counts, over the cluster members ending at or before the candidate mention's start, how many
+ * pass each string test against the mention; emits one feature per key recorded in the counter.
+ */
+public class MentionClusterStringFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    final String mentionText = mention.getCoveredText();
+    final Set<String> mentionWords = contentWords(mention);
+    final CounterMap<String> hits = new CounterMap<>();
+
+    for(IdentifiedAnnotation member : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
+      if(member == null){
+        System.err.println("Something that shouldn't happen has happened");
+        continue;
+      }
+      if(mention.getBegin() < member.getEnd()){
+        // during training a cluster can contain members subsequent to the candidate mention -- skip them
+        continue;
+      }
+      final String memberText = member.getCoveredText();
+      final Set<String> memberWords = contentWords(member);
+      if(mentionText.equalsIgnoreCase(memberText)){ hits.add("MC_STRING_EXACT"); }
+      if(startMatch(mentionText, memberText)){ hits.add("MC_STRING_START"); }
+      if(endMatch(mentionText, memberText)){ hits.add("MC_STRING_END"); }
+      if(soonMatch(mentionText, memberText)){ hits.add("MC_STRING_SOON"); }
+      if(wordOverlap(mentionWords, memberWords)){ hits.add("MC_OVERLAP"); }
+      if(wordSubstring(mentionWords, memberWords)){ hits.add("MC_SUB"); }
+    }
+
+    final List<Feature> feats = new ArrayList<>();
+    for(String name : hits.keySet()){
+      feats.add(new Feature(name, hits.get(name)));
+    }
+    return feats;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,69 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * Salience features for a markable: the position of its sentence within the covering paragraph and
+ * the class name of the longest covered event mention that has a positive type id.
+ */
+public class ClinicalFeatureExtractor implements FeatureExtractor1<Markable> {
+
+  @Override
+  public List<Feature> extract(JCas jcas, Markable markable){
+    List<Feature> out = new ArrayList<>();
+
+    List<Paragraph> paragraphs = JCasUtil.selectCovering(jcas, Paragraph.class, markable);
+    List<Sentence> sentences = JCasUtil.selectCovering(jcas, Sentence.class, markable);
+    Sentence ownSentence = DependencyUtility.getSentence(jcas, markable);
+
+    // position features are only produced when exactly one paragraph and one sentence cover the markable
+    if(paragraphs.size() == 1 && sentences.size() == 1){
+      List<Sentence> parSents = JCasUtil.selectCovered(Sentence.class, paragraphs.get(0));
+      int sentCount = parSents.size();
+      if(sentCount == 1){
+        // a paragraph consisting of a single sentence is treated as a header
+        out.add(new Feature("ClinIsHeader", true));
+      }else{
+        // locate the markable's sentence by identity; the index stays 0 when it is not found
+        int index = 0;
+        for(int i = 0; i < sentCount; i++){
+          if(parSents.get(i) == ownSentence){
+            index = i;
+            break;
+          }
+        }
+        // bucket the index using integer-division thresholds at one and two thirds of the count
+        String bucket = index < sentCount / 3 ? "ClinSentPosBegin"
+            : index > (2*sentCount / 3) ? "ClinSentPosEnd"
+            : "ClinSentPosMiddle";
+        out.add(new Feature(bucket, true));
+      }
+    }
+
+    // keep the longest covered event with a positive type id; on ties the earlier one is kept
+    EventMention widest = null;
+    for(EventMention candidate : JCasUtil.selectCovered(EventMention.class, markable)){
+      boolean typed = candidate.getTypeID() > 0;
+      if(typed && (widest == null
+          || (candidate.getEnd()-candidate.getBegin()) > (widest.getEnd()-widest.getBegin()))){
+        widest = candidate;
+      }
+    }
+    if(widest != null){
+      out.add(new Feature("ClinSemType" + widest.getClass().getSimpleName(), true));
+    }
+
+    return out;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,70 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/*
+ * Citations:
+ * Recasens, de Marneffe, Potts: The Life and Death of Discourse Entities: Identifying Singleton Mentions
+ * NAACL-HLT 2013 short paper, 627-633.
+ * 
+ * This class implements features in Table 3. Since the descriptions of the features are
+ * highly ambiguous (e.g., Sentence Position=End as well as Sentence Position=Last),
+ * I looked at the source code for the system to determine precisely how the features
+ * were defined.
+ * First, last means literally first or last token in sentence.
+ * Begin, middle, and end mean which third of the sentence is it in.
+ */
+public class GrammaticalRoleFeatureExtractor implements FeatureExtractor1<Markable> {
+
+  /**
+   * Builds sentence-position and dependency-relation features for the markable's nominal head node; empty when no head node is found.
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, Markable markable)
+      throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    if(head == null){
+      return feats;
+    }
+    Sentence sent = DependencyUtility.getSentence(jcas, markable);
+    List<ConllDependencyNode> sentNodes = DependencyUtility.getDependencyNodes(jcas, sent);
+    int numNodes = sentNodes.size()-1; // remove root whole sentence node
+    feats.add(new Feature("GrammaticalRoleSentencePositionFirst", head.getId() == 1));
+    feats.add(new Feature("GrammaticalRoleSentencePositionLast", head.getId() == numNodes));
+    if(head.getId() < (numNodes / 3)){
+      feats.add(new Feature("GrammaticalRoleSentencePositionBegin", true));
+    }else if(head.getId() > 2*(numNodes/3)){
+      feats.add(new Feature("GrammaticalRoleSentencePositionEnd", true));
+    }else{
+      feats.add(new Feature("GrammaticalRoleSentencePositionMiddle", true));
+    }
+
+    // literal-first equals keeps a head without a dependency label from throwing; it lands in "Other"
+    String deprel = head.getDeprel();
+    if("nsubj".equals(deprel)){
+      feats.add(new Feature("GrammaticalRoleRelSubj", true));
+    }else if("dobj".equals(deprel) || "iobj".equals(deprel)){
+      feats.add(new Feature("GrammaticalRoleRelVerbArg", true));
+    }else if("nn".equals(deprel)){
+      feats.add(new Feature("GrammaticalRoleRelNounArg", true));
+    }else if("root".equals(deprel)){
+      feats.add(new Feature("GrammaticalRoleRelRoot", true));
+    }else if("conj".equals(deprel)){
+      feats.add(new Feature("GrammaticalRoleRelConj", true));
+    }else{
+      feats.add(new Feature("GrammaticalRoleRelOther", true));
+    }
+    return feats;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,78 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/*
+ * Citations:
+ * Recasens, de Marneffe, Potts: The Life and Death of Discourse Entities: Identifying Singleton Mentions
+ * NAACL-HLT 2013 short paper, 627-633.
+ * 
+ * BBN corpus description (for the 18 named entity types)
+ * https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
+ * 
+ * This feature extractor is intended to implement the features in Table 2, described
+ * in the subsection of Section 3 called "Internal morphosyntax of the mention."
+ * Left off the table are the 18 NE types from CoNLL. Most of these are not relevant
+ * to our task, especially since we resolve person mentions with simple rules.
+ */
+public class MorphosyntacticFeatureExtractor implements FeatureExtractor1<Markable> {
+
+  /**
+   * Builds features describing the internal morphosyntax of the markable's nominal head node and of
+   * the nodes returned for it by getProgeny; returns an empty list when no head node is found.
+   * Tag, lemma and relation tests are null-safe so that a node lacking one of these values
+   * produces a false feature instead of a NullPointerException.
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, Markable markable)
+      throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    if(head == null){
+      return feats;
+    }
+    List<ConllDependencyNode> covering = DependencyUtility.getProgeny(head, DependencyUtility.getDependencyNodes(jcas, DependencyUtility.getSentence(jcas, markable)));
+    String headTag = head.getPostag();
+
+    // 2 conditions -- head is a pronoun POS tag (he, she, it) like PRP or PRP$
+    // or head is a determiner (This, that) that does not have a determiner dependency relation
+    // -- usually marked as nsubj or dobj when used as pronoun (This was..., discussed this with...)
+    // but would be "det" when used as in "this discussion"
+    boolean pronoun = head.getId() != 0 && headTag != null && (headTag.startsWith("PRP") ||
+        (headTag.equals("DT") && !"det".equals(head.getDeprel())));
+    feats.add(new Feature("MorphoIsPronoun", pronoun));
+
+    feats.add(new Feature("MorphoIsProper", "NNP".equals(headTag)));
+    // skip animacy and person features for now -- planning to not do person mentions
+    // replace singular/other with plural/other
+    feats.add(new Feature("MorphoPlural", "NNS".equals(headTag)));
+
+    boolean indefinite = false;
+    boolean containsNum = false;
+    for(ConllDependencyNode node : covering){
+      String tag = node.getPostag();
+      if("DT".equals(tag) && ("a".equals(node.getLemma()) || "an".equals(node.getLemma()))){
+        indefinite = true;
+      }
+      if("CD".equals(tag)){
+        containsNum = true;
+      }
+    }
+
+    feats.add(new Feature("MorphoIndefinite", indefinite));
+    feats.add(new Feature("MorphoNumeric", containsNum)); // lump together many NE types from OntoNotes (date, time, ordinal, percent, quantity)
+    feats.add(new Feature("MorphoNumModifiers", covering.size()-1));
+
+    return feats;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java?rev=1666500&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java Fri Mar 13 16:22:11 2015
@@ -0,0 +1,136 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+import com.google.common.collect.Sets;
+
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.*;
+
+public class SemanticEnvironmentFeatureExtractor implements
+    FeatureExtractor1<Markable> {
+
+  // this is a subset of the attitude verbs listed in White et al:
+  // Discovering classes of attitude verbs using subcategorization frame distributsion
+  // NELS 2012.
+  private static Set<String> propVerbs = 
+      Sets.newHashSet("allow", "believe", "bother", "demand", "deny", "doubt", "expect", "feel", "forbid", "guess", "hate", "hear", "hope", "imagine", "need", "promise", "realize", "remember", "said", "say", "see", "suppose", "tell", "think", "understand", "want", "worry");
+  
+  public List<Feature> extract(JCas jcas, Markable markable)
+      throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+    
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    if(head == null){
+      return feats;
+    }
+    Sentence sent = DependencyUtility.getSentence(jcas, markable);
+    List<ConllDependencyNode> sentNodes = DependencyUtility.getDependencyNodes(jcas, sent);
+    List<ConllDependencyNode> covering = DependencyUtility.getProgeny(head, sentNodes);
+    
+    List<EventMention> events = JCasUtil.selectCovered(jcas, EventMention.class, markable);
+    EventMention markableEvent = null;
+    for(EventMention event : events){
+      ConllDependencyNode eventHead = getNominalHeadNode(jcas, event);
+      if(eventHead == head){
+        if(markableEvent == null || (event.getEnd()-event.getBegin()) > (markableEvent.getEnd()-markableEvent.getBegin())){
+          markableEvent = event;
+        }
+      }
+    }
+    
+    boolean neg = false;
+    if(markableEvent != null){
+      neg = markableEvent.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT;
+      feats.add(new Feature("SemEnvNegation", neg));
+    }
+    
+    boolean modal = presenceOfModality(head, sentNodes);
+    feats.add(new Feature("SemEnvModality", modal));
+    
+    boolean underPropVerb = presenceOfAttitude(jcas, head);
+    feats.add(new Feature("SemEnvAttitude", underPropVerb));
+    
+    
+    // modal * pronoun, neg * pronoun
+    if(head.getPostag().startsWith("PRP") || (head.getPostag().equals("DT") && !head.getDeprel().equals("det"))){
+      feats.add(new Feature("SemEnvProTrueModal"+modal, true));
+      feats.add(new Feature("SemEnvProTrueNeg"+neg, true));
+      feats.add(new Feature("SemEnvProTrueAtt"+underPropVerb, true));
+    }else{
+      feats.add(new Feature("SemEnvProFalseModal"+modal, true));
+      feats.add(new Feature("SemEnvProFalseNeg"+neg, true));
+      feats.add(new Feature("SemEnvProFalseAtt"+underPropVerb, true));
+    }
+    
+    // modal * Proper noun
+    if(head.getPostag().equals("NNP")){
+      feats.add(new Feature("SemEnvProperTrueModal"+modal, true));
+      feats.add(new Feature("SemEnvProperTrueNeg"+neg, true));
+      feats.add(new Feature("SemEnvProperTrueAtt"+underPropVerb, true));
+    }else{
+      feats.add(new Feature("SemEnvProperFalseModal"+modal,true));
+      feats.add(new Feature("SemEnvProperFalseNeg"+neg, true));
+      feats.add(new Feature("SemEnvProperFalseAtt"+underPropVerb, true));
+    }
+    
+    boolean indefinite = false;
+    for(ConllDependencyNode node : covering){
+      if(node.getId() != 0 && (node.getPostag().equals("DT") && 
+          (node.getLemma().equals("a") || node.getLemma().equals("an")))){
+        indefinite = true;
+      }
+    }
+    feats.add(new Feature("Neg"+neg+"Indef"+indefinite, true));
+    
+    feats.add(new Feature("Neg"+neg+"Mods"+(covering.size()-1), true));
+    
+    return feats;
+  }
+
+  private static final boolean presenceOfModality(ConllDependencyNode head, List<ConllDependencyNode> sentNodes) {
+    boolean modal = false;
+    ConllDependencyNode vb = null;
+    
+    if(head.getHead() != null){
+      vb = head.getHead();
+      while(vb.getHead() != null && !vb.getPostag().startsWith("VB")){
+        vb = vb.getHead();
+      }
+      
+      for(ConllDependencyNode node : sentNodes){
+        if(node.getHead() == vb && node.getPostag().equals("MD")){
+          modal = true;
+          break;
+        }
+      }
+    }
+    return modal;
+  }
+
+  private static final boolean presenceOfAttitude(JCas jcas, ConllDependencyNode head){
+    boolean att = false;
+    
+    for(ConllDependencyNode cur : getPathToTop(jcas, head)){
+      if(propVerbs.contains(cur.getLemma())){
+        att = true;
+        break;
+      }
+    }
+    
+    return att;
+  }
+}



Mime
View raw message