ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1748736 [3/5] - in /ctakes/trunk/ctakes-coreference: ./ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/ae/features/ src/main/java/org/apache/ctakes/coreference/ae/features/cluster/ src/main/java...
Date Thu, 16 Jun 2016 14:51:51 GMT
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,169 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.struct.MapFactory;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+
+public class TokenFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> feats = new ArrayList<>();
+		
+		String s1 = arg1.getCoveredText().toLowerCase();
+		String s2 = arg2.getCoveredText().toLowerCase();
+		
+		boolean dem1 = isDemonstrative(s1);
+		boolean dem2 = isDemonstrative(s2);
+		
+		feats.add(new Feature("TOKEN_DEM1", dem1));
+		feats.add(new Feature("TOKEN_DEM2", dem2));
+		feats.add(new Feature("TOKEN_DEF1", isDefinite(s1)));
+		feats.add(new Feature("TOKEN_DEF2", isDefinite(s2)));
+		feats.add(new Feature("TOKEN_NUMAGREE",
+				numberSingular(jCas, arg1, s1) == numberSingular(jCas, arg2, s2)));
+
+		String gen1 = getGender(s1);
+		String gen2 = getGender(s2);
+		feats.add(new Feature("TOKEN_GEN1", gen1));
+		feats.add(new Feature("TOKEN_GEN2", gen2));
+		feats.add(new Feature("TOKEN_GENAGREE", gen1.equals(gen2)));
+		
+//		String p1 = getPerson(s1);
+//		String p2 = getPerson(s2);
+//		feats.add(new Feature("TOKEN_PERSON1", p1));
+//		feats.add(new Feature("TOKEN_PERSON2", p2));
+//		feats.add(new Feature("TOKEN_PERSONAGREE", p1.equals(p2)));
+//		feats.add(new Feature("TOKEN_PERSONPAIR", p1+"-"+p2));
+//		feats.add(new Feature("IS_TITLE1", isTitle(s1)));
+//		feats.add(new Feature("IS_TITLE2", isTitle(s2)));
+		
+//		feats.add(new Feature("IS_DOCTOR1", s1.startsWith("dr.")));
+//		feats.add(new Feature("IS_DOCTOR2", s2.startsWith("dr.")));
+//		feats.add(new Feature("BOTH_DOCTOR", s1.startsWith("dr.") && s2.startsWith("dr.")));
+		
+//		boolean a1IsHuman = false;
+//		boolean a2IsHuman = false;
+		
+		// if has some person (1st, 2nd, 3rd) or gender (masc., fem), is doctor
+//		a1IsHuman |= (!p1.equals("NONE"));
+//		a1IsHuman |= (!gen1.equals("NEUTER"));
+//		a1IsHuman |= (isTitle(s1));
+//		    
+//    a2IsHuman |= (!p2.equals("NONE"));
+//    a2IsHuman |= (!gen2.equals("NEUTER"));
+//    a2IsHuman |= (isTitle(s2));
+//		
+//		feats.add(new Feature("IS_HUMAN1", a1IsHuman));
+//		feats.add(new Feature("IS_HUMAN2", a2IsHuman));
+//		feats.add(new Feature("BOTH_HUMAN", a1IsHuman && a2IsHuman));
+//		feats.add(new Feature("NEITHER_HUMAN", !a1IsHuman && !a2IsHuman));
+		
+		// is it a section header?
+		List<BaseToken> nextToks = JCasUtil.selectFollowing(jCas, BaseToken.class, arg1, 1);
+		if(nextToks.size() > 0 && nextToks.get(0) instanceof NewlineToken){
+		  feats.add(new Feature("IS_HEADER1", true));
+		}
+		nextToks = JCasUtil.selectFollowing(jCas, BaseToken.class, arg2, 1);
+		if(nextToks.size() > 0 && nextToks.get(0) instanceof NewlineToken){
+		  feats.add(new Feature("IS_HEADER2", true));
+		}
+		return feats;
+	}
+	
+	/**
+	 * Tells whether a mention string begins with a demonstrative.
+	 *
+	 * @param s the mention text to inspect
+	 * @return {@code true} if {@code s} starts with "this", "that", "these" or "those"
+	 */
+	public static boolean isDemonstrative (String s) {
+		for (String prefix : new String[] {"this", "that", "these", "those"}) {
+			if (s.startsWith(prefix)) {
+				return true;
+			}
+		}
+		return false;
+	}
+	
+	/**
+	 * Tells whether a mention string opens with the definite article.
+	 *
+	 * @param s the mention text to inspect
+	 * @return {@code true} if {@code s} starts with "the " (article plus one space)
+	 */
+	public static boolean isDefinite (String s) {
+		final String article = "the ";
+		return s.startsWith(article);
+	}
+
+	// FYI - old code used treebanknode types and found head using head rules filled in by the parser
+	// not sure if there is an appreciable difference...
+	/**
+	 * Decides whether a mention counts as singular, based on the POS tag of its
+	 * head node as stored in the map obtained through {@code getKey(jcas)}.
+	 * <p>
+	 * Singular is the default: only a plural noun tag (NNS/NNPS), or the text
+	 * "we"/"they" on a head with some other tag, yields {@code false}.
+	 *
+	 * @param jcas the CAS whose head map is consulted
+	 * @param arg the annotation whose head is looked up
+	 * @param s1 the mention text; compared verbatim against "we" and "they"
+	 * @return {@code false} for plural mentions as described above, {@code true} otherwise
+	 */
+	public static boolean numberSingular(JCas jcas, Annotation arg, String s1){
+		ConllDependencyNode headNode = MapFactory.get(getKey(jcas), arg);
+		String tag = (headNode == null) ? null : headNode.getPostag();
+		if (tag == null) {
+			// NOTE(review): with no head (or no POS tag) even "we"/"they" are
+			// reported as singular -- confirm this is intended.
+			return true;
+		}
+		if ("NN".equals(tag) || "NNP".equals(tag)) {
+			return true;
+		}
+		if ("NNS".equals(tag) || "NNPS".equals(tag)) {
+			return false;
+		}
+		// any other tag: fall back to the two plural pronouns
+		return !(s1.equals("we") || s1.equals("they"));
+	}
+	
+	public static String getGender(String s1){
+	  if(s1.equals("he") || s1.equals("his") || s1.equals("him") || s1.startsWith("mr.")) return "MALE";
+	  else if(s1.equals("she") || s1.equals("her") || s1.startsWith("mrs.") || s1.startsWith("ms.")) return "FEMALE";
+	  else return "NEUTER";
+	}
+	
+	public static String getPerson(String s1){
+	  if(s1.equals("i") || s1.equals("my")) return "FIRST";
+	  else if(s1.equals("he") || s1.equals("she") || s1.equals("his") || s1.equals("her") || s1.equals("hers")){
+	    return "THIRD";
+	  }else if(s1.equals("you") || s1.equals("your")) return "SECOND";
+	  else if(s1.equals("we") || s1.equals("our")) return "FIRST_PLURAL";
+	  else if(s1.equals("they") || s1.equals("their")) return "THIRD_PLURAL";
+	  else return "NONE";
+	}
+	
+	/**
+	 * Animacy test; currently only the pronoun "i" is recognised.
+	 *
+	 * @param s1 the mention text to inspect
+	 * @return {@code true} only when {@code s1} is exactly "i"
+	 */
+	public static boolean getAnimate(String s1){
+		return s1.equals("i");
+	}
+	
+	/**
+	 * Tells whether a mention string starts with a personal title.
+	 *
+	 * @param s1 the mention text to inspect
+	 * @return {@code true} if {@code s1} starts with "dr.", "mr.", "mrs." or "ms."
+	 */
+	public static boolean isTitle(String s1){
+		for (String title : new String[] {"dr.", "mr.", "mrs.", "ms."}) {
+			if (s1.startsWith(title)) {
+				return true;
+			}
+		}
+		return false;
+	}
+	
+	/**
+	 * @param mention the annotation to inspect
+	 * @return {@code true} if the mention's polarity equals {@code CONST.NE_POLARITY_NEGATION_PRESENT}
+	 */
+	public static boolean isNegated(IdentifiedAnnotation mention){
+		return CONST.NE_POLARITY_NEGATION_PRESENT == mention.getPolarity();
+	}
+	
+	/**
+	 * @param mention the annotation to inspect
+	 * @return {@code true} if the mention's uncertainty equals {@code CONST.NE_UNCERTAINTY_PRESENT}
+	 */
+	public static boolean isUncertain(IdentifiedAnnotation mention){
+		return CONST.NE_UNCERTAINTY_PRESENT == mention.getUncertainty();
+	}
+	
+	/**
+	 * @param mention the annotation to inspect
+	 * @return {@code true} if the mention's generic attribute equals {@code CONST.NE_GENERIC_TRUE}
+	 */
+	public static boolean isGeneric(IdentifiedAnnotation mention){
+		return CONST.NE_GENERIC_TRUE == mention.getGeneric();
+	}
+	
+	/**
+	 * Tells whether the mention's subject attribute designates the patient.
+	 *
+	 * @param mention the annotation to inspect
+	 * @return {@code true} if the subject equals {@code CONST.ATTR_SUBJECT_PATIENT};
+	 *         {@code false} otherwise, including when no subject is set
+	 */
+	public static boolean isPatient(IdentifiedAnnotation mention){
+		// The subject is a String: compare by value. Reference comparison (==)
+		// is only true when both sides are the very same String instance.
+		return CONST.ATTR_SUBJECT_PATIENT.equals(mention.getSubject());
+	}
+	
+	/**
+	 * @param mention the annotation to inspect
+	 * @return {@code true} if the mention's history-of attribute equals {@code CONST.NE_HISTORY_OF_PRESENT}
+	 */
+	public static boolean isHistory(IdentifiedAnnotation mention){
+		return CONST.NE_HISTORY_OF_PRESENT == mention.getHistoryOf();
+	}
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,217 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.cleartk.ml.Feature;
+import org.cleartk.util.ViewUriUtil;
+
+public class UMLSFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  String docId = null;
+  Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> coveringMap = null;
+  
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> feats = new ArrayList<>();
+		
+		if(docId == null || !getDocId(jCas).equals(docId)){
+		  docId = getDocId(jCas);
+		  coveringMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+		}
+		
+		
+		if(arg1 instanceof Markable && arg2 instanceof Markable){
+//		  feats.add(new Feature("AntecedentSalience", arg1.getConfidence()));
+//		  feats.add(new Feature("AnaphorSalience", arg2.getConfidence()));
+		  
+		  // get the head of each markable
+		  ConllDependencyNode head1 = DependencyUtility.getNominalHeadNode(jCas, arg1);
+		  ConllDependencyNode head2 = DependencyUtility.getNominalHeadNode(jCas, arg2);
+		  List<IdentifiedAnnotation> rmList = new ArrayList<>();
+		  
+		  if(head1 != null && head2 != null){
+		    List<IdentifiedAnnotation> ents1 = new ArrayList<>(coveringMap.get(head1)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());'
+		    for(IdentifiedAnnotation ann : ents1){
+		      if(!(ann instanceof EntityMention || ann instanceof EventMention) || ann.getClass() == EventMention.class){
+		        rmList.add(ann);
+		      }
+		    }
+		    for(IdentifiedAnnotation toRm : rmList){
+		      ents1.remove(toRm);
+		    }
+		    rmList.clear();
+		    List<IdentifiedAnnotation> ents2 = new ArrayList<>(coveringMap.get(head2)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
+		    for(IdentifiedAnnotation ann : ents2){
+		      if(!(ann instanceof EntityMention || ann instanceof EventMention)|| ann.getClass() == EventMention.class){
+		        rmList.add(ann);
+		      }
+		    }
+		    for(IdentifiedAnnotation toRm : rmList){
+		      ents2.remove(toRm);
+		    }
+		    
+		    if(ents1.size() == 0 && ents2.size() > 0){
+		      feats.add(new Feature("Arg1NoCui_Arg2Cui", true));
+		    }else if(ents1.size() > 0 && ents2.size() == 0){
+		      feats.add(new Feature("Arg1Cui_Arg2NoCui", true));		      
+		    }else if(ents1.size() == 0 && ents2.size() == 0){
+		      feats.add(new Feature("Arg1Arg2NoCui", true));
+		    }else{
+		      feats.add(new Feature("Arg1Arg2BothCui", true));
+		    }
+		    
+		    if((ents1.size() == 0 & ents2.size() > 0) ||
+		        (ents1.size() > 0 && ents2.size() == 0)){
+		      feats.add(new Feature("Arg1OrArg2NoCui", true));
+		    }
+		    
+		    for(IdentifiedAnnotation ent1 : ents1){
+	        HashSet<String> a1Tuis = new HashSet<>(); 
+		      String a1SemType = ent1.getClass().getSimpleName();
+		      feats.add(new Feature("Arg1SemType" + a1SemType, true));
+		      FSArray cons1 = ent1.getOntologyConceptArr();
+		      if(cons1 != null){
+		        for(int i = 0; i < cons1.size(); i++){
+		          if(cons1.get(i) instanceof UmlsConcept){
+		            a1Tuis.add(((UmlsConcept)cons1.get(i)).getTui());
+		          }
+		        }
+		      }
+		      for(IdentifiedAnnotation ent2 : ents2){
+		        HashSet<String> a2Tuis = new HashSet<>();
+		        String a2SemType = ent2.getClass().getSimpleName();
+	          feats.add(new Feature("Arg2SemType" + a2SemType, true));
+		        if(alias(ent1, ent2)){
+		          feats.add(new Feature("UMLS_ALIAS", true));
+//		          break;
+		        }
+//		        if(!alias(ent1, ent2) && isHypernym(ent1, ent2)){
+//		          feats.add(new Feature("IS_HYPERNYM", true));
+//		        }
+//		        if(!alias(ent1, ent2) && isHyponym(ent1, ent2)){
+//		          feats.add(new Feature("IS_HYPONYM", true));
+//		        }
+		        feats.add(new Feature("Arg1Arg2SemType" + a1SemType + "_" + a2SemType, true));
+		        
+		        FSArray cons2 = ent2.getOntologyConceptArr();
+		        if(cons2 != null){
+		          for(int i = 0; i < cons2.size(); i++){
+		            if(cons2.get(i) instanceof UmlsConcept){
+		              a2Tuis.add(((UmlsConcept)cons2.get(i)).getTui());
+		            }
+		          }
+		        }
+		        for(String tui1 : a1Tuis){
+		          feats.add(new Feature("Arg1Tui_" +  tui1, true));
+		          for(String tui2 : a2Tuis){
+		            feats.add(new Feature("Arg1Tui_" + tui1 + "_Arg2Tui_ " + tui2, true));
+		            if(tui1.equals(tui2)){
+		              feats.add(new Feature("Arg1Arg2TuiMatch", true));
+		            }
+		          }
+		        }
+		        for(String tui2 : a2Tuis){
+		          feats.add(new Feature("Arg2Tui_" + tui2, true));
+		        }
+		      }
+		    }
+		  }
+		}
+		return feats;
+	}
+
+	/**
+	 * Resolves an identifier for the document held by the given CAS.
+	 * <p>
+	 * The id reported by {@code DocumentIDAnnotationUtil} is preferred. When
+	 * that is the "no document id" placeholder, the URI from the
+	 * {@code ViewUriUtil.URI} view is used instead, if it can be read.
+	 *
+	 * @param jcas the CAS to inspect
+	 * @return the document id, else the view URI as a string, else whatever
+	 *         {@code DocumentIDAnnotationUtil.getDocumentID} returned
+	 */
+	public static String getDocId(JCas jcas) {
+		String docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
+		// Compare by value: a reference check (!=) only works when the utility
+		// hands back the very same String instance as the constant.
+		if(!DocumentIDAnnotationUtil.NO_DOCUMENT_ID.equals(docId)){
+			return docId;
+		}
+
+		try{
+			if(jcas.getView(ViewUriUtil.URI) != null){
+				docId = ViewUriUtil.getURI(jcas).toString();
+			}
+		}catch(Exception ignored){
+			// best effort: the URI view is unavailable, so keep the
+			// placeholder id obtained above
+		}
+		return docId;
+	}
+
+  /**
+   * Tells whether two mentions share at least one UMLS CUI.
+   *
+   * @param a1 first mention; may be {@code null}
+   * @param a2 second mention; may be {@code null}
+   * @return {@code true} if some UmlsConcept of {@code a1} has the same CUI as
+   *         some UmlsConcept of {@code a2}; {@code false} otherwise, including
+   *         when either mention is {@code null} or has no concept array
+   */
+  public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+    if(a1 == null || a2 == null){
+      return false;
+    }
+    // The concept array may be unset (extract() null-checks it as well);
+    // guard here so that select() is never handed a null array.
+    FSArray cons1 = a1.getOntologyConceptArr();
+    FSArray cons2 = a2.getOntologyConceptArr();
+    if(cons1 == null || cons2 == null){
+      return false;
+    }
+    for(UmlsConcept concept1 : JCasUtil.select(cons1, UmlsConcept.class)){
+      String cui = concept1.getCui();
+      if(cui == null){
+        // a concept without a CUI cannot match anything
+        continue;
+      }
+      for(UmlsConcept concept2 : JCasUtil.select(cons2, UmlsConcept.class)){
+        if(cui.equals(concept2.getCui())){
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+  
+/*  
+  public static boolean isHypernym(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+    if(a1 != null && a2 != null){
+      for(UmlsConcept concept1 : JCasUtil.select(a1.getOntologyConceptArr(), UmlsConcept.class)){
+        for(UmlsConcept concept2 : JCasUtil.select(a2.getOntologyConceptArr(), UmlsConcept.class)){
+          if(GraphFunctions.isa(concept1.getCui(), concept2.getCui())){
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+  
+  public static boolean isHyponym(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+    return isHypernym(a2, a1);
+  }
+  
+  // returns distance in graph. For isa relation the distance will be positive and for
+  // inverse isa it will be negative, thus the absolute value comparisons.
+  public static int graphDistance(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+    int distance = Integer.MAX_VALUE;
+    
+    if(a1 != null && a2 != null){
+      for(UmlsConcept concept1 : JCasUtil.select(a1.getOntologyConceptArr(), UmlsConcept.class)){
+        String cui1 = concept1.getCui();
+        for(UmlsConcept concept2 : JCasUtil.select(a2.getOntologyConceptArr(), UmlsConcept.class)){
+          String cui2 = concept2.getCui();
+          int len = GraphFunctions.minDistance(cui1, cui2);
+          if(len < 0){
+            len = GraphFunctions.minDistance(cui2, cui1);
+            if(len < 0){
+              len = Integer.MAX_VALUE;
+            }else{
+              len = -len;
+            }
+          }
+          if(Math.abs(len) < Math.abs(distance)){
+            distance = len;
+          }
+        }
+      }
+    }
+    return distance;
+  }
+*/
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,90 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.getGender;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isDefinite;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isDemonstrative;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.numberSingular;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+public class MentionClusterAgreementFeaturesExtractor implements RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>, FeatureExtractor1<Markable> {
+
+  /**
+   * Checks whether any earlier member of the cluster agrees with the mention
+   * on demonstrative-ness, definiteness, gender and number.
+   * <p>
+   * Null members are reported on stderr and skipped; members that end after
+   * the mention begins are skipped as well. Each of the four features is
+   * {@code true} as soon as one remaining member agrees, and {@code false}
+   * when none does (including when no member remains).
+   *
+   * @param jCas the CAS view holding the annotations
+   * @param cluster the cluster whose members are compared
+   * @param mention the candidate mention
+   * @return the four {@code MC_AGREE_*} features, in the order DEM, DEF, GEN, NUM
+   * @throws AnalysisEngineProcessException declared by the extractor interface; not thrown directly here
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> features = new ArrayList<>();
+
+    // Fixed locale: the helpers match against ASCII literals, and the
+    // default-locale overload mis-maps 'I' under e.g. a Turkish locale.
+    String s = mention.getCoveredText().toLowerCase(java.util.Locale.ROOT);
+    boolean isDem = isDemonstrative(s);
+    boolean isDef = isDefinite(s);
+    String gender = getGender(s);
+    boolean singular = numberSingular(jCas, mention, s);
+
+    boolean matchDem = false;
+    boolean matchDef = false;
+    boolean matchGender = false;
+    boolean matchNumber = false;
+
+    for(IdentifiedAnnotation member : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
+      if(member == null){
+        System.err.println("Found an empty cluster member in agreement features extractor.");
+        continue;
+      }
+      if(mention.getBegin() < member.getEnd()){
+        // during training this might happen -- see a member of a cluster that
+        // is actually subsequent to the candidate mention
+        continue;
+      }
+      String m = member.getCoveredText().toLowerCase(java.util.Locale.ROOT);
+      // short-circuit: a flag that is already set is not re-evaluated
+      matchDem = matchDem || isDemonstrative(m) == isDem;
+      matchDef = matchDef || isDefinite(m) == isDef;
+      matchGender = matchGender || getGender(m).equals(gender);
+      matchNumber = matchNumber || numberSingular(jCas, member, m) == singular;
+    }
+
+    features.add(new Feature("MC_AGREE_DEM", matchDem));
+    features.add(new Feature("MC_AGREE_DEF", matchDef));
+    features.add(new Feature("MC_AGREE_GEN", matchGender));
+    features.add(new Feature("MC_AGREE_NUM", matchNumber));
+
+    return features;
+  }
+
+  /**
+   * Extracts the mention-only counterparts of the agreement features:
+   * demonstrative flag, definite flag, gender label and singular flag.
+   *
+   * @param jCas the CAS view holding the mention
+   * @param mention the markable to describe
+   * @return the four {@code MC_MENTION_*} features, in the order DEM, DEF, GENDER, NUMBER
+   * @throws CleartkExtractorException declared by the extractor interface; not thrown directly here
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, Markable mention) throws CleartkExtractorException {
+    List<Feature> features = new ArrayList<>();
+
+    // Fixed locale: the helpers match against ASCII literals, and the
+    // default-locale overload mis-maps 'I' under e.g. a Turkish locale.
+    String s = mention.getCoveredText().toLowerCase(java.util.Locale.ROOT);
+
+    features.add(new Feature("MC_MENTION_DEM", isDemonstrative(s)));
+    features.add(new Feature("MC_MENTION_DEF", isDefinite(s)));
+    features.add(new Feature("MC_MENTION_GENDER", getGender(s)));
+    features.add(new Feature("MC_MENTION_NUMBER", numberSingular(jCas, mention, s)));
+
+    return features;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,172 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isGeneric;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isHistory;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isNegated;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isPatient;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isUncertain;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+public class MentionClusterAttributeFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation>, FeatureExtractor1<Markable> {
+
+  /**
+   * Compares the mention's negation, uncertainty and timex status with the
+   * members of the cluster.
+   * <p>
+   * Members are visited in the order the cluster yields them, and the scan
+   * stops at the first member that begins after the mention ends. Negation and
+   * uncertainty agree only if every visited member has the same value as the
+   * mention. The timex feature compares the mention's timex status with
+   * whether any visited member is a timex.
+   *
+   * @param jCas the CAS view holding the annotations (not read directly here)
+   * @param cluster the cluster whose Markable members are compared
+   * @param mention the candidate mention
+   * @return MC_AGREE_NEG, MC_AGREE_UNC and MC_AGREE_TIMEX, in that order
+   * @throws AnalysisEngineProcessException declared by the extractor interface; not thrown directly here
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    final boolean mentionNegated = isNegated(mention);
+    final boolean mentionUnc = isUncertain(mention);
+    final boolean mentionTimex = isTimex(mention);
+
+    boolean negAgrees = true;
+    boolean uncAgrees = true;
+    boolean anyMemberTimex = false;
+
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      if(member.getBegin() > mention.getEnd()){
+        break;
+      }
+      negAgrees &= (mentionNegated == isNegated(member));
+      uncAgrees &= (mentionUnc == isUncertain(member));
+      anyMemberTimex |= isTimex(member);
+    }
+
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("MC_AGREE_NEG", negAgrees));
+    features.add(new Feature("MC_AGREE_UNC", uncAgrees));
+    // Disabled: MC_AGREE_GEN, MC_AGREE_SUBJ and MC_AGREE_HIST, which would be
+    // computed the same way from isGeneric, isPatient and isHistory.
+    features.add(new Feature("MC_AGREE_TIMEX", anyMemberTimex == mentionTimex));
+
+    // Disabled draft: check attributes like location/degree/negation/uncertainty
+    /*
+    Set<String> mentionSites = new HashSet<>();
+    
+    
+    if(mentionHead != null){
+      for(IdentifiedAnnotation annot : JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, mentionHead)){
+        LocationOfTextRelation rel = getLocation(annot);
+        if(rel != null){
+          AnatomicalSiteMention site = (AnatomicalSiteMention)rel.getArg2().getArgument();
+          for(UmlsConcept concept : JCasUtil.select(site.getOntologyConceptArr(), UmlsConcept.class)){
+            mentionSites.add(concept.getCui());
+          }
+        }
+      }
+    }
+
+    if(mentionSites.size() > 0){
+      Set<String> memberSites = new HashSet<>();
+      for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+        if(mention.getBegin() <= member.getBegin()) break;
+        ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jCas, member);
+        if(memberHead == null) continue;
+        
+        for(IdentifiedAnnotation annot : JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, memberHead)){
+          LocationOfTextRelation rel = getLocation(annot);
+          if(rel != null){
+            boolean conflict = true;
+            AnatomicalSiteMention site = (AnatomicalSiteMention)rel.getArg2().getArgument();
+            for(UmlsConcept concept : JCasUtil.select(site.getOntologyConceptArr(), UmlsConcept.class)){
+              memberSites.add(concept.getCui());
+              if(mentionSites.contains(concept.getCui())){
+                conflict = false;
+              }
+            }
+            if(conflict){
+              features.add(new Feature("MC_LOCATION_CONFLICT", true));
+            }
+          }
+        }
+      }
+    }
+    */
+    return features;
+  }
+
+  /**
+   * Extracts the mention-only attribute features: negated, uncertain,
+   * generic, patient subject, history-of and timex.
+   *
+   * @param view the CAS view holding the mention (not read directly here)
+   * @param mention the markable to describe
+   * @return the six {@code MC_MENTION_*} features, in the order listed above
+   * @throws CleartkExtractorException declared by the extractor interface; not thrown directly here
+   */
+  @Override
+  public List<Feature> extract(JCas view, Markable mention)
+      throws CleartkExtractorException {
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("MC_MENTION_NEGATED", isNegated(mention)));
+    features.add(new Feature("MC_MENTION_UNCERTAIN", isUncertain(mention)));
+    features.add(new Feature("MC_MENTION_GENERIC", isGeneric(mention)));
+    features.add(new Feature("MC_MENTION_PATIENT", isPatient(mention)));
+    features.add(new Feature("MC_MENTION_HISTORY", isHistory(mention)));
+    features.add(new Feature("MC_MENTION_TIMEX", isTimex(mention)));
+    return features;
+  }
+  
+  /** Returns true when at least one TimeMention is covered by the annotation's span. */
+  private static boolean isTimex(Annotation a){
+    return !JCasUtil.selectCovered(TimeMention.class, a).isEmpty();
+  }
+  
+  /**
+   * Returns the body-location relation of a procedure, disease/disorder or
+   * sign/symptom mention, or {@code null} for any other annotation type.
+   * Only referenced from the disabled location-conflict draft, hence the
+   * suppressed "unused" warning.
+   */
+  @SuppressWarnings("unused")
+  private static LocationOfTextRelation getLocation(IdentifiedAnnotation annot){
+    if(annot instanceof ProcedureMention){
+      return ((ProcedureMention)annot).getBodyLocation();
+    }
+    if(annot instanceof DiseaseDisorderMention){
+      return ((DiseaseDisorderMention)annot).getBodyLocation();
+    }
+    if(annot instanceof SignSymptomMention){
+      return ((SignSymptomMention)annot).getBodyLocation();
+    }
+    return null;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeVectorExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeVectorExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeVectorExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeVectorExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,22 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Placeholder extractor for cluster/mention attribute vectors. It is not
+ * implemented yet and contributes no features.
+ */
+public class MentionClusterAttributeVectorExtractor implements 
+  RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  /**
+   * Not implemented yet.
+   *
+   * @param jCas the CAS view (unused)
+   * @param arg1 the cluster (unused)
+   * @param arg2 the candidate mention (unused)
+   * @return an empty, mutable list, so that callers can iterate or merge the
+   *         result without a null check
+   * @throws AnalysisEngineProcessException declared by the extractor interface; never thrown here
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation arg1, IdentifiedAnnotation arg2)
+      throws AnalysisEngineProcessException {
+    // TODO implement; until then return an empty list rather than null
+    return new java.util.ArrayList<>();
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,77 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.MapFactory;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * Dependency-head features for mention/cluster coreference decisions.
+ * <p>
+ * Head nodes are looked up through {@code MapFactory} under the key returned by
+ * {@code getKey(jCas)}; an annotation with no head node stored there contributes
+ * no features.
+ */
+public class MentionClusterDepHeadExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation>, FeatureExtractor1<Markable> {
+
+  /**
+   * Emits {@code ClusterHeadMatchesMentionHead} when the lower-cased head word of
+   * the mention equals the head word of some cluster member and the mention is
+   * not a pronoun.
+   *
+   * @param jCas    the CAS being processed
+   * @param cluster the candidate cluster
+   * @param mention the candidate mention
+   * @return a list holding at most one feature
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> features = new ArrayList<>();
+
+    ConllDependencyNode mentionHead = MapFactory.get(getKey(jCas), mention);
+
+    // Collect the lower-cased head word of every member that has a head node.
+    Set<String> clusterHeadWords = new HashSet<>();
+    for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+      if(member.getBegin() > mention.getEnd()){
+        // stop at the first member that starts after the mention ends
+        break;
+      }
+      ConllDependencyNode memberHead = MapFactory.get(getKey(jCas), member);
+      if(memberHead == null){
+        continue;
+      }
+      clusterHeadWords.add(memberHead.getCoveredText().toLowerCase());
+    }
+
+    if(mentionHead == null){
+      return features;
+    }
+
+    String mentionHeadWord = mentionHead.getCoveredText().toLowerCase();
+    if(clusterHeadWords.contains(mentionHeadWord) && !StringMatchingFeatureExtractor.isPronoun(mention)){
+      features.add(new Feature("ClusterHeadMatchesMentionHead", true));
+    }
+    return features;
+  }
+
+  /**
+   * Emits {@code MentionRel}, the dependency relation label of the mention's head
+   * node, when a head node is available.
+   *
+   * @param jCas    the CAS being processed
+   * @param mention the mention to describe
+   * @return a list holding at most one feature
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, Markable mention) throws CleartkExtractorException {
+    List<Feature> features = new ArrayList<>();
+
+    ConllDependencyNode head = MapFactory.get(getKey(jCas), mention);
+    if(head == null){
+      return features;
+    }
+
+    features.add(new Feature("MentionRel", head.getDeprel()));
+    return features;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,130 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+
+/**
+ * Word-embedding similarity between the head word of a candidate mention and the
+ * head words of the members of a candidate cluster.
+ */
+public class MentionClusterDistSemExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  public static final double DEFAULT_SIM = 0.5;
+
+  private WordEmbeddings words = null;
+
+  /**
+   * Loads the embeddings from {@code org/apache/ctakes/coreference/distsem/mimic_vectors.txt}.
+   */
+  public MentionClusterDistSemExtractor() throws FileNotFoundException, IOException{
+    this("org/apache/ctakes/coreference/distsem/mimic_vectors.txt");
+  }
+
+  /**
+   * @param embeddingsPath location of the word-vector file, resolved through {@code FileLocator}
+   * @throws FileNotFoundException if the vector file cannot be located
+   * @throws IOException if reading or closing the vector file fails
+   */
+  public MentionClusterDistSemExtractor(String embeddingsPath) throws FileNotFoundException, IOException{
+    // Close the stream once the embeddings are loaded instead of leaving it open.
+    try(java.io.InputStream vectorStream = FileLocator.getAsStream(embeddingsPath)){
+      words = WordVectorReader.getEmbeddings(vectorStream);
+    }
+  }
+
+  /**
+   * Emits {@code HEAD_SIMILARITY_WORD2VEC}: the largest embedding similarity between
+   * the mention's head word and the head word of any cluster member preceding the
+   * mention. The value is 0.0 when some member has exactly the same head word, and
+   * when no similarity could be computed.
+   *
+   * @param jCas    the CAS being processed
+   * @param cluster the candidate cluster
+   * @param mention the candidate mention
+   * @return an empty list for pronoun mentions, otherwise a single-feature list
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    // first, do not bother with pronouns:
+    if(StringMatchingFeatureExtractor.isPronoun(mention)) return feats;
+
+    double maxSim = 0.0;
+    // Only feeds the disabled PHRASE_SIMILARITY_WORD2VEC feature below.
+    double maxPhraseSim = 0.0;
+
+    ConllDependencyNode mentionNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+
+    double[] mentionVec = getPhraseVec(mention);
+    boolean exactMatch = false;
+
+    String mentionHead = mentionNode != null ? mentionNode.getCoveredText().toLowerCase() : null;
+    if(mentionHead != null){
+      for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+        if(mention.getBegin() < member.getEnd()){
+          // during training this might happen -- see a member of a cluster that
+          // is actually subsequent to the candidate mention
+          break;
+        }
+
+        double[] memberVec = getPhraseVec(member);
+
+        // both vectors are unit length (or all zero), so the dot product is the cosine
+        double phraseSim = 0.0;
+        for(int i = 0; i < memberVec.length; i++){
+          phraseSim += (mentionVec[i] * memberVec[i]);
+        }
+        if(phraseSim > maxPhraseSim){
+          maxPhraseSim = phraseSim;
+        }
+
+        ConllDependencyNode memberNode = DependencyUtility.getNominalHeadNode(jCas, member);
+        String memberHead = memberNode != null ? memberNode.getCoveredText().toLowerCase() : null;
+        if(mentionHead.equals(memberHead)){
+          exactMatch = true;
+        }
+        if(memberNode != null && words.containsKey(memberHead) && words.containsKey(mentionHead)){
+          double sim = words.getSimilarity(mentionHead, memberHead);
+          if(sim > maxSim){
+            maxSim = sim;
+          }
+        }
+      }
+    }
+    // An identical head word resets the similarity to zero.
+    if(exactMatch){
+      maxSim = 0.0;
+    }
+
+    feats.add(new Feature("HEAD_SIMILARITY_WORD2VEC", maxSim));
+//    feats.add(new Feature("PHRASE_SIMILARITY_WORD2VEC", maxPhraseSim));
+
+    return feats;
+  }
+
+  /**
+   * Sums the embedding vectors of all covered tokens found in the vocabulary and
+   * scales the sum to unit length.
+   *
+   * @param annotation the span whose tokens are summed
+   * @return a unit-length vector, or an all-zero vector when no covered token has an embedding
+   */
+  private double[] getPhraseVec(Annotation annotation){
+    double[] phraseVec = new double[words.getDimensionality()];
+
+    for(BaseToken token : JCasUtil.selectCovered(BaseToken.class, annotation)){
+      String word = token.getCoveredText().toLowerCase();
+      if(words.containsKey(word)){
+        WordVector vec = words.getVector(word);
+        for(int i = 0; i < phraseVec.length; i++){
+          phraseVec[i] += vec.getValue(i);
+        }
+      }
+    }
+
+    // normalize vector: the norm must come from the summed vector only; nothing
+    // from the per-token loop above may leak into it
+    double sumOfSquares = 0.0;
+    for(int i = 0; i < phraseVec.length; i++){
+      double val = phraseVec[i];
+      sumOfSquares += (val * val);
+    }
+    double vecLength = Math.sqrt(sumOfSquares);
+
+    if(vecLength > 0.0){
+      for(int i = 0; i < phraseVec.length; i++){
+        phraseVec[i] /= vecLength;
+      }
+    }
+
+    return phraseVec;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,46 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Distance features between a candidate mention and the closest member of a
+ * candidate cluster, measured in tokens, markables and sentences.
+ */
+public class MentionClusterDistanceFeaturesExtractor
+    implements RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  /**
+   * For every cluster member, counts the {@code BaseToken}, {@code Markable} and
+   * {@code Sentence} annotations lying between the member and the mention, keeps
+   * the minimum of each count over all members, and emits the three minima
+   * divided by fixed constants (4000, 900 and 350).
+   *
+   * @param jCas    the CAS being processed
+   * @param cluster the candidate cluster
+   * @param mention the candidate mention
+   * @return the three distance features, in token / markable / sentence order
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster, IdentifiedAnnotation mention)
+      throws AnalysisEngineProcessException {
+    int closestByTokens = Integer.MAX_VALUE;
+    int closestByMarkables = Integer.MAX_VALUE;
+    int closestBySentences = Integer.MAX_VALUE;
+
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      int tokensBetween = JCasUtil.selectBetween(BaseToken.class, member, mention).size();
+      if(tokensBetween < closestByTokens){
+        closestByTokens = tokensBetween;
+      }
+
+      int markablesBetween = JCasUtil.selectBetween(Markable.class, member, mention).size();
+      if(markablesBetween < closestByMarkables){
+        closestByMarkables = markablesBetween;
+      }
+
+      int sentencesBetween = JCasUtil.selectBetween(Sentence.class, member, mention).size();
+      if(sentencesBetween < closestBySentences){
+        closestBySentences = sentencesBetween;
+      }
+    }
+
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("MinTokenDistance", closestByTokens / 4000.0));
+    features.add(new Feature("MinMarkableDistance", closestByMarkables / 900.0));
+    features.add(new Feature("MinSentDistance", closestBySentences / 350.0));
+    return features;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,99 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.extractors.ContinuousTextExtractor;
+import org.apache.ctakes.relationextractor.ae.features.DependencyTreeFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.FirstCovered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.LastCovered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.extractor.NamingExtractor1;
+import org.cleartk.ml.feature.extractor.TypePathExtractor;
+
+public class MentionClusterMentionFeaturesExtractor implements FeatureExtractor1<Markable> {
+
+  private FeatureExtractor1<BaseToken> coveredText = new CoveredTextExtractor<>();
+  private FeatureExtractor1<Markable> tokenIdentityContext = new CleartkExtractor<>(
+      BaseToken.class,
+      coveredText,
+      new FirstCovered(1),
+      new LastCovered(1),
+      new Bag(new Covered()),
+      new Preceding(3),
+      new Following(3));
+  
+  private FeatureExtractor1<BaseToken> continuousText = null;
+  private FeatureExtractor1<Markable> tokenVectorContext = null;      
+
+  private FeatureExtractor1<BaseToken> pos = new TypePathExtractor<>(BaseToken.class, "partOfSpeech");
+
+  /**
+   * All part-of-speech tags of the mention as a bag
+   */
+  private FeatureExtractor1<Markable> tokenPOS = new CleartkExtractor<>(
+      BaseToken.class,
+      pos,
+      new Bag(new Covered()));
+
+  /**
+   * All extractors for mention 1, with features named to distinguish them from mention 2
+   */
+  private FeatureExtractor1<Markable> mentionFeaturesExtractor = new NamingExtractor1<>(
+      "mention1pos",
+      tokenPOS);
+
+  public MentionClusterMentionFeaturesExtractor() throws CleartkExtractorException{
+    this(null);
+  }
+  
+  public MentionClusterMentionFeaturesExtractor(String vectorFile) throws CleartkExtractorException {
+    if(vectorFile != null){
+      this.continuousText = new ContinuousTextExtractor(vectorFile);
+      this.tokenVectorContext = new CleartkExtractor<>(
+          BaseToken.class,
+          continuousText,
+          new FirstCovered(1),
+          new LastCovered(1),
+//          new Bag(new Covered()),
+          new Preceding(1),
+          new Following(1));
+    }
+  }
+  
+  @Override
+  public List<Feature> extract(JCas view, Markable focusAnnotation) throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+    
+    // token features:
+    feats.addAll(tokenIdentityContext.extract(view, focusAnnotation));
+    
+    
+    // token vector features:
+//    if(this.tokenVectorContext != null){
+//      feats.addAll(this.tokenVectorContext.extract(view, focusAnnotation));
+//    }
+    
+    // pos features:
+    feats.addAll(mentionFeaturesExtractor.extract(view, focusAnnotation));
+    
+    // Always do num covered and dep features
+    feats.add(new Feature("NumCoveredTokens", JCasUtil.selectCovered(BaseToken.class, focusAnnotation).size()));
+    feats.addAll(DependencyTreeFeaturesExtractor.extractForNode(view, focusAnnotation, "dep"));
+    
+    return feats;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,50 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * Salience features read from the {@code confidence} value of markables.
+ */
+public class MentionClusterSalienceFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation>, FeatureExtractor1<Markable> {
+
+  /**
+   * Emits {@code MC_MAX_SALIENCE}: the largest confidence value among the cluster
+   * members visited before the first member that ends after the mention begins
+   * (0.0 when none exceeds zero).
+   *
+   * @param jCas    the CAS being processed
+   * @param cluster the candidate cluster
+   * @param mention the candidate mention
+   * @return a single-feature list
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    double highest = 0.0;
+    for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+      boolean memberEndsAfterMentionStart = member.getEnd() > mention.getBegin();
+      if(memberEndsAfterMentionStart){
+        // can happen during training: this member is actually subsequent to
+        // the candidate mention, so stop scanning
+        break;
+      }
+      double salience = member.getConfidence();
+      if(salience > highest){
+        highest = salience;
+      }
+    }
+
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("MC_MAX_SALIENCE", highest));
+    return features;
+  }
+
+  /**
+   * Emits {@code MC_MENTION_SALIENCE}: the confidence value of the mention itself.
+   *
+   * @param jCas    the CAS being processed
+   * @param mention the mention to describe
+   * @return a single-feature list
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, Markable mention) throws CleartkExtractorException {
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("MC_MENTION_SALIENCE", mention.getConfidence()));
+    return features;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,102 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * Section-header features. A paragraph covering exactly one sentence is treated
+ * as a section header.
+ */
+public class MentionClusterSectionFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation>, FeatureExtractor1<Markable> {
+
+  /**
+   * Emits {@code AnteInHeader} (whether some cluster member lies inside a header
+   * paragraph that does not start after the mention ends) and, when the paragraph
+   * immediately before the mention's paragraph is such a header,
+   * {@code AnteHeaderHeadsAna}.
+   *
+   * @param jcas    the CAS being processed
+   * @param cluster the candidate cluster
+   * @param mention the candidate mention (the anaphor)
+   * @return one or two features
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    // indices of header paragraphs that contain a member of the cluster
+    Set<Integer> headerParsWithMember = new HashSet<>();
+    int mentionParIndex = -1;
+
+    List<Paragraph> paragraphs = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+    for(int parIndex = 0; parIndex < paragraphs.size(); parIndex++){
+      Paragraph par = paragraphs.get(parIndex);
+
+      // remember which paragraph holds the anaphor
+      if(par.getBegin() <= mention.getBegin() && mention.getEnd() <= par.getEnd()){
+        mentionParIndex = parIndex;
+      }
+
+      if(par.getBegin() > mention.getEnd()){
+        break;
+      }
+
+      // only single-sentence paragraphs count as headers
+      List<Sentence> sentences = JCasUtil.selectCovered(Sentence.class, par);
+      if(sentences != null && sentences.size() == 1 && hasMemberInside(cluster, par)){
+        headerParsWithMember.add(parIndex);
+      }
+    }
+
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("AnteInHeader", !headerParsWithMember.isEmpty()));
+    if(headerParsWithMember.contains(mentionParIndex - 1)){
+      features.add(new Feature("AnteHeaderHeadsAna", true));
+    }
+    return features;
+  }
+
+  /** Tells whether any member of the cluster lies entirely inside the paragraph. */
+  private static boolean hasMemberInside(CollectionTextRelation cluster, Paragraph par){
+    for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+      if(member.getBegin() >= par.getBegin() && member.getEnd() <= par.getEnd()){
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Emits {@code AnaInHeader}: whether the mention lies inside a header paragraph.
+   *
+   * @param jcas    the CAS being processed
+   * @param mention the mention to describe
+   * @return a single-feature list
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, Markable mention) throws CleartkExtractorException {
+    boolean mentionInHeader = false;
+    int mentionParIndex = -1;
+
+    List<Paragraph> paragraphs = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+    for(int parIndex = 0; parIndex < paragraphs.size(); parIndex++){
+      Paragraph par = paragraphs.get(parIndex);
+      if(par.getBegin() > mention.getEnd()){
+        break;
+      }
+      // remember which paragraph holds the anaphor
+      if(par.getBegin() <= mention.getBegin() && mention.getEnd() <= par.getEnd()){
+        mentionParIndex = parIndex;
+      }
+      List<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, par);
+      boolean isHeader = sentences != null && sentences.size() == 1;
+      if(isHeader && mentionParIndex == parIndex){
+        mentionInHeader = true;
+        break;
+      }
+    }
+
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("AnaInHeader", mentionInHeader));
+    return features;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,73 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Selectional-preference feature for the pronouns "this", "it" and "that": how
+ * likely the semantic types found in a cluster are, given the pronoun's
+ * governing word and dependency relation.
+ * <p>
+ * Probabilities are read from {@code org/apache/ctakes/coreference/pref_probs.txt},
+ * one tab-separated record per line: context key, semantic type, probability.
+ */
+public class MentionClusterSemTypeDepPrefsFeatureExtractor implements RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> {
+
+  /** context key ("governor::deprel") -> semantic type name -> probability */
+  private Map<String,Map<String,Double>> probs = new HashMap<>();
+
+  /**
+   * Loads the preference table.
+   *
+   * @throws FileNotFoundException if the table cannot be located
+   */
+  public MentionClusterSemTypeDepPrefsFeatureExtractor() throws FileNotFoundException {
+    try(Scanner scanner = new Scanner(FileLocator.getAsStream("org/apache/ctakes/coreference/pref_probs.txt"))){
+      while(scanner.hasNextLine()){
+        String line = scanner.nextLine().trim();
+        String[] parts = line.split("\t");
+        if(parts.length < 3){
+          // blank or incomplete line: skip it rather than fail on parts[1]/parts[2]
+          continue;
+        }
+        Map<String,Double> semProbs = probs.get(parts[0]);
+        if(semProbs == null){
+          semProbs = new HashMap<>();
+          probs.put(parts[0], semProbs);
+        }
+        semProbs.put(parts[1], Double.parseDouble(parts[2]));
+      }
+    }
+  }
+
+  /**
+   * Emits {@code InferredSemTypeMaxProb} for the pronouns "this", "it" and "that":
+   * the highest table probability over the classes of the annotations covering
+   * any cluster member that ends at or before the mention's start.
+   *
+   * @param jcas    the CAS being processed
+   * @param cluster the candidate cluster
+   * @param mention the candidate mention
+   * @return a single-feature list, or an empty list when the mention is not one
+   *         of the three pronouns, has no head node or governor, or its context
+   *         is not in the table
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    double maxProb = 0.0;
+    String mentionText = mention.getCoveredText().toLowerCase();
+
+    if(mentionText.equals("this") || mentionText.equals("it") || mentionText.equals("that")){
+      ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, mention);
+      // No head node, or a head node without a governor, gives no context key;
+      // treat it like an unknown context.
+      if(head == null || head.getHead() == null) return feats;
+
+      String key = head.getHead().getCoveredText().toLowerCase() + "::" + head.getDeprel();
+      Map<String,Double> semProbs = probs.get(key);
+      if(semProbs == null) return feats;
+
+      for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+        if(mention.getBegin() < m.getEnd()){
+          // during training this might happen -- see a member of a cluster that
+          // is actually subsequent to the candidate mention
+          continue;
+        }
+        List<IdentifiedAnnotation> ents = JCasUtil.selectCovering(jcas, IdentifiedAnnotation.class, m);
+        for(IdentifiedAnnotation ent : ents){
+          // the simple class name of the annotation is the semantic-type key
+          String semKey = ent.getClass().getSimpleName();
+          if(semProbs.containsKey(semKey)){
+            double prob = semProbs.get(semKey);
+            if(prob > maxProb) maxProb = prob;
+          }
+        }
+      }
+      feats.add(new Feature("InferredSemTypeMaxProb", maxProb));
+    }
+    return feats;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,58 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+
+/**
+ * "Stack position" features: how many other clusters have a most-recent member
+ * that ends after the candidate cluster's most-recent member.
+ */
+public class MentionClusterStackFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  /**
+   * Emits {@code ClusterStackPositionInclSingleton} and {@code ClusterStackPosition},
+   * each computed as {@code 1 + log10(count + 1)}. The first counts every
+   * intervening cluster, the second only those with more than one member.
+   * <p>
+   * When the candidate cluster has no most-recent member relative to the mention
+   * (or an empty member list), every other cluster that does have one is counted
+   * as intervening.
+   *
+   * @param jCas    the CAS being processed
+   * @param cluster the candidate cluster
+   * @param mention the candidate mention
+   * @return the two stack-position features
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+
+    // Cluster-size features were tried here and didn't work.
+
+    Annotation clusterMostRecent = mostRecentMember(cluster, mention);
+    // -1 sorts below every real offset, so a cluster with nothing recent sits
+    // at the bottom of the stack instead of causing a NullPointerException.
+    int clusterRecency = clusterMostRecent != null ? clusterMostRecent.getEnd() : -1;
+    int numIntervening = 0;
+    int numNonSingletonIntervening = 0;
+
+    // this feature is how far down the current cluster is on the stack -- to calculate it
+    // we go over all other clusters in the cas, look at the most recent element, and
+    // see if it is more recent than the current cluster under consideration
+    for(CollectionTextRelation otherCluster : JCasUtil.select(jCas, CollectionTextRelation.class)){
+      if(otherCluster == cluster) continue;
+      // an empty member list has no most-recent element and cannot intervene
+      if(!(otherCluster.getMembers() instanceof NonEmptyFSList)) continue;
+
+      NonEmptyFSList otherMembers = (NonEmptyFSList)otherCluster.getMembers();
+      Annotation otherMostRecent = ClusterUtils.getMostRecent(otherMembers, mention);
+      if(otherMostRecent != null && otherMostRecent.getEnd() > clusterRecency){
+        numIntervening++;
+        if(ClusterUtils.getSize(otherMembers) > 1){
+          numNonSingletonIntervening++;
+        }
+      }
+    }
+
+    feats.add(new Feature("ClusterStackPositionInclSingleton", 1 + Math.log10(numIntervening+1)));
+    feats.add(new Feature("ClusterStackPosition", 1 + Math.log10(numNonSingletonIntervening+1)));
+    return feats;
+  }
+
+  /**
+   * Returns the cluster's most recent member relative to the mention, or
+   * {@code null} when the member list is empty or no such member exists.
+   */
+  private static Annotation mostRecentMember(CollectionTextRelation cluster, IdentifiedAnnotation mention){
+    if(cluster.getMembers() instanceof NonEmptyFSList){
+      return ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+    }
+    return null;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,113 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.contentWords;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.endMatch;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.soonMatch;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.startMatch;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.wordOverlap;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.wordSubstring;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.ctakes.utils.struct.MapFactory;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Emits string-matching features between a candidate mention and those members of a
+ * cluster whose dependency head word is the same as the mention's head word.
+ */
+public class MentionClusterStringFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  /**
+   * Compares the mention against each usable cluster member and reports which string
+   * matches fired at least once, plus the largest non-head word difference observed.
+   *
+   * @param jCas    CAS used to look up the dependency head of each annotation
+   * @param cluster cluster whose members are compared with the mention
+   * @param mention candidate mention; pronoun mentions yield no features
+   * @return the features; empty for pronoun mentions. MC_MAX_NONOVERLAP is only
+   *         present when a head node was found for the mention.
+   */
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> features = new ArrayList<>();
+    CounterMap<String> firedMatches = new CounterMap<>();
+
+    if(StringMatchingFeatureExtractor.isPronoun(mention)){
+      return features;
+    }
+
+    String mentionText = mention.getCoveredText();
+    Set<String> mentionWords = contentWords(mention);
+    Set<String> mentionModifiers = new HashSet<>(mentionWords);
+    ConllDependencyNode mentionHead = MapFactory.get(getKey(jCas), mention);
+
+    if(mentionHead != null){
+      String mentionHeadWord = mentionHead.getCoveredText().toLowerCase();
+      mentionModifiers.remove(mentionHeadWord);
+
+      int largestDifference = 0;
+
+      for(IdentifiedAnnotation member : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
+        if(member == null){
+          System.err.println("Something that shouldn't happen has happened");
+          continue;
+        }
+        // During training a cluster can contain members that end after the candidate
+        // mention begins; those are skipped, as are pronoun members.
+        boolean memberEndsAfterMentionStart = mention.getBegin() < member.getEnd();
+        if(memberEndsAfterMentionStart || StringMatchingFeatureExtractor.isPronoun(member)){
+          continue;
+        }
+
+        String memberText = member.getCoveredText();
+        Set<String> memberWords = contentWords(member);
+        Set<String> memberModifiers = new HashSet<>(memberWords);
+        ConllDependencyNode memberHead = MapFactory.get(getKey(jCas), member);
+        if(memberHead == null){
+          continue;
+        }
+
+        String memberHeadWord = memberHead.getCoveredText().toLowerCase();
+        memberModifiers.remove(memberHeadWord);
+        if(!mentionHeadWord.equals(memberHeadWord)){
+          // string features are only computed for members sharing the head word
+          continue;
+        }
+
+        if(mentionText.equalsIgnoreCase(memberText)){
+          firedMatches.add("MC_STRING_EXACT");
+        }
+        if(startMatch(mentionText, memberText)){
+          firedMatches.add("MC_STRING_START");
+        }
+        if(endMatch(mentionText, memberText)){
+          firedMatches.add("MC_STRING_END");
+        }
+        if(soonMatch(mentionText, memberText)){
+          firedMatches.add("MC_STRING_SOON");
+        }
+        if(wordOverlap(mentionWords, memberWords)){
+          firedMatches.add("MC_OVERLAP");
+        }
+        if(wordSubstring(mentionWords, memberWords)){
+          firedMatches.add("MC_SUB");
+        }
+
+        largestDifference = Math.max(largestDifference,
+            wordNonOverlapCount(memberModifiers, mentionModifiers));
+      }
+      features.add(new Feature("MC_MAX_NONOVERLAP", largestDifference));
+    }
+
+    // Each match type is emitted as a boolean, regardless of how many members fired it.
+    for(String matchName : firedMatches.keySet()){
+      features.add(new Feature(matchName, true));
+    }
+    return features;
+  }
+
+  /**
+   * Counts the words that occur in exactly one of the two sets.
+   *
+   * @param w1 first word set
+   * @param w2 second word set
+   * @return number of words in w1 missing from w2 plus words in w2 missing from w1
+   */
+  public static int wordNonOverlapCount(Set<String> w1, Set<String> w2){
+    return countAbsent(w1, w2) + countAbsent(w2, w1);
+  }
+
+  /** Counts the elements of {@code words} that {@code reference} does not contain. */
+  private static int countAbsent(Set<String> words, Set<String> reference){
+    int absent = 0;
+    for(String word : words){
+      if(!reference.contains(word)){
+        absent++;
+      }
+    }
+    return absent;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,212 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+import static org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor.alias;
+import static org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor.getDocId;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.MapFactory;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+public class MentionClusterUMLSFeatureExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation>, FeatureExtractor1<Markable> {
+
+  String docId = null;
+  Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> coveringMap = null;
+
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    Set<String> trueFeats = new HashSet<>();
+    
+    if(docId == null || !getDocId(jCas).equals(docId)){
+      docId = getDocId(jCas);
+      coveringMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    }
+    
+    ConllDependencyNode head = MapFactory.get(getKey(jCas), mention);
+    
+    if(head != null){
+      List<IdentifiedAnnotation> rmList = new ArrayList<>();
+      // get the entities covering this markable:
+      List<IdentifiedAnnotation> mentionEnts = new ArrayList<>(coveringMap.get(head)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());'
+      for(IdentifiedAnnotation ann : mentionEnts){
+        if(!(ann instanceof EntityMention || ann instanceof EventMention) || ann.getClass() == EventMention.class){
+          rmList.add(ann);
+        }
+      }
+      for(IdentifiedAnnotation toRm : rmList){
+        mentionEnts.remove(toRm);
+      }
+      
+      Set<IdentifiedAnnotation> clusterEnts = new HashSet<>();
+      for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+        ConllDependencyNode memberHead = MapFactory.get(getKey(jCas), member);
+        rmList.clear();
+        // get the named entities covering this cluster member:
+        List<IdentifiedAnnotation> ents2 = new ArrayList<>(coveringMap.get(memberHead)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
+        for(IdentifiedAnnotation ann : ents2){
+          if(!(ann instanceof EntityMention || ann instanceof EventMention) || ann.getClass() == EventMention.class){
+            rmList.add(ann);
+          }
+        }
+        for(IdentifiedAnnotation toRm : rmList){
+          ents2.remove(toRm);
+        }
+        
+        clusterEnts.addAll(ents2);
+      }
+      
+      if(clusterEnts.size() == 0 && mentionEnts.size() > 0){
+        trueFeats.add("ClusterNoCui_MentionCui");
+      }else if(clusterEnts.size() > 0 && mentionEnts.size() == 0){
+        trueFeats.add("ClusterCui_MentionNoCui");          
+      }else if(clusterEnts.size() == 0 && mentionEnts.size() == 0){
+        trueFeats.add("ClusterMentionNoCui");
+      }else{
+        trueFeats.add("ClusterMentionBothCui");
+      }
+      
+      if((clusterEnts.size() == 0 & mentionEnts.size() > 0) ||
+          (clusterEnts.size() > 0 && mentionEnts.size() == 0)){
+        trueFeats.add("ClusterOrMentionNoCui");
+      }
+      
+//      int minDistance = Integer.MAX_VALUE;
+      for(IdentifiedAnnotation ent1 : clusterEnts){
+        HashSet<String> a1Tuis = new HashSet<>(); 
+        String a1SemType = ent1.getClass().getSimpleName();
+        trueFeats.add("ClusterSemType" + a1SemType);
+        FSArray cons1 = ent1.getOntologyConceptArr();
+        if(cons1 != null){
+          for(int i = 0; i < cons1.size(); i++){
+            if(cons1.get(i) instanceof UmlsConcept){
+              a1Tuis.add(((UmlsConcept)cons1.get(i)).getTui());
+            }
+          }
+        }
+        for(IdentifiedAnnotation ent2 : mentionEnts){
+          HashSet<String> a2Tuis = new HashSet<>();
+          String a2SemType = ent2.getClass().getSimpleName();
+//          trueFeats.add("MentionSemType" + a2SemType);
+                   
+          if(alias(ent1, ent2)){
+            trueFeats.add("UMLS_ALIAS");
+          }
+
+          /*
+          if(!trueFeats.contains("UMLS_ALIAS") && isHypernym(ent1, ent2)){
+            trueFeats.add("IS_HYPERNYM");
+          }
+          
+          if(!trueFeats.contains("UMLS_ALIAS") && isHyponym(ent1, ent2)){
+            trueFeats.add("IS_HYPONYM");
+          }
+          */
+
+//          int pairDist = graphDistance(ent1, ent2);
+//          if(Math.abs(pairDist) < Math.abs(minDistance)){
+//            minDistance = pairDist;
+//          }
+          
+          trueFeats.add("MentionClusterSemTypePair" + a1SemType + "_" + a2SemType);
+          
+          FSArray cons2 = ent2.getOntologyConceptArr();
+          if(cons2 != null){
+            for(int i = 0; i < cons2.size(); i++){
+              if(cons2.get(i) instanceof UmlsConcept){
+                a2Tuis.add(((UmlsConcept)cons2.get(i)).getTui());
+              }
+            }
+          }
+          for(String tui1 : a1Tuis){
+//            trueFeats.add("ClusterTui_" +  tui1);
+            for(String tui2 : a2Tuis){
+//              trueFeats.add("ClusterTui_" + tui1 + "_MentionTui_ " + tui2);
+              if(tui1.equals(tui2)){
+                trueFeats.add("ClusterMentionTuiMatch");
+              }
+            }
+          }
+//          for(String tui2 : a2Tuis){
+//            trueFeats.add("MentionTui_" + tui2);
+//          }
+        }
+      }
+//      double distFeat = 0.0;
+//      if(minDistance != Integer.MAX_VALUE){
+//        distFeat = 1.0 / minDistance;
+//        if(distFeat < 0){
+//          feats.add(new Feature("AncestorDistance", -distFeat));
+//        }else{
+//          feats.add(new Feature("DescendentDistance", distFeat));
+//        }
+//      }        
+    }
+    
+    
+    for(String feat : trueFeats){
+      feats.add(new Feature(feat, true));
+    }
+    return feats;
+  }
+
+  @Override
+  public List<Feature> extract(JCas jCas, Markable mention) throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+    Set<String> trueFeats = new HashSet<>();
+    
+    if(docId == null || !getDocId(jCas).equals(docId)){
+        docId = getDocId(jCas);
+        coveringMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    }
+    
+    ConllDependencyNode head = MapFactory.get(getKey(jCas), mention);
+
+    List<IdentifiedAnnotation> rmList = new ArrayList<>();
+    // get the entities covering this markable:
+    List<IdentifiedAnnotation> mentionEnts = new ArrayList<>(coveringMap.get(head)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());'
+    for(IdentifiedAnnotation ann : mentionEnts){
+      if(!(ann instanceof EntityMention || ann instanceof EventMention) || ann.getClass() == EventMention.class){
+        rmList.add(ann);
+      }
+    }
+    for(IdentifiedAnnotation toRm : rmList){
+      mentionEnts.remove(toRm);
+    }
+
+    for(IdentifiedAnnotation ent : mentionEnts){
+      String a2SemType = ent.getClass().getSimpleName();
+      trueFeats.add("MentionSemType" + a2SemType);
+    }
+    
+    for(String feat : trueFeats){
+      feats.add(new Feature(feat, true));
+    }
+
+    return feats;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,69 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * Features describing where a markable sits inside its paragraph and which event
+ * class it covers.
+ */
+public class ClinicalFeatureExtractor implements FeatureExtractor1<Markable> {
+
+  /**
+   * @param jcas     CAS holding paragraphs, sentences and event mentions
+   * @param markable markable to describe
+   * @return a header/position feature when exactly one paragraph and one sentence
+   *         cover the markable, plus a "ClinSemType" feature for the longest covered
+   *         event with a positive type id, if any
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, Markable markable){
+    List<Feature> features = new ArrayList<>();
+
+    List<Paragraph> paragraphs = JCasUtil.selectCovering(jcas, Paragraph.class, markable);
+    List<Sentence> sentences = JCasUtil.selectCovering(jcas, Sentence.class, markable);
+    Sentence markableSentence = DependencyUtility.getSentence(jcas, markable);
+
+    if(paragraphs.size() == 1 && sentences.size() == 1){
+      List<Sentence> paragraphSentences = JCasUtil.selectCovered(Sentence.class, paragraphs.get(0));
+      int sentenceCount = paragraphSentences.size();
+      if(sentenceCount == 1){
+        // a paragraph that is exactly one sentence long is treated as a header
+        features.add(new Feature("ClinIsHeader", true));
+      }else{
+        // index of the sentence that is the same object as markableSentence;
+        // stays 0 when no such sentence is found
+        int position = 0;
+        int cursor = 0;
+        for(Sentence candidate : paragraphSentences){
+          if(candidate == markableSentence){
+            position = cursor;
+            break;
+          }
+          cursor++;
+        }
+
+        String positionFeature;
+        if(position < sentenceCount / 3){
+          positionFeature = "ClinSentPosBegin";
+        }else if(position > (2*sentenceCount / 3)){
+          positionFeature = "ClinSentPosEnd";
+        }else{
+          positionFeature = "ClinSentPosMiddle";
+        }
+        features.add(new Feature(positionFeature, true));
+      }
+    }
+
+    // pick the first event of maximal span among covered events with a positive type id
+    EventMention widest = null;
+    for(EventMention candidate : JCasUtil.selectCovered(EventMention.class, markable)){
+      if(candidate.getTypeID() <= 0){
+        continue;
+      }
+      if(widest == null || spanLength(candidate) > spanLength(widest)){
+        widest = candidate;
+      }
+    }
+    if(widest != null){
+      features.add(new Feature("ClinSemType" + widest.getClass().getSimpleName(), true));
+    }
+
+    return features;
+  }
+
+  /** Character length of the event's span. */
+  private static int spanLength(EventMention event){
+    return event.getEnd() - event.getBegin();
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,70 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/*
+ * Citations:
+ * Recasens, de Marneffe, Potts: The Life and Death of Discourse Entities: Identifying Singleton Mentions
+ * NAACL-HLT 2013 short paper, 627-633.
+ * 
+ * This class implements the features in Table 3. Since the descriptions of the features
+ * are highly ambiguous (e.g., Sentence Position=End as well as Sentence Position=Last),
+ * I looked at the source code for the system to determine precisely how the features
+ * were defined.
+ * First, last means literally first or last token in sentence.
+ * Begin, middle, and end mean which third of the sentence is it in.
+ */
+public class GrammaticalRoleFeatureExtractor implements FeatureExtractor1<Markable> {
+
+  /**
+   * Emits sentence-position features and one dependency-relation feature for the
+   * nominal head of the markable.
+   *
+   * @param jcas     CAS holding the dependency parse
+   * @param markable markable to describe
+   * @return the features; empty when no nominal head node is found
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, Markable markable)
+      throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    if(head == null){
+      return feats;
+    }
+    Sentence sent = DependencyUtility.getSentence(jcas, markable);
+    List<ConllDependencyNode> sentNodes = DependencyUtility.getDependencyNodes(jcas, sent);
+    int numNodes = sentNodes.size()-1; // remove root whole sentence node
+    int headId = head.getId();
+
+    feats.add(new Feature("GrammaticalRoleSentencePositionFirst", headId == 1));
+    feats.add(new Feature("GrammaticalRoleSentencePositionLast", headId == numNodes));
+
+    // Thirds are computed with integer division; the boundaries are kept exactly as
+    // they were so that feature values do not change.
+    if(headId < (numNodes / 3)){
+      feats.add(new Feature("GrammaticalRoleSentencePositionBegin", true));
+    }else if(headId > 2*(numNodes/3)){
+      feats.add(new Feature("GrammaticalRoleSentencePositionEnd", true));
+    }else{
+      feats.add(new Feature("GrammaticalRoleSentencePositionMiddle", true));
+    }
+
+    feats.add(new Feature(relationFeatureName(head.getDeprel()), true));
+
+    return feats;
+  }
+
+  /**
+   * Maps a dependency relation label to its feature name. Comparisons are
+   * constant-first so that a node without a relation label (null) is reported as
+   * "Other" instead of raising a NullPointerException.
+   */
+  private static String relationFeatureName(String deprel){
+    if("nsubj".equals(deprel)){
+      return "GrammaticalRoleRelSubj";
+    }
+    if("dobj".equals(deprel) || "iobj".equals(deprel)){
+      return "GrammaticalRoleRelVerbArg";
+    }
+    if("nn".equals(deprel)){
+      return "GrammaticalRoleRelNounArg";
+    }
+    if("root".equals(deprel)){
+      return "GrammaticalRoleRelRoot";
+    }
+    if("conj".equals(deprel)){
+      return "GrammaticalRoleRelConj";
+    }
+    return "GrammaticalRoleRelOther";
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,78 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/*
+ * Citations:
+ * Recasens, de Marneffe, Potts: The Life and Death of Discourse Entities: Identifying Singleton Mentions
+ * NAACL-HLT 2013 short paper, 627-633.
+ * 
+ * BBN corpus description (for the 18 named entity types)
+ * https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
+ * 
+ * This feature extractor is intended to implement the features in Table 2, described
+ * in the subsection of Section 3 called "Internal morphosyntax of the mention."
+ * Left off the table are the 18 NE types from CoNLL. Most of these are not relevant
+ * to our task, especially since we resolve person mentions with simple rules.
+ */
+public class MorphosyntacticFeatureExtractor implements FeatureExtractor1<Markable> {
+
+  /**
+   * Emits morphosyntactic features for the nominal head of the markable and the
+   * nodes returned as its progeny: pronoun, proper noun, plural, indefinite,
+   * numeric, and the modifier count.
+   *
+   * All tag, lemma and relation comparisons are null-safe, so a node that lacks
+   * one of these values contributes {@code false} instead of raising a
+   * NullPointerException.
+   *
+   * @param jcas     CAS holding the dependency parse
+   * @param markable markable to describe
+   * @return the features; empty when no nominal head node is found
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, Markable markable)
+      throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    if(head == null){
+      return feats;
+    }
+    List<ConllDependencyNode> covering = DependencyUtility.getProgeny(head, DependencyUtility.getDependencyNodes(jcas, DependencyUtility.getSentence(jcas, markable)));
+
+    String headPos = head.getPostag();
+
+    // 2 conditions -- head is a pronoun POS tag (he, she, it) like PRP or PRP$
+    // or head is a determiner (This, that) that does not have a determiner dependency relation
+    // -- usually marked as nsubj or dobj when used as pronoun (This was..., discussed this with...)
+    // but would be "det" when used as in "this discussion"
+    boolean isPronoun = head.getId() != 0 && headPos != null
+        && (headPos.startsWith("PRP")
+            || ("DT".equals(headPos) && !"det".equals(head.getDeprel())));
+    feats.add(new Feature("MorphoIsPronoun", isPronoun));
+
+    feats.add(new Feature("MorphoIsProper", "NNP".equals(headPos)));
+
+    // skip animacy and person features for now -- planning to not do person mentions
+
+    // replace singular/other with plural/other
+    feats.add(new Feature("MorphoPlural", "NNS".equals(headPos)));
+
+    boolean indefinite = false;
+    boolean containsNum = false;
+    for(ConllDependencyNode node : covering){
+      String pos = node.getPostag();
+      if("DT".equals(pos)){
+        String lemma = node.getLemma();
+        if("a".equals(lemma) || "an".equals(lemma)){
+          indefinite = true;
+        }
+      }
+
+      if("CD".equals(pos)){
+        containsNum = true;
+      }
+    }
+
+    feats.add(new Feature("MorphoIndefinite", indefinite));
+    feats.add(new Feature("MorphoNumeric", containsNum)); // lump together many NE types from OntoNotes (date, time, ordinal, percent, quantity)
+    feats.add(new Feature("MorphoNumModifiers", covering.size()-1));
+
+    return feats;
+  }
+
+}



Mime
View raw message