ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1666501 [1/2] - /ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/
Date Fri, 13 Mar 2015 16:23:14 GMT
Author: tmill
Date: Fri Mar 13 16:23:14 2015
New Revision: 1666501

URL: http://svn.apache.org/r1666501
Log:
A bunch of annotators refactored from temporal.

Added:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java
Modified:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainAnnotator.java?rev=1666501&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainAnnotator.java Fri Mar 13 16:23:14 2015
@@ -0,0 +1,123 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+
+/*
+ * Does not find coreference -- simply turns annotated pairs into chains of clustered mentions
+ */
+public class CoreferenceChainAnnotator extends JCasAnnotator_ImplBase {
+
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    Collection<CoreferenceRelation> pairs = JCasUtil.select(jCas, CoreferenceRelation.class);
+    Map<Annotation,Set<Annotation>> chains = new HashMap<>();
+    
+    for(CoreferenceRelation pair : pairs){
+      Annotation ante = pair.getArg1().getArgument();
+      Annotation ana = pair.getArg2().getArgument();
+      
+      /* 3 cases:
+       * 1) Only antecedent is in a chain -- add anaphor to that chain
+       * 2) Only anaphor is in a chain -- add antecedent to that chain
+       * 3) Both in different chains -- join the chains
+       * 4) Both in same chain -- do nothing
+       * 5) Neither in a chain -- create new chain
+       */
+      if(chains.containsKey(ante) && !chains.containsKey(ana)){
+        // 1
+        chains.get(ante).add(ana);
+        chains.put(ana, chains.get(ante));
+      }else if(chains.containsKey(ana) && !chains.containsKey(ante)){
+        // 2
+        chains.get(ana).add(ante);
+        chains.put(ante, chains.get(ana));
+      }else if(chains.containsKey(ante) && chains.containsKey(ana)){
+        if(!chains.get(ante).equals(chains.get(ana))){
+          // 3
+          Set<Annotation> anteChain = chains.get(ante);
+          Set<Annotation> anaChain = chains.get(ana);
+          anteChain.addAll(anaChain);
+          chains.put(ana, anteChain);
+          // make all annotations in ana chain point to ante chain:
+          for(Annotation markable : anaChain){
+            chains.put(markable, anteChain);
+          }
+        }
+        // else 4, which do nothing
+      }else{
+        // 5
+        Set<Annotation> newChain = new HashSet<Annotation>();
+        newChain.add(ante);
+        newChain.add(ana);
+        chains.put(ante, newChain);
+        chains.put(ana, newChain);
+      }
+    }
+    
+    // convert java Sets into ordered UIMA lists.
+    for(Set<Annotation> mentionSet : new HashSet<Set<Annotation>>(chains.values())){
+      List<Annotation> sortedMentions = new ArrayList<>(mentionSet);
+      Collections.sort(sortedMentions, new AnnotationComparator());
+      CollectionTextRelation chain = new CollectionTextRelation(jCas);
+      NonEmptyFSList list = new NonEmptyFSList(jCas);
+      chain.setMembers(list);
+      list.addToIndexes();
+      for(int i = 0; i < sortedMentions.size(); i++){
+        Annotation mention = sortedMentions.get(i);
+        list.setHead(mention);
+        if(i == (sortedMentions.size() - 1)){
+          list.setTail(new EmptyFSList(jCas));
+          list.getTail().addToIndexes();
+        }else{
+          list.setTail(new NonEmptyFSList(jCas));
+          list = (NonEmptyFSList) list.getTail();
+          list.addToIndexes();
+        }
+      }
+      chain.addToIndexes();
+    }
+  }
+
+  private class AnnotationComparator implements Comparator<Annotation> {
+
+    @Override
+    public int compare(Annotation o1, Annotation o2) {
+      if(o1.getBegin() < o2.getBegin()){
+        return -1;
+      }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() < o2.getEnd()){
+        return -1;
+      }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() > o2.getEnd()){
+        return 1;
+      }else if(o2.getBegin() < o1.getBegin()){
+        return 1;
+      }else{
+        return 0;
+      }
+    }
+  }
+  
+  /**
+   * Builds an engine description for this annotator without passing any
+   * configuration parameters.
+   *
+   * @return the description produced by {@link AnalysisEngineFactory}
+   * @throws ResourceInitializationException propagated from the factory call
+   */
+  public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException{
+    final Class<CoreferenceChainAnnotator> annotatorClass = CoreferenceChainAnnotator.class;
+    return AnalysisEngineFactory.createEngineDescription(annotatorClass);
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java?rev=1666501&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java Fri Mar 13 16:23:14 2015
@@ -0,0 +1,227 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+
+public class CoreferenceChainScoringOutput extends JCasAnnotator_ImplBase{
+  // The parameter's string value is kept as "OutputDirectory" so existing
+  // descriptors keep working, but the value is used as a file path: it is
+  // handed directly to new PrintWriter(String) in initialize().
+  public static final String PARAM_OUTPUT_FILENAME = "OutputDirectory";
+  @ConfigurationParameter(
+      name = PARAM_OUTPUT_FILENAME,
+      mandatory = true,
+      description = "Path of the file to write output to"
+      )
+  private String outputFilename;
+  // Writer for the scoring file; opened in initialize().
+  private PrintWriter out = null;
+  
+  public static final String PARAM_GOLD_VIEW_NAME = "GoldViewName";
+  @ConfigurationParameter(
+      name = PARAM_GOLD_VIEW_NAME,
+      mandatory = false,
+      description = "Name of gold view in jcas"
+      )
+  private String goldViewName = null;
+  
+  private int docNum = 0;
+  
+  /**
+   * Opens the output file named by the {@link #PARAM_OUTPUT_FILENAME}
+   * parameter for writing.
+   *
+   * @param context the UIMA context handed on to the superclass
+   * @throws ResourceInitializationException if the superclass fails to
+   *         initialize or the output file cannot be opened
+   */
+  @Override
+  public void initialize(final UimaContext context) throws ResourceInitializationException{
+    super.initialize(context);
+
+    try {
+      out = new PrintWriter(outputFilename);
+    } catch (FileNotFoundException e) {
+      // the cause travels with the rethrown exception, so it is not printed separately
+      throw new ResourceInitializationException(e);
+    }
+  }
+
+  /**
+   * Closes the output file once the whole collection has been processed, so
+   * the file handle opened in {@link #initialize(UimaContext)} is released.
+   *
+   * @throws AnalysisEngineProcessException propagated from the superclass
+   */
+  @Override
+  public void collectionProcessComplete() throws AnalysisEngineProcessException {
+    super.collectionProcessComplete();
+    if(out != null){
+      out.close();
+    }
+  }
+  
+  /**
+   * Prints the coreference chains of one document to stdout and appends the
+   * document to the output file, one token per line. The last column of each
+   * line holds the chain markers for that token: {@code (N} where a mention of
+   * chain N starts, {@code N)} where one ends, {@code (N)} for a mention that
+   * covers exactly this token, joined with {@code |}, or {@code _} when there
+   * are none.
+   * <p>
+   * Chains and markables are read from the view named by
+   * {@code goldViewName} when it is set, otherwise from {@code jCas} itself;
+   * tokens are always read from {@code jCas}. A document without any
+   * {@link CollectionTextRelation} is skipped entirely.
+   *
+   * @param jCas the CAS of the document to write
+   * @throws AnalysisEngineProcessException if the gold view cannot be obtained
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    File filename = new File(ViewUriUtil.getURI(jCas));
+    JCas chainsCas = null;
+    try {
+      chainsCas = goldViewName != null ? jCas.getView(goldViewName) : jCas;
+    } catch (CASException e) {
+      // the cause travels with the rethrown exception, so it is not printed separately
+      throw new AnalysisEngineProcessException(e);
+    }
+    int chainNum = 1;
+    // maps each mention of a non-singleton chain to the number of its chain
+    HashMap<Annotation, Integer> ent2chain = new HashMap<>();
+    if(goldViewName != null) System.out.println("\nGold chains:");
+    else System.out.println("\nSystem chains:");
+    Collection<CollectionTextRelation> rels = JCasUtil.select(chainsCas, CollectionTextRelation.class);
+    if(rels.isEmpty()){
+      // no chains: nothing is written for this document and docNum is not advanced
+      return;
+    }
+    for(CollectionTextRelation chain : rels){
+      FSList members = chain.getMembers();
+      // if we are doing cluster-mention coreference, some clusters will be singletons, we do not use those in conll scoring
+      if(members instanceof NonEmptyFSList &&
+          ((NonEmptyFSList)members).getTail() instanceof EmptyFSList) continue;
+
+      while(members instanceof NonEmptyFSList){
+        NonEmptyFSList cell = (NonEmptyFSList) members;
+        Annotation mention = (Annotation) cell.getHead();
+        ent2chain.put(mention, chainNum);
+        members = cell.getTail();
+        System.out.print("Mention: " + mention.getCoveredText());
+        System.out.print(" (" + mention.getBegin() + ", " + mention.getEnd() + ")");
+        System.out.print("  ----->    ");
+      }
+      System.out.println();
+      chainNum++;
+    }
+
+    // Here we are using newline tokens to delimit sentences because the sentence
+    // breaks that cTAKES creates may not be correct and some gold markables might
+    // wrap sentences which might be confusing to the consumer of this file.
+    out.println("#begin document (" + filename.getPath() + "); part 000");
+    List<BaseToken> tokens = new ArrayList<>(JCasUtil.select(jCas, BaseToken.class));
+    // chain numbers whose opening marker has been written
+    // NOTE(review): entries are never removed when the closing marker is
+    // written, so this only tracks "opened at least once" -- confirm that
+    // this is the intended guard.
+    Multiset<Integer> endSet = HashMultiset.create();
+    int tokenId = 0;
+
+    for(BaseToken token : tokens){
+      // if we see a newline token at the end of a sentence break the sentence
+      // only print out if we are not at the start of the sentence:
+      if(token instanceof NewlineToken){
+        if(tokenId > 0){
+          out.println();
+          tokenId = 0;
+        }
+        continue;
+      }
+
+      String tokenText = token.getCoveredText();
+      int lastInd = token.getEnd();
+      // fix for some bad tokenization
+      if(tokenText.length() > 1 && tokenText.endsWith(".")){
+        lastInd = token.getEnd()-1;
+      }
+      List<Markable> markables = new ArrayList<>(JCasUtil.selectCovering(chainsCas, Markable.class, token.getBegin(), lastInd));
+      List<Annotation> startMention = new ArrayList<>();
+      Multiset<Integer> endMention = HashMultiset.create();
+      List<Integer> wholeMention = new ArrayList<>();
+
+      // classify every chained markable covering this token as starting here,
+      // ending here, or spanning exactly this token
+      for(Annotation markable : markables){
+        if(ent2chain.containsKey(markable)){
+          if(markable.getBegin() == token.getBegin()){
+            if(markable.getEnd() == token.getEnd()){
+              wholeMention.add(ent2chain.get(markable));
+            }else{
+              startMention.add(markable);
+            }
+          }else if(markable.getEnd() <= token.getEnd()){
+            if(endMention.contains(ent2chain.get(markable))){
+              System.err.println("There is a duplicate element -- should be handled by multiset");
+            }
+            if(markable.getEnd() < token.getEnd()){
+              System.err.println("There is a markable that ends in the middle of a token!");
+            }
+            endMention.add(ent2chain.get(markable));
+          }
+        }
+      }
+
+      out.print(filename.getPath());
+      out.print('\t');
+      out.print(docNum);
+      out.print('\t');
+      out.print(tokenId++);
+      out.print('\t');
+      // newline tokens never reach this point, so the covered text is always printed
+      out.print(tokenText);
+      out.print('\t');
+      out.print(token.getPartOfSpeech());
+      out.print('\t');
+      // parse bit -- can ignore?
+      out.print('-');  out.print('\t');
+      // predicate lemma -- can ignore?
+      out.print('-'); out.print('\t');
+      // predicate frameset id -- can ignore?
+      out.print('-'); out.print('\t');
+      // word sense
+      out.print('-'); out.print('\t');
+      // speaker/author
+      out.print('-'); out.print('\t');
+      // named entities
+      out.print('*'); out.print('\t');
+
+      StringBuilder buff = new StringBuilder();
+      // closing markers first, but only for chains whose opening marker was written
+      for(int ind : endMention){
+        if(endSet.contains(ind)){
+          buff.append(ind);
+          buff.append(')');
+          buff.append('|');
+        }
+      }
+      for(int ind : wholeMention){
+        buff.append('(');
+        buff.append(ind);
+        buff.append(')');
+        buff.append('|');
+      }
+      // sort start mentions by ordering of ending: the mention that ends
+      // last is opened first
+      while(startMention.size() > 0){
+        Annotation latestEnd = null;
+        for(Annotation candidate : startMention){
+          if(latestEnd == null || candidate.getEnd() > latestEnd.getEnd()){
+            latestEnd = candidate;
+          }
+        }
+        startMention.remove(latestEnd);
+        int ind = ent2chain.get(latestEnd);
+        buff.append('(');
+        buff.append(ind);
+        buff.append('|');
+        endSet.add(ind);
+      }
+      if(buff.length() > 0){
+        // drop the trailing '|' separator
+        out.println(buff.substring(0,  buff.length()-1));
+      }else{
+        out.println("_");
+      }
+    }
+    out.println("#end document " + filename.getPath());
+    out.flush();
+    docNum++;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java?rev=1666501&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java Fri Mar 13 16:23:14 2015
@@ -0,0 +1,169 @@
+package org.apache.ctakes.coreference.ae;
+
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.getDependencyNodes;
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.getProgeny;
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.getSentence;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.temporal.eval.THYMEData;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+public class DeterministicMarkableAnnotator extends JCasAnnotator_ImplBase {
+
+  // list starters like A.  or #1    or 3)
+  static Pattern headerPatt = Pattern.compile("^(([A-Z][\\.\\:\\)])|(#\\d+)|(\\d+[\\.\\:\\)])) *");
+
+  @Override
+  public void initialize(UimaContext uc) throws ResourceInitializationException{
+    super.initialize(uc); // no annotator-specific setup; only delegates to the base class
+  }
+
+  /**
+   * Adds markables to the CAS using the dependency-tree strategy.
+   *
+   * @param jCas the CAS to add markables to
+   * @throws AnalysisEngineProcessException declared by the overridden method
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    // The constituency-tree strategy (createMarkablesUsingConstituencyTrees) is currently disabled.
+    createMarkablesUsingDependencyTrees(jCas);
+  }
+
+  /**
+   * Creates a {@link Markable} for selected dependency nodes of every
+   * {@link Segment}. Nodes with id 0 and nodes whose text is only punctuation
+   * are ignored. Of the rest, three kinds produce a markable:
+   * <ol>
+   * <li>nouns tagged NN or NNS (except whitespace forms and a fixed list of
+   * excluded words); unless the node's head has id 0, the span is widened to
+   * cover the progeny left after {@code removeConjunctionNodes};</li>
+   * <li>nodes tagged DT whose dependency relation is not "det";</li>
+   * <li>the word "it" when its dependency relation does not contain "pass".</li>
+   * </ol>
+   *
+   * @param jCas the CAS to read dependency nodes from and add markables to
+   */
+  private static void createMarkablesUsingDependencyTrees(JCas jCas) {
+    for(Segment seg : JCasUtil.select(jCas, Segment.class)){
+      for(ConllDependencyNode node : JCasUtil.selectCovered(jCas, ConllDependencyNode.class, seg)){
+        String nodeText = node.getCoveredText().toLowerCase();
+        if(node.getId() == 0 || nodeText.matches("\\p{Punct}+")){
+          continue;
+        }
+
+        String pos = node.getPostag();
+        if(pos.equals("NN") || pos.equals("NNS")){
+          if(node.getForm().matches("\\s+")){
+            continue;
+          }
+          // TODO fix this godawful hack:
+          boolean excludedWord = nodeText.equals("date") || nodeText.equals("tablet") || nodeText.equals("hg")
+              || nodeText.equals("lb") || nodeText.equals("status") || nodeText.equals("capsule")
+              || nodeText.equals("mg") || nodeText.equals("cm");
+          if(excludedWord){
+            continue;
+          }
+
+          int spanBegin = node.getBegin();
+          int spanEnd = node.getEnd();
+          if(node.getHead().getId() != 0){
+            List<ConllDependencyNode> dependents = removeConjunctionNodes(node,
+                getProgeny(node, getDependencyNodes(jCas, getSentence(jCas, node))));
+            // stretch the span over every remaining dependent
+            for(ConllDependencyNode dependent : dependents){
+              spanBegin = Math.min(spanBegin, dependent.getBegin());
+              spanEnd = Math.max(spanEnd, dependent.getEnd());
+            }
+          }
+          new Markable(jCas, spanBegin, spanEnd).addToIndexes();
+        }else if(pos.equals("DT") && !node.getDeprel().equals("det")){
+          new Markable(jCas, node.getBegin(), node.getEnd()).addToIndexes();
+        }else if(nodeText.equals("it") && !node.getDeprel().contains("pass")){
+          new Markable(jCas, node.getBegin(), node.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+
+  /**
+   * Filters a node's progeny, keeping the node itself and every other node
+   * whose path to it (as returned by {@link DependencyUtility#getPath}) has
+   * no element with dependency relation "conj" or "cc" or part-of-speech tag
+   * "." or ",". The original node is not examined when checking the path.
+   *
+   * @param originalNode the node whose progeny is being filtered
+   * @param progeny the candidate nodes
+   * @return a new list holding the nodes that passed the filter, each at most
+   *         once per occurrence in {@code progeny}
+   */
+  private static List<ConllDependencyNode> removeConjunctionNodes(ConllDependencyNode originalNode,
+      List<ConllDependencyNode> progeny) {
+    List<ConllDependencyNode> filtered = new ArrayList<>();
+
+    for(ConllDependencyNode node: progeny){
+      if(node == originalNode){
+        // always kept; skip the path check so it is not added a second time below
+        filtered.add(node);
+        continue;
+      }
+
+      boolean blockedByConj = false;
+      for(ConllDependencyNode pathEl : DependencyUtility.getPath(progeny, node, originalNode)){
+        if(pathEl == originalNode) continue;
+        if(pathEl.getDeprel().equals("conj") || pathEl.getDeprel().equals("cc") || pathEl.getPostag().equals(".") || pathEl.getPostag().equals(",")){
+          blockedByConj = true;
+          break;
+        }
+      }
+      if(!blockedByConj){
+        filtered.add(node);
+      }
+    }
+
+    return filtered;
+  }
+
+  private static void createMarkablesUsingConstituencyTrees(JCas jCas) {
+    // personal pronouns:
+//  for(WordToken token : JCasUtil.select(jCas, WordToken.class)){
+//    if(token.getPartOfSpeech().startsWith("PRP") ||
+//        token.getCoveredText().equalsIgnoreCase("here")){
+//      Markable markable = new Markable(jCas, token.getBegin(), token.getEnd());
+//      markable.addToIndexes();
+//    }
+//  }
+
+  // NPs:
+    for(TreebankNode tree : JCasUtil.select(jCas, TreebankNode.class)){
+      if(tree.getNodeType().equals("NP")){
+        String nodeText = tree.getCoveredText();
+        // cases to skip: 1) already included by pos tag above
+        // 2) existential "there"
+        // 3) proper names 
+        // 4) numbers
+        if(tree.getChildren().size() == 1){
+          if(tree.getChildren(0).getNodeType().equals("PRP") ||
+              tree.getChildren(0).getNodeType().equals("EX") ||
+              tree.getChildren(0).getNodeType().equals("CD")) {
+            continue;
+          }
+        }
+        Markable markable = null;
+        Matcher m = headerPatt.matcher(nodeText);
+        int start = tree.getBegin();
+        int end = tree.getEnd();
+        if(m.find()){
+          start = start + m.end();
+        }
+        if((nodeText.endsWith(".") || nodeText.endsWith(":")) && end-1 > start){
+          end = end-1;
+          //            System.err.println("Adjusting end with pair: (" + start + ", " + end + ")");
+        }
+
+        markable = new Markable(jCas, start, end);
+        markable.addToIndexes();
+
+        // N* modifiers of NPs: (
+        for(int i = 0; i < tree.getChildren().size()-1; i++){
+          TreebankNode child = tree.getChildren(i);
+          if(child instanceof TerminalTreebankNode && child.getNodeType().startsWith("N") && !child.getNodeType().equals("NNP")){
+            markable = new Markable(jCas, child.getBegin(), child.getEnd());
+            markable.addToIndexes();
+          }
+        }
+      }
+    }
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java?rev=1666501&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java Fri Mar 13 16:23:14 2015
@@ -0,0 +1,754 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.features.ChainStackFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.CorefSyntaxFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.DistSemFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.SectionFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.TemporalFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.features.DependencyPathFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.DependencyTreeFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.NamedEntityFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.PartOfSpeechFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.PhraseChunkingExtractor;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.TokenFeaturesExtractor;
+import org.apache.ctakes.temporal.ae.feature.treekernel.DocumentStructureTreeExtractor;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.FloatArray;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.CleartkProcessingException;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+public class EventCoreferenceAnnotator extends RelationExtractorAnnotator {
+
+
+  public static final int DEFAULT_SENT_DIST = 5;
+  public static final String PARAM_SENT_DIST = "SentenceDistance";
+  @ConfigurationParameter(name = PARAM_SENT_DIST, mandatory = false, description = "Number of sentences allowed between coreferent mentions")
+  private int maxSentDist = DEFAULT_SENT_DIST;
+  
+  public static final double DEFAULT_PAR_SIM = 0.5;
+  public static final String PARAM_PAR_SIM = "PararaphSimilarity";
+  @ConfigurationParameter(name = PARAM_PAR_SIM, mandatory = false, description = "Similarity required to pair paragraphs for coreference")
+  private double simThreshold = DEFAULT_PAR_SIM;
+  
+  private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
+  private Map<Markable,Set<String>> markableEnts = null;
+  private List<Markable> markablesByConfidence = null;
+  private Map<Annotation,NonEmptyFSList> chains = null;
+
+  private Logger logger = Logger.getLogger(EventCoreferenceAnnotator.class);
+  
+  /**
+   * Builds an engine description that runs this annotator in training mode.
+   *
+   * @param dataWriterClass value for {@code PARAM_DATA_WRITER_CLASS_NAME}
+   * @param outputDirectory value for {@code PARAM_OUTPUT_DIRECTORY}
+   * @param downsamplingRate value for
+   *        {@code PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE}
+   * @return the description produced by {@link AnalysisEngineFactory}
+   * @throws ResourceInitializationException propagated from the factory call
+   */
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<String>> dataWriterClass,
+      File outputDirectory,
+      float downsamplingRate) throws ResourceInitializationException {
+    // name/value pairs, in the order the factory expects them
+    Object[] configuration = {
+        CleartkAnnotator.PARAM_IS_TRAINING, true,
+        RelationExtractorAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE, downsamplingRate,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, outputDirectory
+    };
+    return AnalysisEngineFactory.createEngineDescription(EventCoreferenceAnnotator.class, configuration);
+  }
+  
+  public static AnalysisEngineDescription createAnnotatorDescription(String modelPath)
+      throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        EventCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        modelPath);
+  }
+  
+  @Override
+  protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
+    List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> featureExtractorList = new ArrayList<>();
+    
+    // pick and choose from base class:
+    featureExtractorList.add(new TokenFeaturesExtractor());
+    featureExtractorList.add(new PartOfSpeechFeaturesExtractor());
+    featureExtractorList.add(new PhraseChunkingExtractor());
+    featureExtractorList.add(new NamedEntityFeaturesExtractor());
+    featureExtractorList.add(new DependencyTreeFeaturesExtractor());
+    featureExtractorList.add(new DependencyPathFeaturesExtractor());
+    
+//    featureList.add(new DistanceFeatureExtractor());
+    featureExtractorList.add(new StringMatchingFeatureExtractor());
+    featureExtractorList.add(new TokenFeatureExtractor());
+    featureExtractorList.add(new SectionFeatureExtractor());
+    featureExtractorList.add(new UMLSFeatureExtractor());
+    featureExtractorList.add(new CorefSyntaxFeatureExtractor());
+    featureExtractorList.add(new TemporalFeatureExtractor());
+//    featureExtractorList.add(new ChainStackFeatureExtractor());
+    
+    featureExtractorList.add(new DocumentStructureTreeExtractor());
+    try{
+      featureExtractorList.add(new DistSemFeatureExtractor());
+    }catch(IOException e){
+      e.printStackTrace();
+    }
+    
+    return featureExtractorList;
+  }
+  
+  /**
+   * Resets the per-document state, runs the inherited pair
+   * classification/training, and, when not training, publishes every list
+   * held in {@code chains} as a {@link CollectionTextRelation}. Documents
+   * without any {@link CoreferenceRelation} are skipped during training.
+   *
+   * @param jCas the CAS of the document being processed
+   * @throws AnalysisEngineProcessException propagated from the superclass
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    if(this.isTraining() && JCasUtil.select(jCas, CoreferenceRelation.class).isEmpty()){
+      logger.debug("Skipping document with no gold standard coreference relations.");
+      return;
+    }
+    numClassifications = 0;
+    nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    markableEnts = new HashMap<>();
+    chains = new HashMap<>();
+    markablesByConfidence = new ArrayList<>(JCasUtil.select(jCas, Markable.class));
+    Collections.sort(markablesByConfidence, new MarkableConfidenceComparator());
+    for(Markable m : markablesByConfidence){
+      markableEnts.put(m, getBestEnt(jCas, m));
+    }
+    super.process(jCas);
+    if(!this.isTraining()){
+      // several mentions can map to the same list head, so de-duplicate first
+      for(NonEmptyFSList chainHead : new HashSet<>(chains.values())){
+        CollectionTextRelation chain = new CollectionTextRelation(jCas);
+        chain.setMembers(chainHead);
+
+        // walk to the last cell; the type check (not a blind cast) keeps this
+        // safe if a list already ends in an EmptyFSList
+        NonEmptyFSList cur = chainHead;
+        FSList next = cur.getTail();
+        while(next instanceof NonEmptyFSList){
+          cur = (NonEmptyFSList) next;
+          next = cur.getTail();
+        }
+        if(next == null){
+          // the list is still open-ended: terminate it
+          EmptyFSList tail = new EmptyFSList(jCas);
+          tail.addToIndexes();
+          cur.setTail(tail);
+        }
+
+        chain.addToIndexes();
+      }
+    }
+    logger.debug("This document had : " + numClassifications + " pair classifications");
+    foundAnaphors.clear();
+    chains.clear();
+  }
+
+  // No additional end-of-collection work is done here; the call is forwarded unchanged
+  // to the superclass implementation.
+  @Override
+  public void collectionProcessComplete() throws AnalysisEngineProcessException {
+    super.collectionProcessComplete();
+  }
+
+  /**
+   * Supplies the candidate (antecedent, anaphor) pairs for one segment as a PairIterable.
+   * The candidate lists are gathered when the PairIterable is constructed; pairs are
+   * filtered against already-linked anaphors while iterating.
+   */
+  @Override
+  protected Iterable<IdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+      JCas jcas, Annotation segment) {
+    
+    return new PairIterable(jcas, segment);
+  }
+
+  /**
+   * Pairs every markable inside {@code segment} with the markables that precede it in the
+   * document-wide markable list, scanning backwards from the anaphor.
+   *
+   * Antecedents below {@code confidence} are skipped. The backwards scan stops once the
+   * sentence distance exceeds maxSentDist, unless the anaphor is typed as an anatomical site
+   * or a medication. When both markables carry entity types, at least one type must be shared.
+   *
+   * @param jcas       the CAS holding the markables
+   * @param segment    only anaphors covered by this annotation are considered
+   * @param confidence minimum confidence required of an antecedent
+   * @return candidate pairs, antecedent first
+   */
+  public List<IdentifiedAnnotationPair> getClosePairs(JCas jcas, Annotation segment, double confidence){
+    List<IdentifiedAnnotationPair> closePairs = new ArrayList<>();
+    List<Markable> allMarkables = new ArrayList<>(JCasUtil.select(jcas, Markable.class));
+    String siteType = AnatomicalSiteMention.class.getSimpleName();
+    String medicationType = MedicationEventMention.class.getSimpleName();
+
+    for(int anaIdx = 1; anaIdx < allMarkables.size(); anaIdx++){
+      Markable ana = allMarkables.get(anaIdx);
+      // only look at anaphors w/in this segment:
+      if(!dominates(segment, ana)){
+        continue;
+      }
+      Set<String> anaTypes = getBestEnt(jcas, ana);
+      // anatomical sites and medications are exempt from the sentence-distance cutoff
+      boolean distanceExempt = anaTypes.contains(siteType) || anaTypes.contains(medicationType);
+
+      for(int anteIdx = anaIdx - 1; anteIdx >= 0; anteIdx--){
+        Markable ante = allMarkables.get(anteIdx);
+        if(ante.getConfidence() < confidence){
+          continue;
+        }
+        if(!distanceExempt && sentDist(jcas, ante, ana) > maxSentDist){
+          break;
+        }
+
+        Set<String> anteTypes = getBestEnt(jcas, ante);
+        // if both are typed the types must overlap; if neither or only one is typed
+        // the pair is allowed through.
+        boolean bothTyped = !anaTypes.isEmpty() && !anteTypes.isEmpty();
+        if(bothTyped && Collections.disjoint(anaTypes, anteTypes)){
+          continue;
+        }
+        closePairs.add(new IdentifiedAnnotationPair(ante, ana));
+      }
+    }
+    return closePairs;
+  }
+
+  public Set<String> getBestEnt(JCas jcas, Markable markable){
+    if(markableEnts.containsKey(markable)) return markableEnts.get(markable);
+//    markableEnts.put(markable, new HashSet<String>());
+    Set<String> bestEnts = new HashSet<>();
+    IdentifiedAnnotation bestEnt = null;
+    Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+    for(IdentifiedAnnotation ent : coveringEnts){
+      if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+      ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+      if(entHead == head){
+        if(bestEnt == null){
+          bestEnt = ent;
+        }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+          // if the span of this entity is bigger than the biggest existing one:
+          bestEnt = ent;
+          otherBestEnts = new HashSet<>();
+        }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+          // there is another one with the exact same span and possibly different type!
+          otherBestEnts.add(ent);
+        }
+      }
+    }
+
+    if(bestEnt!=null){
+      bestEnts.add(bestEnt.getClass().getSimpleName());
+//      markableEnts.get(markable).add(bestEnt.getClass().getSimpleName());
+      for(IdentifiedAnnotation other : otherBestEnts){
+        bestEnts.add(other.getClass().getSimpleName());
+//        markableEnts.get(markable).add(other.getClass().getSimpleName());
+      }
+    }
+    return bestEnts;
+//    return markableEnts.get(markable);
+  }
+  /**
+   * Tells whether the span of {@code arg1} contains the span of {@code arg2}
+   * (identical spans count as containment).
+   */
+  public static boolean dominates(Annotation arg1, Annotation arg2) {
+    if(arg1.getBegin() > arg2.getBegin()){
+      return false;
+    }
+    return arg1.getEnd() >= arg2.getEnd();
+  }
+
+  /**
+   * Pairs markables that occur within the same paragraph. For each markable the preceding
+   * markables of its paragraph are visited backwards until the sentence distance exceeds
+   * maxSentDist.
+   *
+   * @param jcas    the CAS holding paragraphs and markables
+   * @param segment not used by this method; all paragraphs in the CAS are scanned
+   * @return candidate pairs, antecedent first
+   */
+  public List<IdentifiedAnnotationPair> getParagraphPairs(JCas jcas, Annotation segment){
+    List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+
+    // CODE FOR PARAGRAPH-BASED MATCHING
+    for(Paragraph par : JCasUtil.select(jcas, Paragraph.class)){
+      // get all pairs within this paragraph
+      List<Markable> curParMarkables = JCasUtil.selectCovered(Markable.class, par);
+      for(int anaId = 1; anaId < curParMarkables.size(); anaId++){
+        Markable ana = curParMarkables.get(anaId);
+        for(int anteId = anaId-1; anteId >= 0; anteId--){
+          Markable ante = curParMarkables.get(anteId);
+          if(sentDist(jcas, ante, ana) > maxSentDist) break;
+          pairs.add(new IdentifiedAnnotationPair(ante, ana));
+        }
+      }
+    }
+    return pairs;
+  }
+  
+  /**
+   * Pairs markables across paragraphs whose vectors are similar. Each paragraph is compared
+   * with every earlier paragraph using calculateSimilarity; when the similarity exceeds
+   * simThreshold, each markable of the later paragraph is paired with the markables of the
+   * earlier one, visited backwards until the sentence distance exceeds maxSentDist.
+   *
+   * The paragraph vectors are read from the single FSArray in the CAS
+   * (assumes element i belongs to the i-th Paragraph -- TODO confirm against the producer).
+   *
+   * @param jcas    the CAS holding paragraphs, markables and the vector array
+   * @param segment not used by this method
+   * @return candidate pairs, antecedent first
+   */
+  public List<IdentifiedAnnotationPair> getSimilarPairs(JCas jcas, Annotation segment){
+    List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+    FSArray parVecs = JCasUtil.selectSingle(jcas, FSArray.class);
+    List<Paragraph> pars = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+
+    for(int i = 0; i < pars.size(); i++){
+      List<Markable> curParMarkables = JCasUtil.selectCovered(Markable.class, pars.get(i));
+      FloatArray parVec = (FloatArray) parVecs.get(i);
+      for(int j = i-1; j >= 0; j--){
+        // every (i, j) is visited exactly once, so the similarity needs no cache
+        double sim = calculateSimilarity(parVec, (FloatArray) parVecs.get(j));
+        if(!(sim > simThreshold)){
+          continue;
+        }
+
+        // pair up all markables in each paragraph
+        List<Markable> prevParMarkables = JCasUtil.selectCovered(Markable.class, pars.get(j));
+        for(int anaId = 0; anaId < curParMarkables.size(); anaId++){
+          Markable ana = curParMarkables.get(anaId);
+          for(int anteId = prevParMarkables.size()-1; anteId >= 0; anteId--){
+            Markable ante = prevParMarkables.get(anteId);
+            if(sentDist(jcas, ante, ana) > maxSentDist) break;
+            pairs.add(new IdentifiedAnnotationPair(ante, ana));
+          }
+        }
+      }
+    }
+    return pairs;
+  }
+  
+  /**
+   * Pairs each markable in {@code segment} with the high-confidence markables of the document.
+   *
+   * Antecedent candidates are read from markablesByConfidence (sorted by descending confidence
+   * in process()), so the scan stops at the first candidate below {@code threshold}.
+   * Candidates that begin and end after the anaphor are skipped. If both markables have
+   * entity types recorded in markableEnts, the types must overlap.
+   *
+   * @param jcas      the CAS (not read directly here)
+   * @param segment   anaphors are the markables covered by this annotation
+   * @param threshold minimum antecedent confidence
+   * @return candidate pairs, antecedent first
+   */
+  public List<IdentifiedAnnotationPair> getConfidentPairs(JCas jcas, Annotation segment, double threshold){
+    List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+    List<Markable> anas = JCasUtil.selectCovered(Markable.class, segment);
+
+    for(Markable ana : anas){
+      Set<String> anaTypes = markableEnts.get(ana);
+      for(Markable ante : markablesByConfidence){
+        // if we are into the unconfident
+        if(ante.getConfidence() < threshold){
+          break;
+        }
+
+        // if the candidate antecedent is after the anaphor skip it.
+        // (the end offsets must be compared between ante and ana; comparing ante with
+        // itself made this filter a no-op)
+        if(ante.getBegin() > ana.getBegin() && ante.getEnd() > ana.getEnd()){
+          continue;
+        }
+
+        // an untyped anaphor or an untyped antecedent always matches; otherwise the
+        // two type sets need at least one element in common
+        Set<String> anteTypes = markableEnts.get(ante);
+        boolean match = anaTypes.isEmpty()
+            || anteTypes.isEmpty()
+            || !Collections.disjoint(anaTypes, anteTypes);
+
+        if(match){
+          pairs.add(new IdentifiedAnnotationPair(ante, ana));
+        }
+      }
+    }
+    return pairs;
+  }
+  /*
+   * Markables that are in a section header are highly salient and prime candidates
+   * as antecedents in coreference. We detect headers as sentences that are the only sentence in a paragraph.
+   * This is probably high recall with some precision hits but thats ok for now. 
+   */
+  public List<IdentifiedAnnotationPair> getSectionHeaderPairs(JCas jcas, Annotation segment, double confidence){
+    List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+    List<Markable> markables = JCasUtil.selectCovered(jcas, Markable.class, segment);
+    for(int i = 0; i < markables.size(); i++){
+      IdentifiedAnnotation ana = markables.get(i);
+      List<Paragraph> pars = JCasUtil.selectCovered(jcas, Paragraph.class, 0, ana.getBegin());
+      for(int j = 0; j < pars.size(); j++){
+        Paragraph par = pars.get(j); // pars.get(pars.size()-j-1);
+        List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
+        if(coveredSents != null && coveredSents.size() == 1){
+          for(Markable anteCandidate : JCasUtil.selectCovered(jcas, Markable.class, par)){
+            if(anteCandidate.getConfidence() > confidence){
+              pairs.add(new IdentifiedAnnotationPair(anteCandidate, ana));
+            }
+          }
+        }
+      }
+    }
+    return pairs;
+  }
+
+  /**
+   * Pairs each markable in the segment with the closest preceding member of every chain
+   * (CollectionTextRelation) found in the CAS.
+   *
+   * Only the training branch is active: the non-training branch is commented out, so outside
+   * of training this method returns an empty list.
+   *
+   * @param jcas    the CAS holding the chains and markables
+   * @param segment anaphors are the markables covered by this annotation
+   * @return candidate pairs, antecedent first
+   */
+  public List<IdentifiedAnnotationPair> getAlreadyLinkedPairs(JCas jcas, Annotation segment){
+    List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+    List<Markable> segMarkables = new ArrayList<>(JCasUtil.selectCovered(jcas, Markable.class, segment));
+    
+    // if we are testing, there are no chains in the cas yet so we have to look at the 
+    // intermediate data structures we use.
+    for(int i = 0; i < segMarkables.size(); i++){
+      Markable ana = segMarkables.get(i);
+      if(this.isTraining()){
+        for(CollectionTextRelation chain : JCasUtil.select(jcas, CollectionTextRelation.class)){
+          FSList head = chain.getMembers();
+          // the last chain member seen that ends no later than the anaphor and has a different span
+          Markable last = null;
+          // walk the list cell by cell; the loop ends at the first cell that is not a NonEmptyFSList
+          while(head instanceof NonEmptyFSList){
+            Markable m = (Markable) ((NonEmptyFSList)head).getHead();
+
+            // ignore markables past the current anaphor or equal to it
+            if(m == null || m.getEnd() > ana.getEnd()){
+              break;
+            }
+            if(!(m.getBegin() == ana.getBegin() && m.getEnd() == ana.getEnd())){
+              last = m;
+            }
+            head = ((NonEmptyFSList)head).getTail();
+          }
+          if(last != null){
+            pairs.add(new IdentifiedAnnotationPair(last, ana));
+          }
+        }
+//      }else{
+//        for(LinkedHashSet<Markable> chain : chains.values()){
+//          Markable last = null;
+//          for(Markable element : chain){
+//            last = element;
+//          }
+//          pairs.add(new IdentifiedAnnotationPair(last, ana));
+//        }
+      }
+    }
+    
+    return pairs;
+  }
+  
+  // Counts the classifier call in numClassifications, then delegates to the superclass.
+  @Override
+  protected String classify(List<Feature> features)
+      throws CleartkProcessingException {
+    numClassifications++;
+    return super.classify(features);
+  }
+  
+  // Candidate pairs are generated per Segment annotation.
+  @Override
+  protected Class<? extends Annotation> getCoveringClass() {
+    return Segment.class;
+  }
+  
+  // The relation type handled by this annotator is CoreferenceRelation.
+  @Override
+  protected Class<? extends BinaryTextRelation> getRelationClass() {
+    return CoreferenceRelation.class;
+  }
+
+  // Anaphors that already have an antecedent in the current document; filled by
+  // createRelation() and getRelationCategory(), cleared at the end of process().
+  protected HashSet<IdentifiedAnnotation> foundAnaphors = new HashSet<>(); 
+  // Running counter reset at the start of process(); incremented in classify() and in
+  // PairIterator.next().
+  int numClassifications = 0;
+  
+  /**
+   * Records a predicted coreference link: adds a CoreferenceRelation (with its two
+   * RelationArguments) to the CAS, marks the anaphor as found, and appends the anaphor to the
+   * antecedent's chain, creating a new two-element chain when the antecedent has none.
+   *
+   * Chain cells are left with a null tail here; process() terminates the lists afterwards.
+   * An anaphor that was already linked is not linked again; an error is logged instead.
+   */
+  @Override
+  protected void createRelation(
+      JCas jCas,
+      IdentifiedAnnotation ante,
+      IdentifiedAnnotation ana,
+      String predictedCategory) {
+    // check if its already been linked
+    if(foundAnaphors.contains(ana)){
+      logger.error("Greedy coreference resolution violated -- anaphor linked to two candidate antecedents!");
+      return;
+    }
+
+    // add the relation to the CAS
+    RelationArgument antecedentArg = new RelationArgument(jCas);
+    antecedentArg.setArgument(ante);
+    antecedentArg.setRole("Antecedent");
+    antecedentArg.addToIndexes();
+
+    RelationArgument anaphorArg = new RelationArgument(jCas);
+    anaphorArg.setArgument(ana);
+    anaphorArg.setRole("Anaphor");
+    anaphorArg.addToIndexes();
+
+    CoreferenceRelation link = new CoreferenceRelation(jCas);
+    link.setArg1(antecedentArg);
+    link.setArg2(anaphorArg);
+    link.setCategory(predictedCategory);
+    link.addToIndexes();
+    foundAnaphors.add(ana);
+
+    if(chains.containsKey(ante)){
+      // antecedent already belongs to a chain: append the anaphor at its end
+      NonEmptyFSList chainHead = chains.get(ante);
+      NonEmptyFSList anaCell = new NonEmptyFSList(jCas);
+      anaCell.setHead(ana);
+      anaCell.setTail(null);
+
+      NonEmptyFSList lastCell = chainHead;
+      while(lastCell.getTail() != null){
+        lastCell = (NonEmptyFSList) lastCell.getTail();
+      }
+      lastCell.setTail(anaCell);
+
+      chains.put(ana, chainHead);
+      anaCell.addToIndexes();
+    }else{
+      // new chain
+      NonEmptyFSList anteCell = new NonEmptyFSList(jCas);
+      NonEmptyFSList anaCell = new NonEmptyFSList(jCas);
+      anteCell.setHead(ante);
+      anaCell.setHead(ana);
+      anteCell.setTail(anaCell);
+      anaCell.setTail(null);
+      chains.put(ante, anteCell);
+      chains.put(ana, anteCell);
+      anaCell.addToIndexes();
+      anteCell.addToIndexes();
+    }
+  }
+  
+  /**
+   * Looks up the category for a pair via the superclass and logs a DISTSALIENCE line
+   * (sentence distance, antecedent confidence, label) for it. Pairs with a real coreference
+   * category are always logged and their anaphor is marked as found; other pairs are logged
+   * for a random ~10% sample.
+   *
+   * @return the category returned by the superclass, unchanged
+   */
+  @Override
+  protected String getRelationCategory(
+      Map<List<Annotation>, BinaryTextRelation> relationLookup,
+      IdentifiedAnnotation ante, IdentifiedAnnotation ana) {
+    String cat = super.getRelationCategory(relationLookup, ante, ana);
+    int dist = sentsBetween(ante, ana);
+
+    boolean isCoreferent = cat != null && !cat.equals(NO_RELATION_CATEGORY);
+    if(isCoreferent){
+      // cat is some coref category
+      foundAnaphors.add(ana);
+      logger.info(String.format("DISTSALIENCE: (%d,%f,1)\n", dist, ante.getConfidence()));
+    }else if(Math.random() < 0.1){
+      // sample 10 percent of negative examples:
+      logger.info(String.format("DISTSALIENCE: (%d,%f,0)\n", dist, ante.getConfidence()));
+    }
+    return cat;
+  }
+
+  /**
+   * Counts the Sentence annotations covered by the span running from the begin of
+   * {@code arg1} to the end of {@code arg2}.
+   */
+  public static int sentDist(JCas jcas, IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2){
+    List<Sentence> spanned = JCasUtil.selectCovered(jcas, Sentence.class, arg1.getBegin(), arg2.getEnd());
+    return spanned.size();
+  }
+  
+  /**
+   * Counts the Sentence annotations that JCasUtil.selectBetween reports between the two
+   * annotations.
+   */
+  public static int sentsBetween(IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2) {
+    return JCasUtil.selectBetween(Sentence.class, arg1, arg2).size();
+  }
+  
+  /**
+   * Cosine similarity of two vectors: dot product divided by the product of the two
+   * Euclidean norms. Iterates over the length of {@code f1}.
+   */
+  private static double calculateSimilarity(FloatArray f1, FloatArray f2){
+    double dot = 0.0;
+    double normSq1 = 0.0;
+    double normSq2 = 0.0;
+
+    for(int d = 0; d < f1.size(); d++){
+      float a = f1.get(d);
+      float b = f2.get(d);
+      // products are formed in float precision and then accumulated as double
+      dot += (a * b);
+      normSq1 += (a * a);
+      normSq2 += (b * b);
+    }
+
+    double norm1 = Math.sqrt(normSq1);
+    double norm2 = Math.sqrt(normSq2);
+    return dot / (norm1 * norm2);
+  }
+  
+  /**
+   * Iterable wrapper around a single PairIterator. The iterator is created once in the
+   * constructor and the same instance is handed out by every call to iterator().
+   */
+  class PairIterable implements Iterable<IdentifiedAnnotationPair> {
+
+    PairIterator iter = null;
+
+    public PairIterable(JCas jcas, Annotation segment){
+      this.iter = new PairIterator(jcas, segment);
+    }
+
+    @Override
+    public Iterator<IdentifiedAnnotationPair> iterator() {
+      return this.iter;
+    }
+
+  }
+  
+  /**
+   * Iterator over the candidate pairs of one segment. Candidates are gathered in the
+   * constructor; while iterating, pairs where one markable contains the other and pairs whose
+   * anaphor is already in foundAnaphors are skipped. The foundAnaphors check is made when
+   * hasNext() looks for the next pair, so links created between iterations are respected.
+   *
+   * hasNext() may be called repeatedly without skipping a pair, and next() throws
+   * NoSuchElementException once the candidates are exhausted.
+   */
+  class PairIterator implements Iterator<IdentifiedAnnotationPair> {
+
+    JCas jcas = null;
+    Annotation segment = null;
+    // need 2 passes -- first for preliminary pairs, then for linking to
+    // existing chains - could be FIXME'd by creating uima chains as we go instead
+    // of using placeholder chains but that is substantially more complicated.
+    List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+    List<IdentifiedAnnotationPair> pass2Pairs = null;
+    IdentifiedAnnotationPair next = null;
+    // read positions in the two lists (advancing an index avoids the repeated
+    // element shifting that removing from the front of an ArrayList causes)
+    private int pairsPos = 0;
+    private int pass2Pos = 0;
+    // true while 'next' holds a pair found by hasNext() that next() has not handed out yet
+    private boolean nextPending = false;
+
+    public PairIterator(JCas jcas, Annotation segment) {
+      this.jcas = jcas;
+      this.segment = segment;
+
+      pairs.addAll(getClosePairs(jcas, segment, 0.0));
+      pairs.addAll(getSectionHeaderPairs(jcas, segment, 0.0));
+//
+//      pairs.addAll(getConfidentPairs(jcas, segment, 0.25));
+//      if(!isTraining()){
+//        Collections.sort(pairs, new MarkableConfidenceComparator());
+//        Collections.sort(pairs, new IdentifiedAnnotationPairComparator());
+//      }
+    }
+
+    @Override
+    public boolean hasNext() {
+      if(nextPending){
+        return true;
+      }
+
+      while(pairsPos < pairs.size()){
+        if(accept(pairs.get(pairsPos++))){
+          return true;
+        }
+      }
+
+      if(pass2Pairs == null){
+        pass2Pairs = new ArrayList<>();
+//        pass2Pairs.addAll(getAlreadyLinkedPairs(this.jcas, this.segment));
+      }
+
+      while(pass2Pos < pass2Pairs.size()){
+        if(accept(pass2Pairs.get(pass2Pos++))){
+          return true;
+        }
+      }
+
+      return false; // if we get this far then there were no good candidates
+    }
+
+    // Stores the candidate as the pending result if it passes both filters.
+    private boolean accept(IdentifiedAnnotationPair candidate){
+      IdentifiedAnnotation ante = candidate.getArg1();
+      IdentifiedAnnotation ana = candidate.getArg2();
+      if(dominates(ante, ana) || dominates(ana,ante)){
+        return false;
+      }
+      if(foundAnaphors.contains(ana)){
+        return false;
+      }
+      next = candidate;
+      nextPending = true;
+      return true;
+    }
+
+    @Override
+    public IdentifiedAnnotationPair next() {
+      if(!hasNext()){
+        throw new java.util.NoSuchElementException("No more candidate coreference pairs");
+      }
+      nextPending = false;
+      numClassifications++;
+      return next;
+    }
+
+    @Override
+    public void remove() {
+      // Optional implementation: intentionally a no-op, as before
+    }
+
+  }
+  
+  /**
+   * Orders pairs by anaphor position (begin, then end, ascending) and, for pairs with the
+   * same anaphor span, by antecedent confidence, highest first.
+   */
+  public class MarkablePairConfidenceComparator implements
+  Comparator<IdentifiedAnnotationPair> {
+
+    public int compare(IdentifiedAnnotationPair o1, IdentifiedAnnotationPair o2) {
+      if(o1 == o2) return 0;
+      IdentifiedAnnotation ante1 = o1.getArg1();
+      IdentifiedAnnotation ante2 = o2.getArg1();
+      IdentifiedAnnotation ana1 = o1.getArg2();
+      IdentifiedAnnotation ana2 = o2.getArg2();
+
+      // first level sorting is by anaphor:
+      if(ana1.getBegin() != ana2.getBegin()){
+        return Integer.compare(ana1.getBegin(), ana2.getBegin());
+      }
+      if(ana1.getEnd() != ana2.getEnd()){
+        return Integer.compare(ana1.getEnd(), ana2.getEnd());
+      }
+
+      // same anaphor span: the more confident antecedent sorts first
+      if(ante1.getConfidence() > ante2.getConfidence()){
+        return -1;
+      }
+      if(ante1.getConfidence() < ante2.getConfidence()){
+        return 1;
+      }
+      return 0;
+    }
+
+  }
+
+  /**
+   * Orders markables by confidence, highest first.
+   */
+  public class MarkableConfidenceComparator implements Comparator<Markable> {
+    public int compare(Markable m1, Markable m2){
+      if(m1 == m2){
+        return 0;
+      }
+      if(m1.getConfidence() > m2.getConfidence()){
+        return -1;
+      }
+      return (m1.getConfidence() < m2.getConfidence()) ? 1 : 0;
+    }
+  }
+  
+  /**
+   * Orders pairs by position: anaphor begin, anaphor end, antecedent begin, antecedent end,
+   * all ascending.
+   */
+  public class IdentifiedAnnotationPairComparator implements Comparator<IdentifiedAnnotationPair> {
+
+    public int compare(IdentifiedAnnotationPair o1, IdentifiedAnnotationPair o2) {
+      if(o1 == o2) return 0;
+      IdentifiedAnnotation ante1 = o1.getArg1();
+      IdentifiedAnnotation ante2 = o2.getArg1();
+      IdentifiedAnnotation ana1 = o1.getArg2();
+      IdentifiedAnnotation ana2 = o2.getArg2();
+
+      // first level sorting is by anaphor:
+      if(ana1.getBegin() != ana2.getBegin()){
+        return Integer.compare(ana1.getBegin(), ana2.getBegin());
+      }
+      if(ana1.getEnd() != ana2.getEnd()){
+        return Integer.compare(ana1.getEnd(), ana2.getEnd());
+      }
+      // then by antecedent:
+      if(ante1.getBegin() != ante2.getBegin()){
+        return Integer.compare(ante1.getBegin(), ante2.getBegin());
+      }
+      if(ante1.getEnd() != ante2.getEnd()){
+        return Integer.compare(ante1.getEnd(), ante2.getEnd());
+      }
+      return 0;
+    }
+
+  }
+  
+  private class AnnotationComparator implements Comparator<Annotation> {
+
+    public AnnotationComparator() {
+    }
+
+    @Override
+    public int compare(Annotation o1, Annotation o2) {
+      if(o1.getBegin() < o2.getBegin()){
+        return -1;
+      }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() < o2.getEnd()){
+        return -1;
+      }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() > o2.getEnd()){
+        return 1;
+      }else if(o2.getBegin() < o1.getBegin()){
+        return 1;
+      }else{
+        return 0;
+      }
+    }
+  }
+
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java?rev=1666501&r1=1666500&r2=1666501&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java Fri Mar 13 16:23:14 2015
@@ -15,11 +15,15 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
-import org.uimafit.component.JCasAnnotator_ImplBase;
-import org.uimafit.descriptor.ConfigurationParameter;
 
+/*
+ * Reads a particular kind of format used by the first generation ctakes
+ * coreference resolver from the ODIE corpus.
+ */
 public class GoldCoreferenceReader extends JCasAnnotator_ImplBase {
 
 	public static final String PARAM_INPUT_DIR = "inputDirectory";

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableAnnotator.java?rev=1666501&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableAnnotator.java Fri Mar 13 16:23:14 2015
@@ -0,0 +1,225 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.ctakes.temporal.ae.TemporalEntityAnnotator_ImplBase;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.ml.feature.extractor.CombinedExtractor1;
+import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.extractor.TypePathExtractor;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+public class MarkableAnnotator extends TemporalEntityAnnotator_ImplBase {
+
+  public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = "ProbabilityOfKeepingANegativeExample";
+
+  @ConfigurationParameter(
+      name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+      mandatory = false,
+      description = "probability that a negative example should be retained for training")
+  protected Float probabilityOfKeepingANegativeExample = 1.0f;
+
+  public static final String PARAM_IS_RETRAINING = "IsRetraining";
+  @ConfigurationParameter(
+      name = PARAM_IS_RETRAINING,
+      mandatory = false,
+      description = "Is this version supposed to do retraining (as in positive unlabeled learning)?")
+  protected boolean isRetraining = false;
+
+  public static final double SCORE_THRESHOLD = 0.05;
+  
+  protected FeatureExtractor1 wordTypeExtractor;
+
+  protected FeatureExtractor1 tokenFeatureExtractor;
+
+  protected CleartkExtractor contextFeatureExtractor;
+  
+  protected Random rand = new Random();
+
+  private static final String NON_MARKABLE = "NON_MARKABLE";
+  private static final String MARKABLE = "MARKABLE";
+
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<?> dataWriter,
+      File outputDirectory,
+      float downratio) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        MarkableAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriter,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory,
+        MarkableAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+        downratio);
+  }
+
+  public static AnalysisEngineDescription createAnnotatorDescription()
+      throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        MarkableAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        String.format(
+            "/%s/model.jar",
+            MarkableAnnotator.class.getName().toLowerCase().replace('.', '/')));
+  }
+
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
+
+    CombinedExtractor1 allExtractors = new CombinedExtractor1(
+        new CoveredTextExtractor(),
+        new TypePathExtractor(BaseToken.class, "partOfSpeech"));
+    this.contextFeatureExtractor = new CleartkExtractor(
+        BaseToken.class,
+        this.tokenFeatureExtractor,
+        new Preceding(3),
+        new Following(3));
+    
+    this.tokenFeatureExtractor = new CleartkExtractor(BaseToken.class, allExtractors, new Bag(new Covered()));
+    
+//    if(this.isRetraining){
+//      this.dataWriter = initializeDataWriter(context);
+//    }
+  }
+  
+  @Override
+  public void process(JCas jCas, Segment segment) throws AnalysisEngineProcessException {
+    HashSet<Markable> mentions = new HashSet<>(JCasUtil.selectCovered(Markable.class, segment));
+        
+    for(TopTreebankNode root : JCasUtil.selectCovered(TopTreebankNode.class, segment)){
+      Map<Integer, IdentifiedAnnotation> entEnds = new HashMap<>();
+      HashSet<EventMention> events = new HashSet<>(JCasUtil.selectCovered(EventMention.class, root));
+      HashSet<EntityMention> entities = new HashSet<>(JCasUtil.selectCovered(EntityMention.class, root));
+      for(EventMention event : events){
+        entEnds.put(event.getEnd(), event);
+      }
+      for(EntityMention entity : entities){
+        entEnds.put(entity.getEnd(), entity);
+      }
+      recursivelyProcessNode(jCas, root.getChildren(0), mentions, entEnds, 0);
+    }
+  }
+
+  private void recursivelyProcessNode(JCas jCas, TreebankNode node, Set<Markable> mentions, Map<Integer,IdentifiedAnnotation> entEnds, int depth) throws AnalysisEngineProcessException {
+    ArrayList<Feature> features = new ArrayList<>();
+    String category = NON_MARKABLE;
+
+    // node-based features
+    if(node.getParent().getParent() == null) features.add(new Feature("IS_ROOT", true));
+    features.add(new Feature("NODE_LABEL", node.getNodeType()));
+    features.add(new Feature("PARENT_LABEL", node.getParent().getNodeType()));
+    features.add(new Feature("NODE_DEPTH_MORE3", depth > 2));
+    
+    if(node.getLeaf()){
+      features.add(new Feature("IS_LEAF"));
+    }else{
+      StringBuilder buffer = new StringBuilder();
+      for(int i = 0; i < node.getChildren().size(); i++){
+        buffer.append(node.getChildren(i).getNodeType());
+        buffer.append("_");
+        features.add(new Feature("CHILD_BAG", node.getChildren(i).getNodeType()));
+      }
+      features.add(new Feature("PRODUCTION", buffer.toString()));
+      features.add(new Feature("LAST_CHILD", node.getChildren(node.getChildren().size()-1).getNodeType()));
+      features.add(new Feature("FIRST_CHILD", node.getChildren(0).getNodeType()));
+      features.add(new Feature("NODE_UNARY", node.getChildren().size() == 1));
+    }
+    
+    if(entEnds.containsKey(node.getEnd())){
+      IdentifiedAnnotation ent = entEnds.get(node.getEnd());
+      features.add(new Feature("NODE_ENDS_ENT", true));
+      if(ent instanceof EventMention){
+        features.add(new Feature("NODE_ENT_TYPE_" + ent.getTypeID(), true));
+      }
+    }
+    
+    features.addAll(tokenFeatureExtractor.extract(jCas, node));
+      
+    if(this.isTraining() || this.isRetraining){
+      List<Markable> goldMarkables = JCasUtil.selectCovered(Markable.class, node);
+      for(Markable markable : goldMarkables){
+        if(markable.getBegin() == node.getBegin() && markable.getEnd() == node.getEnd()){
+          category = MARKABLE;
+          mentions.remove(markable);
+        }else if(node.getChildren() != null && markable.getBegin() >= node.getBegin() && markable.getEnd() <= node.getEnd()){
+          // check if the mention is crossing two child brackets, if so then just call it a mention:
+          for(int i = 0; i < node.getChildren().size(); i++){
+            TreebankNode child = node.getChildren(i);
+            if( (markable.getBegin() <= child.getBegin() && markable.getEnd() > child.getEnd()) ||
+                (markable.getBegin() < child.getBegin() && markable.getEnd() >= child.getEnd())){
+              category = MARKABLE;
+              mentions.remove(markable);
+            }
+          }
+        }
+      }
+      if(this.isTraining() || category.equals(MARKABLE)){
+        this.dataWriter.write(new Instance<>(category, features));
+      }else{
+        // for re-training, only write non-markables if they are past some threshold:
+//        Map<String,Double> outcomes = this.classifier.score(features);
+//        if(outcomes.get(MARKABLE) < 0.005){
+//          this.dataWriter.write(new Instance<>(category, features));
+//        }
+      }
+    }else{
+      category = this.classifier.classify(features);
+      double score = 0.0;
+      Map<String,Double> outcomes = this.classifier.score(features);
+      
+      score = outcomes.get(MARKABLE);
+      
+      if(category.equals(MARKABLE) || score > SCORE_THRESHOLD){
+        // add to cas
+        Markable markable = new Markable(jCas, node.getBegin(), node.getEnd());
+        markable.addToIndexes();
+      }
+    }
+
+    // now do children if not a leaf & not a mention
+    if(node.getLeaf()) return;
+    
+    for(int i = 0; i < node.getChildren().size(); i++){
+      TreebankNode child = node.getChildren(i);
+      recursivelyProcessNode(jCas, child, mentions, entEnds, depth+1);
+    }
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java?rev=1666501&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java Fri Mar 13 16:23:14 2015
@@ -0,0 +1,87 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.ctakes.coreference.ae.features.salience.ClinicalFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.salience.GrammaticalRoleFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.salience.MorphosyntacticFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.salience.SemanticEnvironmentFeatureExtractor;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+/**
+ * ClearTK annotator that works on every {@link Markable} in the CAS.
+ * <p>
+ * In training mode each markable is written out as one Boolean instance whose
+ * outcome is {@code true} when the markable's current confidence is greater
+ * than 0.5. In classification mode the classifier's score for the
+ * {@code true} outcome is stored in the markable's confidence feature.
+ */
+public class MarkableSalienceAnnotator extends CleartkAnnotator<Boolean> {
+
+  /** Feature extractors applied to each markable; filled in by {@link #initialize(UimaContext)}. */
+  List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+
+  /**
+   * Builds an engine description that runs this annotator in training mode.
+   *
+   * @param dataWriterClass
+   *          data writer implementation used to serialize training instances
+   * @param outputDirectory
+   *          directory handed to the data writer factory
+   * @return the configured engine description
+   * @throws ResourceInitializationException
+   *           if the description cannot be created
+   */
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<Boolean>> dataWriterClass,
+      File outputDirectory) throws ResourceInitializationException{
+    return AnalysisEngineFactory.createEngineDescription(
+        MarkableSalienceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory);
+  }
+
+  /**
+   * Builds an engine description that runs this annotator in classification
+   * mode.
+   *
+   * @param modelPath
+   *          value passed as the classifier jar path
+   * @return the configured engine description
+   * @throws ResourceInitializationException
+   *           if the description cannot be created
+   */
+  public static AnalysisEngineDescription createAnnotatorDescription(String modelPath) throws ResourceInitializationException{
+    return AnalysisEngineFactory.createEngineDescription(
+        MarkableSalienceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        modelPath);
+  }
+
+  /**
+   * Initializes the ClearTK base class and registers the feature extractors
+   * used by {@link #process(JCas)}.
+   */
+  @Override
+  public void initialize(UimaContext context)
+      throws ResourceInitializationException {
+    super.initialize(context);
+
+    extractors.add(new MorphosyntacticFeatureExtractor());
+    extractors.add(new GrammaticalRoleFeatureExtractor());
+    extractors.add(new SemanticEnvironmentFeatureExtractor());
+    extractors.add(new ClinicalFeatureExtractor());
+  }
+
+  /**
+   * Extracts features for every markable and either writes a training
+   * instance or stores the classifier's score on the markable.
+   *
+   * @param jcas
+   *          the CAS whose markables are processed
+   * @throws AnalysisEngineProcessException
+   *           if feature extraction, writing or scoring fails, or if the
+   *           classifier returns no score for the {@code true} outcome
+   */
+  @Override
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+
+    for(Markable markable : JCasUtil.select(jcas, Markable.class)){
+      List<Feature> features = new ArrayList<>();
+      for(FeatureExtractor1<Markable> extractor : extractors){
+        features.addAll(extractor.extract(jcas, markable));
+      }
+
+      if(this.isTraining()){
+        // the label is derived from the confidence already stored on the markable
+        boolean outcome = markable.getConfidence() > 0.5;
+        Instance<Boolean> instance = new Instance<>(features);
+        instance.setOutcome(outcome);
+        this.dataWriter.write(instance);
+      }else{
+        Map<Boolean,Double> outcomes = this.classifier.score(features);
+        Double salience = outcomes.get(Boolean.TRUE);
+        if(salience == null){
+          // fail with a readable cause instead of a bare NullPointerException
+          throw new AnalysisEngineProcessException(
+              new IllegalStateException("Classifier returned no score for the 'true' outcome"));
+        }
+        markable.setConfidence(salience.floatValue());
+      }
+    }
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1666501&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Fri Mar 13 16:23:14 2015
@@ -0,0 +1,316 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterPartOfSpeechFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.CleartkProcessingException;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+import org.cleartk.util.ViewUriUtil;
+
+import com.google.common.collect.Lists;
+
+/**
+ * ClearTK annotator that decides, for each {@link Markable}, whether it
+ * belongs to one of the {@link CollectionTextRelation} clusters already in the
+ * CAS.
+ * <p>
+ * In training mode every (cluster, mention) candidate is written out as an
+ * instance labelled with the category of the matching gold relation or with
+ * {@link #NO_RELATION_CATEGORY}. In classification mode the mention is
+ * appended to the first candidate cluster for which the classifier predicts a
+ * category other than {@link #NO_RELATION_CATEGORY}. A mention that ends up
+ * attached to no cluster is placed in a new single-member cluster.
+ */
+public class MentionClusterCoreferenceAnnotator extends CleartkAnnotator<String> {
+  /** Label used for candidate pairs that are not related. */
+  public static final String NO_RELATION_CATEGORY = "-NONE-";
+  public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE =
+      "ProbabilityOfKeepingANegativeExample";
+  @ConfigurationParameter(
+      name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+      mandatory = false,
+      description = "probability that a negative example should be retained for training")
+  protected double probabilityOfKeepingANegativeExample = 0.5;
+
+  /** Source of randomness for negative-example downsampling; seeded with a constant. */
+  protected Random coin = new Random(0);
+
+  /** Candidate clusters whose most recent member is further away than this (per {@code EventCoreferenceAnnotator.sentDist}) are skipped. */
+  private static final int MAX_SENTENCE_DISTANCE = 5;
+
+  /**
+   * Builds an engine description that runs this annotator in training mode.
+   *
+   * @param dataWriterClass
+   *          data writer implementation used to serialize training instances
+   * @param outputDirectory
+   *          directory handed to the data writer factory
+   * @param downsamplingRate
+   *          value for {@link #PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE}
+   * @return the configured engine description
+   * @throws ResourceInitializationException
+   *           if the description cannot be created
+   */
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<String>> dataWriterClass,
+      File outputDirectory,
+      float downsamplingRate) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        // use the parameter name this class registers on its own field
+        PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+        downsamplingRate,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory);
+  }
+
+  /**
+   * Builds an engine description that runs this annotator in classification
+   * mode.
+   *
+   * @param modelPath
+   *          value passed as the classifier jar path
+   * @return the configured engine description
+   * @throws ResourceInitializationException
+   *           if the description cannot be created
+   */
+  public static AnalysisEngineDescription createAnnotatorDescription(
+      String modelPath) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        modelPath);
+  }
+
+  private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> featureExtractors = this.getFeatureExtractors();
+
+  /**
+   * Supplies the feature extractors applied to every (cluster, mention)
+   * candidate. Note that this is invoked from a field initializer, i.e. while
+   * the instance is still being constructed.
+   *
+   * @return the extractors to use
+   */
+  protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() {
+    return Lists.newArrayList(
+        new MentionClusterAgreementFeaturesExtractor(),
+//        new MentionClusterPartOfSpeechFeaturesExtractor(),
+        new MentionClusterStringFeaturesExtractor()
+        );
+  }
+
+  /**
+   * Collects the clusters that the given mention may be attached to.
+   * <p>
+   * A cluster is skipped when it has no non-empty member list, when its first
+   * member is missing, when the mention does not begin after the end of the
+   * first member, or when the distance reported by
+   * {@code EventCoreferenceAnnotator.sentDist} between the cluster's most
+   * recent member and the mention exceeds {@link #MAX_SENTENCE_DISTANCE}.
+   *
+   * @param jcas
+   *          the CAS holding the clusters
+   * @param mention
+   *          the mention to find candidate clusters for
+   * @return one pair per candidate cluster, in the order the clusters are
+   *         returned by the CAS index
+   */
+  protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+      JCas jcas,
+      IdentifiedAnnotation mention){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      // an empty or unset member list cannot be cast to NonEmptyFSList and offers no antecedent
+      if(!(cluster.getMembers() instanceof NonEmptyFSList)) continue;
+      NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()) continue;
+
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) getMostRecent(members, mention);
+      if(EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > MAX_SENTENCE_DISTANCE) continue;
+      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    }
+    return pairs;
+  }
+
+  /**
+   * Writes training instances or predicts cluster membership for every
+   * {@link Markable} covered by a {@link Segment}.
+   *
+   * @param jCas
+   *          the CAS to process
+   * @throws AnalysisEngineProcessException
+   *           if feature extraction, writing or classification fails
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    // lookup from (cluster, mention) pair to the relation linking them
+    // note: assumes that there will be at most one relation per pair
+    Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup =
+        new HashMap<>();
+    if (this.isTraining()) {
+      for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) {
+        for(IdentifiedAnnotation mention : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
+          CollectionTextRelationIdentifiedAnnotationRelation relation =
+              new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+          relation.setCluster(cluster);
+          relation.setMention(mention);
+          relation.setCategory("CoreferenceClusterMember");
+          relation.addToIndexes();
+          CollectionTextRelationIdentifiedAnnotationPair key = new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention);
+          if(relationLookup.containsKey(key)){
+            // report the clash; the newer relation still replaces the older one below
+            String cat = relationLookup.get(key).getCategory();
+            System.err.println("Error in: "+ ViewUriUtil.getURI(jCas).toString());
+            System.err.println("Error! This attempted relation " + relation.getCategory() + " already has a relation " + cat + " at this span: " + mention.getCoveredText());
+          }
+          relationLookup.put(key, relation);
+        }
+      }
+    }
+
+
+    for(Segment segment : JCasUtil.select(jCas, Segment.class)){
+      for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+        boolean singleton = true;
+        for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){
+          CollectionTextRelation cluster = pair.getCluster();
+          // apply all the feature extractors to extract the list of features
+          List<Feature> features = new ArrayList<>();
+          for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.featureExtractors) {
+            List<Feature> feats = extractor.extract(jCas, cluster, mention);
+            if (feats != null)  features.addAll(feats);
+          }
+
+          // sanity check on feature values: replace nulls with a placeholder and report them
+          for (Feature feature : features) {
+            if (feature.getValue() == null) {
+              feature.setValue("NULL");
+              String message = String.format("Null value found in %s from %s", feature, features);
+              System.err.println(message);
+            }
+          }
+
+          // during training, feed the features to the data writer
+          if (this.isTraining()) {
+            String category = this.getRelationCategory(relationLookup, cluster, mention);
+            if (category == null) {
+              // negative example dropped by downsampling
+              continue;
+            }
+
+            // create a classification instance and write it to the training data
+            this.dataWriter.write(new Instance<>(category, features));
+          }
+
+          // during classification feed the features to the classifier and create
+          // annotations
+          else {
+            String predictedCategory = this.classify(features);
+
+            // add a relation annotation if a true relation was predicted
+            if (!predictedCategory.equals(NO_RELATION_CATEGORY)) {
+              createRelation(jCas, cluster, mention, predictedCategory);
+              singleton = false;
+              // break here for "closest-first" greedy decoding strategy (Soon et al., 2001), terminology from Lasalle and Denis (2013),
+              // for "best first" need to keep track of all relations with scores and only keep the highest
+              break;
+            }
+          }
+        }
+        // if we got this far and never attached the markable to an existing cluster
+        // (this flag is never cleared in training mode, so it always holds there)
+        if(singleton){
+          // make the markable its own cluster:
+          CollectionTextRelation chain = new CollectionTextRelation(jCas);
+          NonEmptyFSList list = new NonEmptyFSList(jCas);
+          list.setHead(mention);
+          list.setTail(new EmptyFSList(jCas));
+          chain.setMembers(list);
+          chain.addToIndexes();
+          list.addToIndexes();
+          list.getTail().addToIndexes();
+        }
+      }
+    }
+  }
+
+  /**
+   * Looks up the (cluster, mention) pair in the specified lookup table and
+   * converts the relation into a label for classification. Pairs without a
+   * relation are kept as negative examples with probability
+   * {@link #probabilityOfKeepingANegativeExample}.
+   *
+   * @param relationLookup
+   *          relations keyed by (cluster, mention) pair
+   * @param cluster
+   *          candidate cluster
+   * @param mention
+   *          candidate mention
+   * @return If this category should not be processed for training return
+   *         <i>null</i> otherwise it returns the label sent to the datawriter
+   */
+  protected String getRelationCategory(
+      Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) {
+    CollectionTextRelationIdentifiedAnnotationRelation relation =
+        relationLookup.get(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    String category;
+    if (relation != null) {
+      category = relation.getCategory();
+    } else if (coin.nextDouble() <= this.probabilityOfKeepingANegativeExample) {
+      category = NO_RELATION_CATEGORY;
+    } else {
+      category = null;
+    }
+    return category;
+  }
+
+  /**
+   * Predict an outcome given a set of features. By default, this simply
+   * delegates to the object's <code>classifier</code>. Subclasses may override
+   * this method to implement more complex classification procedures.
+   * 
+   * @param features
+   *          The features to be classified.
+   * @return The predicted outcome (label) for the features.
+   * @throws CleartkProcessingException
+   *           if the classifier fails
+   */
+  protected String classify(List<Feature> features) throws CleartkProcessingException {
+    return this.classifier.classify(features);
+  }
+
+  /**
+   * Records that a mention belongs to a cluster: adds a
+   * {@link CollectionTextRelationIdentifiedAnnotationRelation} to the CAS
+   * indexes and appends the mention to the cluster's member list. Subclasses
+   * may override this to create their own types.
+   * 
+   * @param jCas
+   *          - JCas object, needed to create new UIMA types
+   * @param cluster
+   *          - Cluster the mention is attached to
+   * @param mention
+   *          - Mention being attached
+   * @param predictedCategory
+   *          - Name of relation
+   */
+  protected void createRelation(
+      JCas jCas,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention,
+      String predictedCategory) {
+    // add the relation to the CAS
+    CollectionTextRelationIdentifiedAnnotationRelation relation = new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+    relation.setCluster(cluster);
+    relation.setMention(mention);
+    relation.setCategory(predictedCategory);
+    relation.addToIndexes();
+
+    ListFactory.append(jCas, cluster.getMembers(), mention);
+  }
+
+  /**
+   * Walks the member list from its head and returns the last member reached
+   * before hitting one that does not end before the focus annotation ends.
+   * The head is returned when no later member qualifies.
+   * NOTE(review): this presumably relies on the members being in document
+   * order -- confirm against the code that builds the lists.
+   *
+   * @param list
+   *          member list of a cluster
+   * @param focus
+   *          annotation the members are compared against
+   * @return the selected member
+   */
+  private static Annotation getMostRecent(NonEmptyFSList list, Annotation focus){
+    NonEmptyFSList cur = list;
+    Annotation annot = (Annotation) cur.getHead();
+
+    while(cur.getTail() instanceof NonEmptyFSList){
+      cur = (NonEmptyFSList) cur.getTail();
+      if(((Annotation)cur.getHead()).getEnd() < focus.getEnd()){
+        annot = (Annotation) cur.getHead();
+      }else{
+        break;
+      }
+    }
+
+    return annot;
+  }
+
+  /**
+   * Immutable (cluster, mention) pair usable as a hash key. Two pairs are
+   * equal when they hold the very same cluster and mention objects.
+   */
+  public static class CollectionTextRelationIdentifiedAnnotationPair {
+    private final CollectionTextRelation cluster;
+    private final IdentifiedAnnotation mention;
+
+    public CollectionTextRelationIdentifiedAnnotationPair(CollectionTextRelation cluster, IdentifiedAnnotation mention){
+      this.cluster = cluster;
+      this.mention = mention;
+    }
+
+    public final CollectionTextRelation getCluster(){
+      return this.cluster;
+    }
+
+    public final IdentifiedAnnotation getMention(){
+      return this.mention;
+    }
+
+    /**
+     * Compares by identity of both components; returns {@code false} for
+     * {@code null} and for objects of any other type.
+     */
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (!(obj instanceof CollectionTextRelationIdentifiedAnnotationPair)) {
+        return false;
+      }
+      CollectionTextRelationIdentifiedAnnotationPair other = (CollectionTextRelationIdentifiedAnnotationPair) obj;
+      return (this.cluster == other.cluster &&
+          this.mention == other.mention);
+    }
+
+    /** Combines the component hash codes; a {@code null} component contributes 0. */
+    @Override
+    public int hashCode() {
+      return 31*(cluster==null ? 0 : cluster.hashCode()) + (mention==null ? 0 : mention.hashCode());
+    }
+  }
+
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java?rev=1666501&r1=1666500&r2=1666501&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java Fri Mar 13 16:23:14 2015
@@ -3,13 +3,8 @@ package org.apache.ctakes.coreference.ae
 import java.io.File;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
-import java.util.Set;
 
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.ctakes.coreference.ae.features.DistanceFeatureExtractor;
@@ -17,8 +12,6 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor;
 import org.apache.ctakes.coreference.util.CorefConst;
-import org.apache.ctakes.coreference.util.Span;
-import org.apache.ctakes.coreference.util.SpanAlignment;
 import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
@@ -31,16 +24,16 @@ import org.apache.ctakes.typesystem.type
 import org.apache.log4j.Logger;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.jcas.tcas.DocumentAnnotation;
 import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.classifier.CleartkAnnotator;
-import org.cleartk.classifier.DataWriter;
-import org.cleartk.classifier.jar.DefaultDataWriterFactory;
-import org.cleartk.classifier.jar.GenericJarClassifierFactory;
-import org.uimafit.factory.AnalysisEngineFactory;
-import org.uimafit.util.JCasUtil;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
 
 public class NamedEntityCoreferenceResolver extends RelationExtractorAnnotator {
 	
@@ -82,8 +75,8 @@ public class NamedEntityCoreferenceResol
   }
   
 	@Override
-	protected List<RelationFeaturesExtractor> getFeatureExtractors() {
-		List<RelationFeaturesExtractor> extractors = new ArrayList<RelationFeaturesExtractor>();
+	protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
+		List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> extractors = new ArrayList<>();
 		
 		extractors.add(new DistanceFeatureExtractor());
 		extractors.add(new StringMatchingFeatureExtractor());



Mime
View raw message