Reply-To: dev@ctakes.apache.org
Delivered-To: mailing list commits@ctakes.apache.org
Subject: svn commit: r1748736 [2/5] - in /ctakes/trunk/ctakes-coreference: ./ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/ae/features/ src/main/java/org/apache/ctakes/coreference/ae/features/cluster/ src/main/java...
Date: Thu, 16 Jun 2016 14:51:51 -0000
To: commits@ctakes.apache.org
From: tmill@apache.org
X-Mailer: svnmailer-1.0.9
Message-Id: <20160616145152.79E0F3A06E1@svn01-us-west.apache.org>
archived-at: Thu, 16 Jun 2016 14:52:12 -0000

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,525 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAttributeFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
+import org.apache.ctakes.coreference.ae.pairing.cluster.ClusterMentionPairer_ImplBase;
+import org.apache.ctakes.coreference.ae.pairing.cluster.ClusterPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.HeadwordPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.SectionHeaderPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.SentenceDistancePairer;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import 
org.apache.uima.jcas.cas.NonEmptyFSList; +import org.apache.uima.resource.ResourceInitializationException; +import org.cleartk.ml.CleartkAnnotator; +import org.cleartk.ml.CleartkProcessingException; +import org.cleartk.ml.DataWriter; +import org.cleartk.ml.Feature; +import org.cleartk.ml.Instance; +import org.cleartk.ml.feature.extractor.FeatureExtractor1; +import org.cleartk.ml.jar.DefaultDataWriterFactory; +import org.cleartk.ml.jar.DirectoryDataWriterFactory; +import org.cleartk.ml.jar.GenericJarClassifierFactory; +import org.cleartk.util.ViewUriUtil; + +public class MentionClusterCoreferenceAnnotator extends CleartkAnnotator { + public static final String NO_RELATION_CATEGORY = "-NONE-"; + public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = + "ProbabilityOfKeepingANegativeExample"; + @ConfigurationParameter( + name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE, + mandatory = false, + description = "probability that a negative example should be retained for training") + protected double probabilityOfKeepingANegativeExample = 0.5; + + public static final String PARAM_USE_EXISTING_ENCODERS="UseExistingEncoders"; + @ConfigurationParameter(name = PARAM_USE_EXISTING_ENCODERS, + mandatory=false, + description = "Whether to use encoders in output directory during data writing; if we are making multiple calls") + private boolean useExistingEncoders=false; + + protected Random coin = new Random(0); + + boolean greedyFirst = true; + + private static DataWriter classDataWriter = null; + + public static AnalysisEngineDescription createDataWriterDescription( + Class> dataWriterClass, + File outputDirectory, + float downsamplingRate) throws ResourceInitializationException { + return AnalysisEngineFactory.createEngineDescription( + MentionClusterCoreferenceAnnotator.class, + CleartkAnnotator.PARAM_IS_TRAINING, + true, + MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE, + downsamplingRate, + DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, + dataWriterClass, + DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, + outputDirectory); + } + + public static AnalysisEngineDescription createAnnotatorDescription( + String modelPath) throws ResourceInitializationException { + return AnalysisEngineFactory.createEngineDescription( + MentionClusterCoreferenceAnnotator.class, + CleartkAnnotator.PARAM_IS_TRAINING, + false, + GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, + modelPath); + } + + private List> relationExtractors = this.getFeatureExtractors(); + private List> mentionExtractors = this.getMentionExtractors(); + private List pairExtractors = this.getPairExtractors(); + +// private Set markableStrings = null; + + protected List> getFeatureExtractors() { + List> extractors = new ArrayList<>(); + extractors.add(new MentionClusterAgreementFeaturesExtractor()); + extractors.add(new MentionClusterStringFeaturesExtractor()); + extractors.add(new MentionClusterSectionFeaturesExtractor()); + extractors.add(new MentionClusterUMLSFeatureExtractor()); + extractors.add(new MentionClusterDepHeadExtractor()); + extractors.add(new MentionClusterStackFeaturesExtractor()); + extractors.add(new MentionClusterSalienceFeaturesExtractor()); + extractors.add(new MentionClusterAttributeFeaturesExtractor()); +// extractors.add(new MentionClusterAttributeVectorExtractor()); // does nothing yet + +// extractors.add(new MentionClusterDistanceFeaturesExtractor()); + + try { +// extractors.add(new 
MentionClusterDistSemExtractor("org/apache/ctakes/coreference/distsem/mimic_vectors.txt")); +// extractors.add(new MentionClusterDistSemExtractor("org/apache/ctakes/coreference/distsem/deps.words")); + extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor()); + } catch (IOException e) { + e.printStackTrace(); + } + + return extractors; + } + + protected List> getMentionExtractors(){ + List> extractors = new ArrayList<>(); + // mention features from pairwise system: + extractors.add(new MentionClusterAgreementFeaturesExtractor()); + extractors.add(new MentionClusterSectionFeaturesExtractor()); + extractors.add(new MentionClusterUMLSFeatureExtractor()); + extractors.add(new MentionClusterDepHeadExtractor()); + extractors.add(new MentionClusterSalienceFeaturesExtractor()); + +// try{ +// extractors.add(new MentionClusterMentionFeaturesExtractor("org/apache/ctakes/coreference/distsem/ties1mil.lowercase.txt")); +// }catch(CleartkExtractorException e){ +// e.printStackTrace(); +// } + extractors.add(new MentionClusterAttributeFeaturesExtractor()); + + return extractors; + } + + protected List getPairExtractors(){ + List pairers = new ArrayList<>(); + int sentDist = 5; + pairers.add(new SentenceDistancePairer(sentDist)); + pairers.add(new SectionHeaderPairer(sentDist)); + pairers.add(new ClusterPairer(Integer.MAX_VALUE)); + pairers.add(new HeadwordPairer()); + return pairers; + } + + protected Iterable getCandidateRelationArgumentPairs( + JCas jcas, + Markable mention){ + LinkedHashSet pairs = new LinkedHashSet<>(); + for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){ + pairs.addAll(pairer.getPairs(jcas, mention)); + } + + return pairs; + } + + private void resetPairers(JCas jcas){ + for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){ + pairer.reset(jcas); + } + } + + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + + if(this.useExistingEncoders && classDataWriter != null){ + this.dataWriter = classDataWriter; + }else if(this.isTraining()){ + classDataWriter = this.dataWriter; + } + } + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + // lookup from pair of annotations to binary text relation + // note: assumes that there will be at most one relation per pair + this.resetPairers(jCas); + + Map relationLookup; + relationLookup = new HashMap<>(); + if (this.isTraining()) { + for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) { + for(IdentifiedAnnotation mention : JCasUtil.select(cluster.getMembers(), Markable.class)){ + CollectionTextRelationIdentifiedAnnotationRelation relation = + new CollectionTextRelationIdentifiedAnnotationRelation(jCas); + relation.setCluster(cluster); + relation.setMention(mention); + relation.setCategory("CoreferenceClusterMember"); + relation.addToIndexes(); + // The key is a list of args so we can do bi-directional lookup + CollectionTextRelationIdentifiedAnnotationPair key = new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention); + if(relationLookup.containsKey(key)){ + String cat = relationLookup.get(key).getCategory(); + System.err.println("Error in: "+ ViewUriUtil.getURI(jCas).toString()); + System.err.println("Error! 
This attempted relation " + relation.getCategory() + " already has a relation " + cat + " at this span: " + mention.getCoveredText()); + } + relationLookup.put(key, relation); + } + } + } + + + for(Segment segment : JCasUtil.select(jCas, Segment.class)){ + for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){ +// ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention); + boolean singleton = true; + double maxScore = 0.0; + CollectionTextRelation maxCluster = null; + + for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){ + CollectionTextRelation cluster = pair.getCluster(); + // apply all the feature extractors to extract the list of features + List features = new ArrayList<>(); + for (RelationFeaturesExtractor extractor : this.relationExtractors) { + List feats = extractor.extract(jCas, cluster, mention); + if (feats != null){ +// Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName())); + features.addAll(feats); + } + } + + for(FeatureExtractor1 extractor : this.mentionExtractors){ + features.addAll(extractor.extract(jCas, mention)); + } + + // here is where feature conjunctions can go (dupFeatures) + List dupFeatures = new ArrayList<>(); + // sanity check on feature values + for (Feature feature : features) { + if (feature.getValue() == null) { + feature.setValue("NULL"); + String message = String.format("Null value found in %s from %s", feature, features); + System.err.println(message); + // throw new IllegalArgumentException(String.format(message, feature, features)); + }else{ +// String prefix = null; + // Durret and Klein style feature conjunctions: pronoun type or pos tag. maybe try umls semantic-type? + /* + if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){ + prefix = "PRO_"+mentionText; + }else if(headNode != null && headNode.getPostag() != null){ + prefix = headNode.getPostag(); + }else{ + prefix = "UNK"; + } + */ + // headword-based feature conjunctions +/* if(headNode != null && headNode.getCoveredText() != null && headMatches(headNode.getCoveredText().toLowerCase(), features)){ + prefix = "HEAD_MATCH"; + }else{ + prefix = "NO_HEAD_MATCH"; + } +*/ + + // UMLS semantic type feature conjunctions + /* + for(Feature feat : features){ + if(feat.getName().startsWith("ClusterSemType")){ + dupFeatures.add(new Feature(feat.getName()+"_"+feature.getName(), feature.getValue())); + } + } + */ + +// if(prefix != null){ +// dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue())); +// } + } + } + + features.addAll(dupFeatures); + + // during training, feed the features to the data writer + if (this.isTraining()) { + String category = this.getRelationCategory(relationLookup, cluster, mention); + if (category == null) { + continue; + } + + // create a classification instance and write it to the training data + this.dataWriter.write(new Instance<>(category, features)); + if(!category.equals(NO_RELATION_CATEGORY)){ + singleton = false; + break; + } + } + + // during classification feed the features to the classifier and create + // annotations + else { + String predictedCategory = this.classify(features); + // TODO look at scores in classifier and try best-pair rather than first-pair? 
+ Map scores = this.classifier.score(features); + + // add a relation annotation if a true relation was predicted + if (!predictedCategory.equals(NO_RELATION_CATEGORY)) { +// Logger.getLogger("MCAnnotator").info(String.format("Making a pair with score %f", scores.get(predictedCategory))); + if(greedyFirst){ + createRelation(jCas, cluster, mention, predictedCategory, scores.get(predictedCategory)); + singleton = false; + // break here for "closest-first" greedy decoding strategy (Soon et al., 2001), terminology from Lasalle and Denis (2013), + // for "best first" need to keep track of all relations with scores and only keep the highest + break; + } + if(scores.get(predictedCategory) > maxScore){ + maxScore = scores.get(predictedCategory); + maxCluster = cluster; + } + } + } + } + if(!this.isTraining() && !greedyFirst && maxCluster != null){ + // make a link with the max cluster + createRelation(jCas, maxCluster, mention, "CoreferenceClusterMember", maxScore); + } + + // if we got this far and never matched up the markable then add it to list. + // do this even during training -- adds non-chain markables to antecedent list which will be seen during testing. + if(singleton){ + // make the markable it's own cluster: + CollectionTextRelation chain = new CollectionTextRelation(jCas); + NonEmptyFSList list = new NonEmptyFSList(jCas); + list.setHead(mention); + list.setTail(new EmptyFSList(jCas)); + chain.setMembers(list); + chain.addToIndexes(); + list.addToIndexes(); + list.getTail().addToIndexes(); + } + } + } + + removeSingletonClusters(jCas); + } + + + /** + * Looks up the arguments in the specified lookup table and converts the + * relation into a label for classification + * + * @return If this category should not be processed for training return + * null otherwise it returns the label sent to the datawriter + */ + protected String getRelationCategory( + Map relationLookup, + CollectionTextRelation cluster, + IdentifiedAnnotation mention) { + CollectionTextRelationIdentifiedAnnotationRelation relation = + relationLookup.get(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention)); + String category; + if (relation != null) { + category = relation.getCategory(); + } else if (coin.nextDouble() <= this.probabilityOfKeepingANegativeExample) { + category = NO_RELATION_CATEGORY; + } else { + category = null; + } + return category; + } + + /** + * Predict an outcome given a set of features. By default, this simply + * delegates to the object's classifier. Subclasses may override + * this method to implement more complex classification procedures. + * + * @param features + * The features to be classified. + * @return The predicted outcome (label) for the features. + */ + protected String classify(List features) throws CleartkProcessingException { + return this.classifier.classify(features); + } + + /** + * Create a UIMA relation type based on arguments and the relation label. This + * allows subclasses to create/define their own types: e.g. 
coreference can + * create CoreferenceRelation instead of BinaryTextRelation + * + * @param jCas + * - JCas object, needed to create new UIMA types + * @param arg1 + * - First argument to relation + * @param arg2 + * - Second argument to relation + * @param predictedCategory + * - Name of relation + */ + protected void createRelation( + JCas jCas, + CollectionTextRelation cluster, + IdentifiedAnnotation mention, + String predictedCategory, + Double confidence) { + // add the relation to the CAS + CollectionTextRelationIdentifiedAnnotationRelation relation = new CollectionTextRelationIdentifiedAnnotationRelation(jCas); + relation.setCluster(cluster); + relation.setMention(mention); + relation.setCategory(predictedCategory); + relation.setConfidence(confidence); + relation.addToIndexes(); + +// RelationArgument arg = new RelationArgument(jCas); +// arg.setArgument(mention); + ListFactory.append(jCas, cluster.getMembers(), mention); + } + + + private static void removeSingletonClusters(JCas jcas){ + List toRemove = new ArrayList<>(); + for(CollectionTextRelation rel : JCasUtil.select(jcas, CollectionTextRelation.class)){ + NonEmptyFSList head = (NonEmptyFSList) rel.getMembers(); + if(head.getTail() instanceof EmptyFSList){ + toRemove.add(rel); + } + } + + for(CollectionTextRelation rel : toRemove){ + rel.removeFromIndexes(); + } + } + +// private static final boolean dominates(Annotation arg1, Annotation arg2) { +// return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd()); +// } + + /* + public Set getBestEnt(JCas jcas, CollectionTextRelation cluster){ + Set semTypes = new HashSet<>(); + for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){ + semTypes.addAll(getBestEnt(jcas, member)); + } + return semTypes; + } + + public Set getBestEnt(JCas jcas, Markable markable){ + Set bestEnts = new HashSet<>(); + IdentifiedAnnotation bestEnt = null; + Set otherBestEnts = new HashSet<>(); + ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable); + Collection coveringEnts = nodeEntMap.get(head); + for(IdentifiedAnnotation ent : coveringEnts){ + if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities. + ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent); + if(entHead == head){ + if(bestEnt == null){ + bestEnt = ent; + }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){ + // if the span of this entity is bigger than the biggest existing one: + bestEnt = ent; + otherBestEnts = new HashSet<>(); + }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){ + // there is another one with the exact same span and possibly different type! 
+ otherBestEnts.add(ent); + } + } + } + + if(bestEnt!=null){ + bestEnts.add(bestEnt.getClass().getSimpleName()); + for(IdentifiedAnnotation other : otherBestEnts){ + bestEnts.add(other.getClass().getSimpleName()); + } + } + return bestEnts; + } + */ + + public Map getMarkablePairScores(JCas jCas){ + Map scoreMap = new HashMap<>(); + for(CoreferenceRelation reln : JCasUtil.select(jCas, CoreferenceRelation.class)){ + HashableArguments pair = new HashableArguments(reln.getArg1().getArgument(), reln.getArg2().getArgument()); + scoreMap.put(pair, reln.getConfidence()); + } + return scoreMap; + } + + public static class CollectionTextRelationIdentifiedAnnotationPair { + private final CollectionTextRelation cluster; + private final IdentifiedAnnotation mention; + + public CollectionTextRelationIdentifiedAnnotationPair(CollectionTextRelation cluster, IdentifiedAnnotation mention){ + this.cluster = cluster; + this.mention = mention; + } + + public final CollectionTextRelation getCluster(){ + return this.cluster; + } + + public final IdentifiedAnnotation getMention(){ + return this.mention; + } + + @Override + public boolean equals(Object obj) { + CollectionTextRelationIdentifiedAnnotationPair other = (CollectionTextRelationIdentifiedAnnotationPair) obj; + return (this.cluster == other.cluster && + this.mention == other.mention); + } + + @Override + public int hashCode() { + return 31*cluster.hashCode() + (mention==null ? 0 : mention.hashCode()); + } + } + +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,688 @@ +package org.apache.ctakes.coreference.ae; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +import org.apache.ctakes.core.util.ListFactory; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAttributeFeaturesExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterMentionFeaturesExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor; +import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor; +import 
org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor; +import org.apache.ctakes.coreference.util.ClusterUtils; +import org.apache.ctakes.dependency.parser.util.DependencyUtility; +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments; +import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation; +import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation; +import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation; +import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode; +import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textsem.Markable; +import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention; +import org.apache.ctakes.typesystem.type.textspan.Paragraph; +import org.apache.ctakes.typesystem.type.textspan.Segment; +import org.apache.ctakes.typesystem.type.textspan.Sentence; +import org.apache.log4j.Logger; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.EmptyFSList; +import org.apache.uima.jcas.cas.NonEmptyFSList; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.cleartk.ml.CleartkAnnotator; +import org.cleartk.ml.CleartkProcessingException; +import org.cleartk.ml.DataWriter; +import org.cleartk.ml.Feature; +import org.cleartk.ml.feature.extractor.CleartkExtractorException; +import org.cleartk.ml.feature.extractor.FeatureExtractor1; +import org.cleartk.ml.jar.DefaultDataWriterFactory; +import org.cleartk.ml.jar.DirectoryDataWriterFactory; +import org.cleartk.ml.jar.GenericJarClassifierFactory; +import org.cleartk.ml.svmlight.rank.QidInstance; +import org.cleartk.util.ViewUriUtil; + +public class MentionClusterRankingCoreferenceAnnotator extends CleartkAnnotator { + public static final String NO_RELATION_CATEGORY = "-NONE-"; + public static final String CLUSTER_RELATION_CATEGORY = "CoreferenceClusterMember"; + + public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = + "ProbabilityOfKeepingANegativeExample"; + @ConfigurationParameter( + name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE, + mandatory = false, + description = "probability that a negative example should be retained for training") + protected double probabilityOfKeepingANegativeExample = 0.5; + + protected Random coin = new Random(0); + + boolean greedyFirst = true; + + private int qid = 0; + + public static AnalysisEngineDescription createDataWriterDescription( + Class> dataWriterClass, + File outputDirectory, + float downsamplingRate) throws ResourceInitializationException { + return AnalysisEngineFactory.createEngineDescription( + MentionClusterRankingCoreferenceAnnotator.class, + CleartkAnnotator.PARAM_IS_TRAINING, + true, + MentionClusterRankingCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE, + downsamplingRate, + DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, + dataWriterClass, + 
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, + outputDirectory); + } + + public static AnalysisEngineDescription createAnnotatorDescription( + String modelPath) throws ResourceInitializationException { + return AnalysisEngineFactory.createEngineDescription( + MentionClusterRankingCoreferenceAnnotator.class, + CleartkAnnotator.PARAM_IS_TRAINING, + false, + GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, + modelPath); + } + + private List> relationExtractors = this.getFeatureExtractors(); + private List> mentionExtractors = this.getMentionExtractors(); + + private Set markableStrings = null; + private Map> nodeEntMap = null; + private Map> headWordMarkables = null; + private Map pairScores = null; + + protected List> getFeatureExtractors() { + List> extractors = new ArrayList<>(); + extractors.add(new MentionClusterAgreementFeaturesExtractor()); + extractors.add(new MentionClusterStringFeaturesExtractor()); + extractors.add(new MentionClusterSectionFeaturesExtractor()); + extractors.add(new MentionClusterUMLSFeatureExtractor()); + extractors.add(new MentionClusterDepHeadExtractor()); + extractors.add(new MentionClusterStackFeaturesExtractor()); + extractors.add(new MentionClusterSalienceFeaturesExtractor()); +// extractors.add(new MentionClusterDistanceFeaturesExtractor()); + extractors.add(new MentionClusterAttributeFeaturesExtractor()); + + try { + extractors.add(new MentionClusterDistSemExtractor()); + extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor()); + } catch (IOException e) { + e.printStackTrace(); + } + + return extractors; + } + + protected List> getMentionExtractors(){ + List> extractors = new ArrayList<>(); + // mention features from pairwise system: + extractors.add(new MentionClusterAgreementFeaturesExtractor()); + extractors.add(new MentionClusterSectionFeaturesExtractor()); + extractors.add(new MentionClusterUMLSFeatureExtractor()); + extractors.add(new MentionClusterDepHeadExtractor()); + extractors.add(new MentionClusterSalienceFeaturesExtractor()); + + try { + extractors.add(new MentionClusterMentionFeaturesExtractor()); + } catch (CleartkExtractorException e) { + e.printStackTrace(); + } + extractors.add(new MentionClusterAttributeFeaturesExtractor()); + + return extractors; + } + + protected Iterable getCandidateRelationArgumentPairs( + JCas jcas, + IdentifiedAnnotation mention){ + int sentDist = 5; + // using linked hash set ensures no duplicates: + LinkedHashSet pairs = new LinkedHashSet<>(); + pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist)); + pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist)); + pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE)); + pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist)); + + return pairs; + } + + /* + * getExactStringMatchPairs() + * For mentions that have the exact string repeated elsewhere in the document we want to + * allow matching across any distance. We don't use the sentence distance parameter here. + * We make use of a global variable markableStrings that is a HashSet containig all the markable + * strings from this document. 
+ */ + private List getExactStringMatchPairs( + JCas jcas, IdentifiedAnnotation mention, int sentDist) { + List pairs = new ArrayList<>(); + + if(markableStrings.contains(mention.getCoveredText().toLowerCase())){ + for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){ + Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention); + if(mostRecent == null) continue; + + for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){ + if(m == mostRecent) break; + // see if any of the members of the cluster have the exact same string as this + if(m.getCoveredText().toLowerCase().equals(mention.getCoveredText().toLowerCase())){ + pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention)); + break; + } + } + } + } + return pairs; + } + + /* + * getClusterPairs() + * In this method we allow to link to clusters containing more than one mention even if they + * are beyond a sentence distance. First we check whether the most recent mention in the cluster + * is within the specified sentence distance (presumably longer than the sentence distance passed into + * the method that constrains by distance). The wrinkle is that during training many clusters will have multiple + * members but only one before the focus mention. So we need to count the members of a cluster until we + * get to the most recent one in the cluster. If that value is > 1 then we allow the pairing. + */ + private List getClusterPairs( + JCas jcas, IdentifiedAnnotation mention, int sentDist) { + List pairs = new ArrayList<>(); + for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){ + NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers()); + Annotation first = (Annotation) members.getHead(); + if(first == null || mention.getBegin() <= first.getEnd()){ + continue; + } + + IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention); + if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){ + continue; + } + int numMembers=0; + for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){ + numMembers++; + if(m == mostRecent) break; + } + if(numMembers > 1){ + pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention)); + } + } + + return pairs; + } + + /* + * Here we want to add only things that are nearby. First we check the semantic types + * of the cluster we're comparing against. If any member is an Anatomical Site or Medication, + * we add the cluster no matter what. Otherwise we check how many sentences are in between + * the mention and the latest element of the cluster. 
+ */ + protected List getSentenceDistancePairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){ + List pairs = new ArrayList<>(); + Set bestAnaTypes = getBestEnt(jcas, (Markable) mention); + + for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){ + NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers()); + Annotation first = (Annotation) members.getHead(); + if(first == null || mention.getBegin() <= first.getEnd()) continue; + + // check for distance if they are not anatomical site or medication + if(!(bestAnaTypes.contains(AnatomicalSiteMention.class.getSimpleName()) || + bestAnaTypes.contains(MedicationEventMention.class.getSimpleName()))){ + + IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention); + if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist) continue; + } + + // check for types of cluster + Set bestClusterTypes = getBestEnt(jcas, cluster); + if(bestAnaTypes.size() > 0 && bestClusterTypes.size() > 0){ + boolean overlap = false; + for(String semType : bestAnaTypes){ + if(bestClusterTypes.contains(semType)){ + overlap = true; + } + } + // they both correspond to named entities but no overlap in which category of named entity. + if(!overlap){ + continue; + } + } + pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention)); + } + return pairs; + } + + /* + * getSectionHeaderPairs() + * Here we want to add clusters where one of the members is on a line all by itself (a section header) + * To do this we leverage the annotatino of Paragraphs, roughly the areas between newlines. If such a + * span only contains one sentence then we consider it a "header" (or also as important a list item). + * If it is a header we add it. Here we use sentDist to not bother adding things that will be added by + * the "sentence distance" method. 
+ */ + protected List getSectionHeaderPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){ + List pairs = new ArrayList<>(); + for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){ + NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers()); + Annotation first = (Annotation) members.getHead(); + if(first == null || mention.getBegin() <= first.getEnd()){ + continue; + } + + // first check if it is sentence distance range -- if so we can ignore because it will be include by other pair generator + IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention); + if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) <= sentDist){ + continue; + } + + // now check if any of the mentions are in a section header + List pars = JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin()); + for(int j = 0; j < pars.size(); j++){ + boolean match = false; + Paragraph par = pars.get(j); // pars.get(pars.size()-j-1); + List coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par); + if(coveredSents != null && coveredSents.size() == 1){ + // this is sentences that are the same span as paragraphs -- how we model section headers + // see if any of the cluster mentions are in the section header + for(Markable m : JCasUtil.select(members, Markable.class)){ + if(dominates(par, m)){ + pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention)); + match = true; + break; + } + } + } + if(match) break; + } + } + return pairs; + } + + protected List getHeadwordMatchPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){ + List pairs = new ArrayList<>(); + + ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention); + if(headNode == null){ + Logger.getLogger(MentionClusterRankingCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it."); + return pairs; + } + String head = headNode.getCoveredText().toLowerCase(); + if(headWordMarkables.containsKey(head)){ + Set headSet = headWordMarkables.get(head); + for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){ + Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention); + if(mostRecent == null) continue; + for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){ + if(headSet.contains(mostRecent)){ + pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention)); + break; + } + if(m == mostRecent) break; + } + } + } + + return pairs; + } + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + // lookup from pair of annotations to binary text relation + // note: assumes that there will be at most one relation per pair + markableStrings = new HashSet<>(); + nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class); + headWordMarkables = new HashMap<>(); +// pairScores = getMarkablePairScores(jCas); + + Map relationLookup; + relationLookup = new HashMap<>(); + if (this.isTraining()) { + for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) { + for(IdentifiedAnnotation mention : JCasUtil.select(cluster.getMembers(), Markable.class)){ + CollectionTextRelationIdentifiedAnnotationRelation relation = + new CollectionTextRelationIdentifiedAnnotationRelation(jCas); + relation.setCluster(cluster); + relation.setMention(mention); + 
relation.setCategory("CoreferenceClusterMember"); + relation.addToIndexes(); + // The key is a list of args so we can do bi-directional lookup + CollectionTextRelationIdentifiedAnnotationPair key = new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention); + if(relationLookup.containsKey(key)){ + String cat = relationLookup.get(key).getCategory(); + System.err.println("Error in: "+ ViewUriUtil.getURI(jCas).toString()); + System.err.println("Error! This attempted relation " + relation.getCategory() + " already has a relation " + cat + " at this span: " + mention.getCoveredText()); + } + relationLookup.put(key, relation); + } + } + } + + + for(Segment segment : JCasUtil.select(jCas, Segment.class)){ + for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){ + ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention); + String mentionText = mention.getCoveredText().toLowerCase(); + boolean singleton = true; + double maxScore = Double.NEGATIVE_INFINITY; + CollectionTextRelation maxCluster = null; + List mentionFeatures = new ArrayList<>(); + for(FeatureExtractor1 extractor : this.mentionExtractors){ + mentionFeatures.addAll(extractor.extract(jCas, mention)); + } + + for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){ + CollectionTextRelation cluster = pair.getCluster(); + // apply all the feature extractors to extract the list of features + List features = new ArrayList<>(); + features.addAll(mentionFeatures); + + for (RelationFeaturesExtractor extractor : this.relationExtractors) { + List feats = extractor.extract(jCas, cluster, mention); + if (feats != null){ +// Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName())); + features.addAll(feats); + } + } + + + // here is where feature conjunctions can go (dupFeatures) + List dupFeatures = new ArrayList<>(); + // sanity check on feature values + for (Feature feature : features) { + if (feature.getValue() == null) { + feature.setValue("NULL"); + String message = String.format("Null value found in %s from %s", feature, features); + System.err.println(message); + // throw new IllegalArgumentException(String.format(message, feature, features)); + }else{ + String prefix = null; +// if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){ +// prefix = "PRO_"+mentionText; +// }else if(headNode != null && headNode.getPostag() != null){ +// prefix = headNode.getPostag(); +// }else{ +// prefix = "UNK"; +// } + if(prefix != null){ + dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue())); + } + } + } + features.addAll(dupFeatures); + + // during training, feed the features to the data writer + // create a classification instance and write it to the training data + + if (this.isTraining()) { + String category = this.getRelationCategory(relationLookup, cluster, mention); + if (category == null) { + continue; + } + double outVal = 1.0; + if(category.equals(NO_RELATION_CATEGORY)){ + outVal = 0.0; + } + + QidInstance inst = new QidInstance<>(); + inst.setQid(String.valueOf(qid)); + inst.addAll(features); + inst.setOutcome(outVal); + this.dataWriter.write(inst); + if(!category.equals(NO_RELATION_CATEGORY)){ + singleton = false; + break; + } + } + + // during classification feed the features to the classifier and create + // annotations + else 
{ + Double prediction = this.classify(features); + if(prediction > maxScore){ + maxScore = prediction; + maxCluster = cluster; + } + } + } + + markableStrings.add(mention.getCoveredText().toLowerCase()); + + if(headNode != null){ + String head = headNode.getCoveredText().toLowerCase(); + if(!headWordMarkables.containsKey(head)){ + headWordMarkables.put(head, new HashSet()); + } + headWordMarkables.get(head).add(mention); + } + + if(this.isTraining()){ + // write a dummy link with only mention features: + QidInstance inst = new QidInstance<>(); + inst.setQid(String.valueOf(qid)); + for(Feature feat : mentionFeatures){ + if(feat.getName() != null){ + feat.setName("DUMMYLINK_" + feat.getName()); + } + } + inst.addAll(mentionFeatures); + if(singleton){ + inst.setOutcome(1.0); + }else{ + inst.setOutcome(0.0); + } + this.dataWriter.write(inst); + }else{ + Double nullPrediction = this.classify(mentionFeatures); + if(nullPrediction > maxScore){ + // make the markable it's own cluster: + CollectionTextRelation chain = new CollectionTextRelation(jCas); + NonEmptyFSList list = new NonEmptyFSList(jCas); + list.setHead(mention); + list.setTail(new EmptyFSList(jCas)); + chain.setMembers(list); + chain.addToIndexes(); + list.addToIndexes(); + list.getTail().addToIndexes(); + }else{ + createRelation(jCas, maxCluster, mention, CLUSTER_RELATION_CATEGORY); + } + } + qid++; + } + } + + removeSingletonClusters(jCas); + } + + /** + * Looks up the arguments in the specified lookup table and converts the + * relation into a label for classification + * + * @return If this category should not be processed for training return + * null otherwise it returns the label sent to the datawriter + */ + protected String getRelationCategory( + Map relationLookup, + CollectionTextRelation cluster, + IdentifiedAnnotation mention) { + CollectionTextRelationIdentifiedAnnotationRelation relation = + relationLookup.get(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention)); + String category; + if (relation != null) { + category = relation.getCategory(); + } else if (coin.nextDouble() <= this.probabilityOfKeepingANegativeExample) { + category = NO_RELATION_CATEGORY; + } else { + category = null; + } + return category; + } + + /** + * Predict an outcome given a set of features. By default, this simply + * delegates to the object's classifier. Subclasses may override + * this method to implement more complex classification procedures. + * + * @param features + * The features to be classified. + * @return The predicted outcome (label) for the features. + */ + protected Double classify(List features) throws CleartkProcessingException { + return this.classifier.classify(features); + } + + /** + * Create a UIMA relation type based on arguments and the relation label. This + * allows subclasses to create/define their own types: e.g. 
coreference can + * create CoreferenceRelation instead of BinaryTextRelation + * + * @param jCas + * - JCas object, needed to create new UIMA types + * @param arg1 + * - First argument to relation + * @param arg2 + * - Second argument to relation + * @param predictedCategory + * - Name of relation + */ + protected void createRelation( + JCas jCas, + CollectionTextRelation cluster, + IdentifiedAnnotation mention, + String predictedCategory) { + // add the relation to the CAS + CollectionTextRelationIdentifiedAnnotationRelation relation = new CollectionTextRelationIdentifiedAnnotationRelation(jCas); + relation.setCluster(cluster); + relation.setMention(mention); + relation.setCategory(predictedCategory); + relation.addToIndexes(); + +// RelationArgument arg = new RelationArgument(jCas); +// arg.setArgument(mention); + ListFactory.append(jCas, cluster.getMembers(), mention); + } + + + private void removeSingletonClusters(JCas jcas){ + List toRemove = new ArrayList<>(); + for(CollectionTextRelation rel : JCasUtil.select(jcas, CollectionTextRelation.class)){ + NonEmptyFSList head = (NonEmptyFSList) rel.getMembers(); + if(head.getTail() instanceof EmptyFSList){ + toRemove.add(rel); + } + } + + for(CollectionTextRelation rel : toRemove){ + rel.removeFromIndexes(); + } + } + + private static final boolean dominates(Annotation arg1, Annotation arg2) { + return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd()); + } + + public Set getBestEnt(JCas jcas, CollectionTextRelation cluster){ + Set semTypes = new HashSet<>(); + for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){ + semTypes.addAll(getBestEnt(jcas, member)); + } + return semTypes; + } + + public Set getBestEnt(JCas jcas, Markable markable){ + Set bestEnts = new HashSet<>(); + IdentifiedAnnotation bestEnt = null; + Set otherBestEnts = new HashSet<>(); + ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable); + Collection coveringEnts = nodeEntMap.get(head); + for(IdentifiedAnnotation ent : coveringEnts){ + if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities. + ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent); + if(entHead == head){ + if(bestEnt == null){ + bestEnt = ent; + }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){ + // if the span of this entity is bigger than the biggest existing one: + bestEnt = ent; + otherBestEnts = new HashSet<>(); + }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){ + // there is another one with the exact same span and possibly different type! 
+ otherBestEnts.add(ent); + } + } + } + + if(bestEnt!=null){ + bestEnts.add(bestEnt.getClass().getSimpleName()); + for(IdentifiedAnnotation other : otherBestEnts){ + bestEnts.add(other.getClass().getSimpleName()); + } + } + return bestEnts; + } + + + public Map getMarkablePairScores(JCas jCas){ + Map scoreMap = new HashMap<>(); + for(CoreferenceRelation reln : JCasUtil.select(jCas, CoreferenceRelation.class)){ + HashableArguments pair = new HashableArguments((IdentifiedAnnotation)reln.getArg1().getArgument(), (IdentifiedAnnotation)reln.getArg2().getArgument()); + scoreMap.put(pair, reln.getConfidence()); + } + return scoreMap; + } + + public static class CollectionTextRelationIdentifiedAnnotationPair { + private final CollectionTextRelation cluster; + private final IdentifiedAnnotation mention; + + public CollectionTextRelationIdentifiedAnnotationPair(CollectionTextRelation cluster, IdentifiedAnnotation mention){ + this.cluster = cluster; + this.mention = mention; + } + + public final CollectionTextRelation getCluster(){ + return this.cluster; + } + + public final IdentifiedAnnotation getMention(){ + return this.mention; + } + + @Override + public boolean equals(Object obj) { + CollectionTextRelationIdentifiedAnnotationPair other = (CollectionTextRelationIdentifiedAnnotationPair) obj; + return (this.cluster == other.cluster && + this.mention == other.mention); + } + + @Override + public int hashCode() { + return 31*cluster.hashCode() + (mention==null ? 0 : mention.hashCode()); + } + } + +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,177 @@ +package org.apache.ctakes.coreference.ae; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.dependency.parser.util.DependencyUtility; +import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation; +import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode; +import org.apache.ctakes.typesystem.type.syntax.WordToken; +import org.apache.ctakes.typesystem.type.textsem.Markable; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.EmptyFSList; +import org.apache.uima.jcas.cas.FSList; +import org.apache.uima.jcas.cas.NonEmptyFSList; +import org.apache.uima.resource.ResourceInitializationException; + +public class PersonChainAnnotator extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + NonEmptyFSList ptList = new NonEmptyFSList(jcas); + ptList.setHead(null); + NonEmptyFSList weList = new NonEmptyFSList(jcas); + weList.setHead(null); + NonEmptyFSList drList = new NonEmptyFSList(jcas); + drList.setHead(null); + List otherDrs = new ArrayList<>(); + + List words = new 
ArrayList<>(JCasUtil.select(jcas, WordToken.class)); + for(int i = 0; i < words.size(); i++){ + WordToken word = words.get(i); + String text = word.getCoveredText(); + if(word.getPartOfSpeech().startsWith("PRP")){ + if(text.equalsIgnoreCase("I") || text.equalsIgnoreCase("me") || text.equalsIgnoreCase("my")){ + Markable drMention = new Markable(jcas, word.getBegin(), word.getEnd()); + addToList(jcas, drList, drMention); + }else if(text.equalsIgnoreCase("we") || text.equalsIgnoreCase("us") || text.equalsIgnoreCase("our")){ + Markable weMention = new Markable(jcas, word.getBegin(), word.getEnd()); + addToList(jcas, weList, weMention); + }else if(text.equalsIgnoreCase("it")){ + // do nothing + }else{ + Markable ptMention = new Markable(jcas, word.getBegin(), word.getEnd()); + addToList(jcas, ptList, ptMention); + } + }else if(text.equalsIgnoreCase("dr.")){ + Markable drMention = getDoctorMarkable(jcas, word); //new Markable(jcas, word.getBegin(), words.get(i+1).getEnd()); + addToList(jcas, getCorrectDoctor(jcas, drMention, otherDrs), drMention); + }else if(text.equalsIgnoreCase("mrs.") || text.equalsIgnoreCase("mr.") || text.equalsIgnoreCase("ms.")){ + // TODO - smarter logic for Dr. Firstname Lastname + Markable ptMention = new Markable(jcas, word.getBegin(), words.get(i+1).getEnd()); + addToList(jcas, ptList, ptMention); + }else if(text.equalsIgnoreCase("patient") || text.equalsIgnoreCase("pt")){ + Markable ptMention = new Markable(jcas, word.getBegin(), word.getEnd()); + addToList(jcas, ptList, ptMention); + } + } + + for(NonEmptyFSList otherDr : otherDrs){ + if(otherDr.getHead() != null){ + if(otherDr.getTail() != null){ + endList(jcas, otherDr); + CollectionTextRelation drChain = new CollectionTextRelation(jcas); + drChain.setMembers(otherDr); + drChain.addToIndexes(); + } + } + } + + if(drList.getHead() != null && drList.getTail() != null){ + endList(jcas, drList); + CollectionTextRelation drChain = new CollectionTextRelation(jcas); + drChain.setMembers(drList); + drChain.addToIndexes(); + } + if(ptList.getHead() != null && ptList.getTail() != null){ + endList(jcas, ptList); + CollectionTextRelation ptChain = new CollectionTextRelation(jcas); + ptChain.setMembers(ptList); + ptChain.addToIndexes(); + } + if(weList.getHead() != null && weList.getTail() != null){ + endList(jcas, weList); + CollectionTextRelation weChain = new CollectionTextRelation(jcas); + weChain.setMembers(weList); + weChain.addToIndexes(); + } + } + + public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException { + return AnalysisEngineFactory.createEngineDescription(PersonChainAnnotator.class); + } + + private static void addToList(JCas jcas, NonEmptyFSList list, Markable arg){ + arg.addToIndexes(); + if(list.getHead() == null){ + // first list element: + list.setHead(arg); + }else{ + // subsequent list elements: + NonEmptyFSList cur = list; + while(cur.getTail() != null){ + cur = (NonEmptyFSList)cur.getTail(); + } + NonEmptyFSList tail = new NonEmptyFSList(jcas); + tail.setHead(arg); + cur.setTail(tail); + tail.addToIndexes(); + } + } + + private static void endList(JCas jcas, NonEmptyFSList list){ + NonEmptyFSList cur = list; + while(cur.getTail() != null){ + cur = (NonEmptyFSList)cur.getTail(); + } + EmptyFSList tail = new EmptyFSList(jcas); + cur.setTail(tail); + tail.addToIndexes(); + } + + private static NonEmptyFSList getCorrectDoctor(JCas jcas, Markable mention, List drLists){ + NonEmptyFSList correctDr = null; + if(mention.getCoveredText().length() < 5){ + 
if(drLists.size() > 0){ + correctDr = drLists.get(0); + } + }else{ + String nameText = mention.getCoveredText().substring(4); + for(NonEmptyFSList drList : drLists){ + FSList curNode = drList; + do{ + String otherName = ((Markable)((NonEmptyFSList)curNode).getHead()).getCoveredText(); + if(otherName.length() >= 5){ + otherName = otherName.substring(4); + if(otherName.contains(nameText) || nameText.contains(otherName)){ + correctDr = drList; + } + } + curNode = ((NonEmptyFSList)curNode).getTail(); + }while(curNode instanceof NonEmptyFSList); + if(correctDr != null) break; + } + } + if(correctDr == null){ + correctDr = new NonEmptyFSList(jcas); + correctDr.setHead(null); + drLists.add(correctDr); + } + return correctDr; + } + + private static Markable getDoctorMarkable(JCas jcas, WordToken drToken){ + Markable markable = null; + + ConllDependencyNode nnpHead = DependencyUtility.getDependencyNode(jcas, drToken); + try{ + while(nnpHead != null && nnpHead.getHead() != null && nnpHead.getHead().getId() != 0 && nnpHead.getHead().getPostag().equals("NNP")){ + nnpHead = nnpHead.getHead(); + } + }catch(NullPointerException e){ + System.err.print("."); + } + + int start = drToken.getBegin(); + int end = nnpHead.getEnd(); + if(end < start) end = drToken.getEnd(); + + markable = new Markable(jcas, start, end); + return markable; + } +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,65 @@ +package org.apache.ctakes.coreference.ae.features; + +import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isGeneric; +import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isHistory; +import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isNegated; +import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isPatient; +import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isUncertain; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textsem.TimeMention; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.ml.Feature; + +public class AttributeFeatureExtractor implements RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation ante, IdentifiedAnnotation ana) + throws AnalysisEngineProcessException { + List features = new ArrayList<>(); + + boolean anaNegated = isNegated(ana); + features.add(new Feature("MC_ana_NEGATED", anaNegated)); + boolean anaUncertain = isUncertain(ana); + features.add(new Feature("MC_ana_UNCERTAIN", anaUncertain)); + boolean anaGen = isGeneric(ana); + features.add(new 
Feature("MC_ana_GENERIC", anaGen)); + boolean anaSubj = isPatient(ana); + features.add(new Feature("MC_ana_PATIENT", anaSubj)); + boolean anaHist = isHistory(ana); + features.add(new Feature("MC_ana_HISTORY", anaHist)); + boolean anaTimex = isTimex(ana); + features.add(new Feature("MC_ana_TIMEX", anaTimex)); + + boolean anteNegated = isNegated(ante); + features.add(new Feature("MC_ante_NEGATED", anteNegated)); + boolean anteUncertain = isUncertain(ante); + features.add(new Feature("MC_ante_UNCERTAIN", anteUncertain)); + boolean anteGen = isGeneric(ante); + features.add(new Feature("MC_ante_GENERIC", anteGen)); + boolean anteSubj = isPatient(ante); + features.add(new Feature("MC_ante_PATIENT", anteSubj)); + boolean anteHist = isHistory(ante); + features.add(new Feature("MC_ante_HISTORY", anteHist)); + boolean anteTimex = isTimex(ante); + features.add(new Feature("MC_ante_TIMEX", anteTimex)); + + features.add(new Feature("MC_AGREE_NEG", anteNegated == anaNegated)); + features.add(new Feature("MC_AGREE_UNC", anteUncertain == anaUncertain)); + features.add(new Feature("MC_AGREE_TIMEX", anteTimex == anaTimex)); + + return features; + } + + private static boolean isTimex(Annotation a){ + return JCasUtil.selectCovered(TimeMention.class, a).size() > 0; + } + +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,32 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.dependency.parser.util.DependencyUtility; +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.cleartk.ml.Feature; + +public class CorefSyntaxFeatureExtractor implements RelationFeaturesExtractor { + + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList<>(); + + ConllDependencyNode head1 = DependencyUtility.getNominalHeadNode(jCas, arg1); + ConllDependencyNode head2 = DependencyUtility.getNominalHeadNode(jCas, arg2); + + if(head1 != null){ + feats.add(new Feature("Arg1Head", head1.getCoveredText().toLowerCase())); + } + if(head2 != null){ + feats.add(new Feature("Arg2Head", head2.getCoveredText().toLowerCase())); + } + return feats; + } + +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- 
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,106 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.apache.ctakes.core.resource.FileLocator; +import org.apache.ctakes.dependency.parser.util.DependencyUtility; +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.utils.distsem.WordEmbeddings; +import org.apache.ctakes.utils.distsem.WordVector; +import org.apache.ctakes.utils.distsem.WordVectorReader; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.cleartk.ml.Feature; +import org.apache.uima.fit.util.JCasUtil; + +public class DistSemFeatureExtractor implements RelationFeaturesExtractor { + + // default value is 0.5 (rather than 0.0) because we don't want to assume OOV words are dissimilar + public static final double DEFAULT_SIM = 0.5; + + private WordEmbeddings words = null; + + public DistSemFeatureExtractor() throws FileNotFoundException, IOException{ + words = WordVectorReader.getEmbeddings(FileLocator.getAsStream("org/apache/ctakes/coreference/distsem/mimic_vectors.txt")); + } + + @Override + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList<>(); + + double sim = 0.0; +// double[] a1vec = getArgVector(arg1); +// double[] a2vec = getArgVector(arg2); +// +// if(a1vec != null && a2vec != null){ +// for(int i = 0; i < a1vec.length; i++){ +// sim += a1vec[i] * a2vec[i]; +// } +// }else{ +// sim = DEFAULT_SIM; +// } +// +// assert !Double.isNaN(sim); +// +// feats.add(new Feature("ARG_SIMILARITY_WORD2VEC", sim)); + + ConllDependencyNode node1 = DependencyUtility.getNominalHeadNode(jCas, arg1); + ConllDependencyNode node2 = DependencyUtility.getNominalHeadNode(jCas, arg2); + String head1 = node1 != null ? node1.getCoveredText().toLowerCase() : null; + String head2 = node2 != null ? 
node2.getCoveredText().toLowerCase() : null; + if(head1 != null && head2 != null && words.containsKey(head1) && words.containsKey(head2)){ + sim = words.getSimilarity(head1, head2); + }else{ + sim = DEFAULT_SIM; + } + feats.add(new Feature("HEAD_SIMILARITY_WORD2VEC", sim)); + + return feats; + } + + + @SuppressWarnings("unused") + private double[] getArgVector(IdentifiedAnnotation arg){ + double[] vec = null; + + Collection tokens = JCasUtil.selectCovered(BaseToken.class, arg); + + for(BaseToken token : tokens){ + WordVector wv = words.getVector(token.getCoveredText()); + if(wv == null){ + wv = words.getVector(token.getCoveredText().toLowerCase()); + } + if(wv != null){ + if(vec == null){ + vec = new double[wv.size()]; + Arrays.fill(vec, 0.0); + } + for(int i = 0; i < vec.length; i++){ + vec[i] += wv.getValue(i); + } + } + } + + if(vec != null){ + double len = 0.0; + for(int i = 0; i < vec.length; i++){ + len += vec[i]*vec[i]; + } + len = Math.sqrt(len); + assert !Double.isNaN(len); + for(int i = 0; i < vec.length; i++){ + vec[i] /= len; + } + } + return vec; + } +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,29 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.coreference.util.CorefConsts; +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textspan.Sentence; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.cleartk.ml.Feature; +import org.apache.uima.fit.util.JCasUtil; + +public class DistanceFeatureExtractor implements RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList<>(); + feats.add(new Feature("TOK_DIST", + JCasUtil.selectCovered(jCas, BaseToken.class, arg1.getBegin(), arg2.getEnd()).size() / (double)CorefConsts.TOKDIST)); + feats.add(new Feature("SENT_DIST", + JCasUtil.selectCovered(jCas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size() / (double) CorefConsts.NEDIST)); + return feats; + } + +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java (added) +++ 
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,24 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.cleartk.ml.Feature; + +public class SalienceFeatureExtractor implements RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation ante, IdentifiedAnnotation ana) + throws AnalysisEngineProcessException { + List feats = new ArrayList<>(); + + feats.add(new Feature("MP_ANTE_SALIENCE", ante.getConfidence())); + feats.add(new Feature("MP_ANA_SALIENCE", ana.getConfidence())); + return feats; + } + +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,56 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textspan.Paragraph; +import org.apache.ctakes.typesystem.type.textspan.Sentence; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.cleartk.ml.Feature; + +public class SectionFeatureExtractor implements RelationFeaturesExtractor { + + public List extract(JCas jcas, IdentifiedAnnotation ante, + IdentifiedAnnotation ana) throws AnalysisEngineProcessException { + List feats = new ArrayList<>(); + boolean anteInHeader = false; + boolean anaInHeader = false; + int antePar = -1; + int anaPar = -1; + + // Find section headers -- paragraphs + List pars = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class)); + for(int i = 0; i < pars.size(); i++){ + Paragraph par = pars.get(i); + if(par.getBegin() > ana.getEnd()){ + break; + } + if(ante.getBegin() >= par.getBegin() && ante.getEnd() <= par.getEnd()){ + antePar = i; + } + if(ana.getBegin() >= par.getBegin() && ana.getEnd() <= par.getEnd()){ + anaPar = i; + } + List coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par); + if(coveredSents != null && coveredSents.size() == 1){ + if(antePar == i){ + anteInHeader = true; + } + if(anaPar == i){ + anaInHeader = true; + } + } + } + + feats.add(new Feature("AnteInHeader", anteInHeader)); + feats.add(new Feature("AnaInHeader", anaInHeader)); + if(anteInHeader && antePar+1 == anaPar){ + feats.add(new Feature("AnteHeaderHeadsAna", true)); + } + return feats; + } +} Added: 
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,141 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.ml.Feature; + +public class StringMatchingFeatureExtractor implements + RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList<>(); + + // don't extract sim features if one of the markables is a pronoun + if(isPronoun(arg1) || isPronoun(arg2)) return feats; + + String s1 = arg1.getCoveredText(); + String s2 = arg2.getCoveredText(); + Set words1 = contentWords(arg1); + Set words2 = contentWords(arg2); + + feats.add(new Feature("MATCH_EXACT", + s1.equalsIgnoreCase(s2))); + feats.add(new Feature("MATCH_START", + startMatch(s1,s2))); + feats.add(new Feature("MATCH_END", + endMatch(s1,s2))); + feats.add(new Feature("MATCH_SOON", + soonMatch(s1,s2))); + feats.add(new Feature("MATCH_OVERLAP", + wordOverlap(words1, words2))); + feats.add(new Feature("MATCH_SUBSTRING", + wordSubstring(words1, words2))); + return feats; + } + + public static boolean startMatch (String a, String b) { + int ia = a.indexOf(" "); + int ib = b.indexOf(" "); + String aa = a.substring(0, ia==-1?(a.length()>5?5:a.length()):ia); + String bb = b.substring(0, ib==-1?(b.length()>5?5:b.length()):ib); + return aa.equalsIgnoreCase(bb); + } + + public static boolean endMatch (String a, String b) { + int ia = a.lastIndexOf(" "); + int ib = b.lastIndexOf(" "); + String aa = a.substring(ia==-1?(a.length()>5?a.length()-5:0):ia+1); + String bb = b.substring(ib==-1?(b.length()>5?b.length()-5:0):ib+1); + return aa.equalsIgnoreCase(bb); + } + + public static boolean soonMatch (String s1, String s2) { + String sl1 = nonDetSubstr(s1.toLowerCase()); + String sl2 = nonDetSubstr(s2.toLowerCase()); + return sl1.equals(sl2); + } + + public static String nonDetSubstr (String s) { + if(s.startsWith("the ")) return s.substring(4); + if(s.startsWith("a ")) return s.substring(2); + if(s.startsWith("this ")) return s.substring(5); + if(s.startsWith("that ")) return s.substring(5); + if(s.startsWith("these ")) return s.substring(6); + if(s.startsWith("those ")) return s.substring(6); + return s; + } + + public static boolean wordOverlap(Set t1, Set t2) { + for (String s : t2){ + if (t1.contains(s)){ + return true; + } + } + return false; + } + + public 
static boolean wordSubstring(Set t1, Set t2){ + for(String s1 : t1){ + for(String s2 : t2){ + if(s1.contains(s2) || s2.contains(s1)) return true; + } + } + return false; + } + + public static Set contentWords(Annotation a1){ + Set words = new HashSet<>(); + for(BaseToken tok : JCasUtil.selectCovered(BaseToken.class, a1)){ + words.add(tok.getCoveredText().toLowerCase()); + } + return words; + } + + public static boolean isPronoun(IdentifiedAnnotation a1){ + List tokens = JCasUtil.selectCovered(BaseToken.class, a1); + + if(tokens.size() != 1){ + return false; + } + + BaseToken token = tokens.get(0); + if(token.getPartOfSpeech() == null){ + return false; + } + if(token.getPartOfSpeech().startsWith("PRP")) return true; + if(token.getPartOfSpeech().equals("DT")) return true; + + + return false; + } + + public static boolean inQuote(JCas jcas, Annotation a){ + boolean inQuote = false; + String docText = jcas.getDocumentText(); + + // Logic: Find the newline preceding this mention, if there is a quote in between + // the start of the line and the start of the mention then the mention is inside quotes. + // not foolproof but probably pretty accurate. + int lastNewline = docText.lastIndexOf("\n", a.getBegin()); + if(lastNewline != 0){ + int firstQuote = docText.indexOf('"', lastNewline); + if(firstQuote != 0){ + inQuote = true; + } + } + + return inQuote; + } +} Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java?rev=1748736&view=auto ============================================================================== --- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java (added) +++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java Thu Jun 16 14:51:51 2016 @@ -0,0 +1,54 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.dependency.parser.util.DependencyUtility; +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode; +import org.apache.ctakes.typesystem.type.textsem.EventMention; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.cleartk.ml.Feature; + +public class TemporalFeatureExtractor implements RelationFeaturesExtractor { + + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList<>(); + + String a1dtr = getDocTimeRelForArg(jCas, arg1); + String a2dtr = getDocTimeRelForArg(jCas, arg2); + + feats.add(new Feature("Arg1DTR_" + a1dtr, true)); + feats.add(new Feature("Arg2DTR_" + a2dtr, true)); + + if(a1dtr.equals(a2dtr)){ + if(!a1dtr.equals("NA")){ + feats.add(new Feature("DTR_Match", true)); + } + } + + return feats; + } + + private static String getDocTimeRelForArg(JCas jCas, IdentifiedAnnotation arg){ + String dtr = "NA"; + + // find EventMentions and grab their event properties + ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg); + if(node != null){ + List events 
= JCasUtil.selectCovered(jCas, EventMention.class, node); + for(EventMention event : events){ + if(event.getClass().getSimpleName().equals("EventMention")){ + if(event.getEvent() != null && event.getEvent().getProperties() != null && event.getEvent().getProperties().getDocTimeRel() != null){ + dtr = event.getEvent().getProperties().getDocTimeRel(); + } + } + } + } + return dtr; + } +} \ No newline at end of file
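
For readers skimming the new feature extractors above: the string-matching heuristics in StringMatchingFeatureExtractor (startMatch, endMatch, soonMatch) are plain public static methods, so they can be exercised outside a UIMA pipeline. The snippet below is a minimal sketch; the class name StringMatchDemo and the sample mention strings are invented for illustration and are not part of this commit:

    import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;

    public class StringMatchDemo {
        public static void main(String[] args) {
            // Hypothetical antecedent/anaphor strings, for illustration only.
            String ante = "the left breast mass";
            String ana  = "this mass";

            // startMatch/endMatch compare the first/last space-delimited token of
            // each string (or the first/last five characters when the string has
            // no space), ignoring case.
            System.out.println("MATCH_START: " + StringMatchingFeatureExtractor.startMatch(ante, ana)); // false ("the" vs. "this")
            System.out.println("MATCH_END:   " + StringMatchingFeatureExtractor.endMatch(ante, ana));   // true  ("mass" vs. "mass")

            // soonMatch lower-cases both strings, strips a leading determiner
            // (the/a/this/that/these/those), then tests exact equality.
            System.out.println("MATCH_SOON:  " + StringMatchingFeatureExtractor.soonMatch(ante, ana));  // false
        }
    }

Likewise, PersonChainAnnotator exposes createAnnotatorDescription(), so it can be appended to an existing cTAKES aggregate via uimaFIT's AggregateBuilder. A rough sketch, assuming the upstream components (segmentation, tokenization, part-of-speech tagging, dependency parsing) have already been added, since the annotator reads WordToken part-of-speech tags and dependency nodes; the wrapper class name below is hypothetical:

    import org.apache.ctakes.coreference.ae.PersonChainAnnotator;
    import org.apache.uima.analysis_engine.AnalysisEngineDescription;
    import org.apache.uima.fit.factory.AggregateBuilder;
    import org.apache.uima.resource.ResourceInitializationException;

    public class PersonChainPipelineSketch {
        public static AnalysisEngineDescription build() throws ResourceInitializationException {
            AggregateBuilder builder = new AggregateBuilder();
            // ... upstream cTAKES annotators (sentence detection, tokenization,
            // POS tagging, dependency parsing) would be added here ...
            builder.add(PersonChainAnnotator.createAnnotatorDescription());
            return builder.createAggregateDescription();
        }
    }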