Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 08254115CA for ; Thu, 9 May 2013 15:00:25 +0000 (UTC) Received: (qmail 66853 invoked by uid 500); 9 May 2013 14:59:00 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 65915 invoked by uid 500); 9 May 2013 14:58:57 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 52490 invoked by uid 99); 9 May 2013 14:48:39 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 09 May 2013 14:48:39 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 09 May 2013 14:48:36 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 19C8523888EA; Thu, 9 May 2013 14:48:16 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1480668 - in /ctakes/sandbox/ctakes-coref-cleartk: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/ctakes/ src/main/java/org/apache/ctakes/coreference/ src/main/java/org/apache/ctakes/... 
Date: Thu, 09 May 2013 14:48:15 -0000 To: commits@ctakes.apache.org From: tmill@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130509144816.19C8523888EA@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: tmill Date: Thu May 9 14:48:14 2013 New Revision: 1480668 URL: http://svn.apache.org/r1480668 Log: First checkin of cleartk-based coreference resolution module. Added: ctakes/sandbox/ctakes-coref-cleartk/.classpath ctakes/sandbox/ctakes-coref-cleartk/.project ctakes/sandbox/ctakes-coref-cleartk/pom.xml ctakes/sandbox/ctakes-coref-cleartk/src/ ctakes/sandbox/ctakes-coref-cleartk/src/main/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/ 
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/SpanOffsetComparator.java Added: ctakes/sandbox/ctakes-coref-cleartk/.classpath URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/.classpath?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/.classpath (added) +++ ctakes/sandbox/ctakes-coref-cleartk/.classpath Thu May 9 14:48:14 2013 @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Added: ctakes/sandbox/ctakes-coref-cleartk/.project URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/.project?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/.project (added) +++ ctakes/sandbox/ctakes-coref-cleartk/.project Thu May 9 14:48:14 2013 @@ -0,0 +1,24 @@ + + + ctakes-coref-cleartk + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.m2e.core.maven2Nature + org.eclipse.jdt.core.javanature + org.apache.uima.pear.UimaNature + + Added: ctakes/sandbox/ctakes-coref-cleartk/pom.xml URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/pom.xml?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/pom.xml (added) +++ 
ctakes/sandbox/ctakes-coref-cleartk/pom.xml Thu May 9 14:48:14 2013 @@ -0,0 +1,34 @@ + + 4.0.0 + new-coref + + org.apache.ctakes + ctakes + 3.1.0-SNAPSHOT + + + + org.apache.ctakes + ctakes-type-system + + + org.apache.ctakes + ctakes-utils + + + org.apache.ctakes + ctakes-relation-extractor + + + org.cleartk + cleartk + 0.5.2-SNAPSHOT + pom + + + org.cleartk + cleartk-ml + 1.2.1 + + + \ No newline at end of file Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java Thu May 9 14:48:14 2013 @@ -0,0 +1,91 @@ +package org.apache.ctakes.coreference.ae; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +import org.apache.ctakes.core.util.DocumentIDAnnotationUtil; +import org.apache.ctakes.coreference.util.Span; +import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation; +import org.apache.ctakes.typesystem.type.relation.RelationArgument; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.uimafit.component.JCasAnnotator_ImplBase; +import org.uimafit.descriptor.ConfigurationParameter; + +public class GoldCoreferenceReader extends JCasAnnotator_ImplBase { + + public 
static final String PARAM_INPUT_DIR = "inputDirectory"; + + @ConfigurationParameter( + name = PARAM_INPUT_DIR, + description = "Directory at which the gold standard is located", + mandatory = true + ) + protected String goldDir = "/home/tmill"; + + + @Override + public void initialize(UimaContext aContext) + throws ResourceInitializationException { + super.initialize(aContext); + } + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + HashMap goldSpan2id = new HashMap(); + ArrayList goldSpans = new ArrayList(); + + String docId = DocumentIDAnnotationUtil.getDocumentID(jcas); + File f = new File(goldDir + File.separator + docId); + int id = 0; + BufferedReader br = null; + try{ + br = new BufferedReader(new FileReader(f)); + String l; + while ((l = br.readLine())!=null) { + String[] spanPair = l.split("\\t"); + IdentifiedAnnotation anteMention = new IdentifiedAnnotation(jcas); + if (!goldSpan2id.containsKey(spanPair[0])){ + goldSpan2id.put(spanPair[0], ++id); + String[] s = spanPair[0].split("[-:]"); + int[] a = new int[s.length]; + for (int i = 0; i < s.length; i++) + a[i] = Integer.parseInt(s[i]); +// goldSpans.add(new Span(a)); + anteMention.setBegin(a[0]); + anteMention.setEnd(a[a.length-1]); + } + IdentifiedAnnotation anaMention = new IdentifiedAnnotation(jcas); + if (!goldSpan2id.containsKey(spanPair[1])){ + goldSpan2id.put(spanPair[1], ++id); + String[] s = spanPair[1].split("[-:]"); + int[] a = new int[s.length]; + for (int i = 0; i < s.length; i++) + a[i] = Integer.parseInt(s[i]); + goldSpans.add(new Span(a)); + anaMention.setBegin(a[0]); + anaMention.setEnd(a[a.length-1]); + } + RelationArgument arg1 = new RelationArgument(jcas); + arg1.setArgument(anteMention); + RelationArgument arg2 = new RelationArgument(jcas); + CoreferenceRelation rel = new CoreferenceRelation(jcas); + rel.setArg1(arg1); + rel.setArg2(arg2); + // TODO - continue along this line... 
+// goldPairs.add(new int[]{goldSpan2id.get(spanPair[0]), goldSpan2id.get(spanPair[1])}); + // ppt_arr.add(new int[]{span2id.get(p[0]), span2id.get(p[1])}); + } + br.close(); + }catch(IOException e){ + throw new AnalysisEngineProcessException(e); + } + } +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java Thu May 9 14:48:14 2013 @@ -0,0 +1,81 @@ +package org.apache.ctakes.coreference.ae; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.apache.ctakes.coreference.ae.features.DistanceFeatureExtractor; +import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor; +import org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor; +import org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor; +import org.apache.ctakes.coreference.util.CorefConst; +import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator; +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.textsem.EntityMention; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textspan.Sentence; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.jcas.tcas.DocumentAnnotation; +import org.uimafit.util.JCasUtil; + +public class NamedEntityCoreferenceResolver 
extends RelationExtractorAnnotator { + + @Override + protected List getFeatureExtractors() { + List extractors = new ArrayList(); + + extractors.add(new DistanceFeatureExtractor()); + extractors.add(new StringMatchingFeatureExtractor()); + extractors.add(new TokenFeatureExtractor()); + extractors.add(new UMLSFeatureExtractor()); + + return extractors; + } + + @Override + protected List getCandidateRelationArgumentPairs( + JCas jcas, Annotation coveringAnnotation) { + List pairs = new ArrayList(); + + List markables = getDocumentMarkables(jcas, coveringAnnotation); + for (int i = 0; i < markables.size(); i++) { + IdentifiedAnnotation arg1 = markables.get(i); + for(int j = i+1; j < markables.size(); j++){ + IdentifiedAnnotation arg2 = markables.get(j); + int sentdist = sentDist(jcas, arg1, arg2); + if(sentdist > CorefConst.NE_DIST) break; + if(contains(arg1, arg2) || contains(arg2, arg1)) continue; + + pairs.add(new IdentifiedAnnotationPair(arg1, arg2)); + } + } + return pairs; + } + + private boolean contains(Annotation arg1, + Annotation arg2) { + return (arg1.getBegin() >= arg2.getBegin() && arg1.getEnd() <= arg2.getEnd() || + arg2.getBegin() >= arg1.getBegin() && arg2.getEnd() <= arg1.getEnd()); + } + + private int sentDist(JCas jcas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) { + Collection sents = JCasUtil.selectCovered(jcas, Sentence.class, arg1.getBegin(), arg2.getEnd()); + return sents.size(); + } + + private List getDocumentMarkables(JCas jcas, Annotation coveringAnnotation) { + Collection mentions = (JCasUtil.select(jcas, EntityMention.class)); +// expandToNP(mentions); +// mergeNP(mentions); +// elevateAdjectives(mentions); + return new ArrayList(mentions); + } + + @Override + protected Class getCoveringClass() { + return DocumentAnnotation.class; + } + +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java URL: 
http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java Thu May 9 14:48:14 2013 @@ -0,0 +1,29 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.ctakes.coreference.util.CorefConst; +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textspan.Sentence; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.cleartk.classifier.Feature; +import org.uimafit.util.JCasUtil; + +public class DistanceFeatureExtractor implements RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList(); + feats.add(new Feature("TOK_DIST", + JCasUtil.selectCovered(jCas, BaseToken.class, arg1.getBegin(), arg2.getEnd()).size() / CorefConst.TOKEN_DIST)); + feats.add(new Feature("SENT_DIST", + JCasUtil.selectCovered(jCas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size() / CorefConst.NE_DIST)); + return feats; + } + +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java URL: 
http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java Thu May 9 14:48:14 2013 @@ -0,0 +1,99 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.classifier.Feature; +import org.uimafit.util.JCasUtil; + +public class StringMatchingFeatureExtractor implements + RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList(); + + String s1 = arg1.getCoveredText(); + String s2 = arg2.getCoveredText(); + Set words1 = contentWords(arg1); + Set words2 = contentWords(arg2); + + feats.add(new Feature("MATCH_EXACT", + s1.equalsIgnoreCase(s2))); + feats.add(new Feature("MATCH_START", + startMatch(s1,s2))); + feats.add(new Feature("MATCH_END", + endMatch(s1,s2))); + feats.add(new Feature("MATCH_SOON", + soonMatch(s1,s2))); + feats.add(new Feature("MATCH_OVERLAP", + wordOverlap(words1, words2))); + feats.add(new Feature("MATCH_SUBSTRING", + 
wordSubstring(words1, words2))); + return feats; + } + + public static boolean startMatch (String a, String b) { + int ia = a.indexOf(" "); + int ib = b.indexOf(" "); + String aa = a.substring(0, ia==-1?(a.length()>5?5:a.length()):ia); + String bb = b.substring(0, ib==-1?(b.length()>5?5:b.length()):ib); + return aa.equalsIgnoreCase(bb); + } + + public static boolean endMatch (String a, String b) { + int ia = a.lastIndexOf(" "); + int ib = b.lastIndexOf(" "); + String aa = a.substring(ia==-1?(a.length()>5?a.length()-5:0):ia); + String bb = b.substring(ib==-1?(b.length()>5?b.length()-5:0):ib); + return aa.equalsIgnoreCase(bb); + } + + public static boolean soonMatch (String s1, String s2) { + String sl1 = nonDetSubstr(s1.toLowerCase()); + String sl2 = nonDetSubstr(s2.toLowerCase()); + return sl1.equals(sl2); + } + + public static String nonDetSubstr (String s) { + if(s.startsWith("the ")) return s.substring(4); + if(s.startsWith("a ")) return s.substring(2); + if(s.startsWith("this ")) return s.substring(5); + if(s.startsWith("that ")) return s.substring(5); + if(s.startsWith("these ")) return s.substring(6); + if(s.startsWith("those ")) return s.substring(6); + return s; + } + + public static boolean wordOverlap(Set t1, Set t2) { + for (String s : t2){ + if (t1.contains(s)){ + return true; + } + } + return false; + } + + public static boolean wordSubstring(Set t1, Set t2){ + // TODO + return false; + } + + public static Set contentWords(Annotation a1){ + Set words = new HashSet(); + for(BaseToken tok : JCasUtil.selectCovered(BaseToken.class, a1)){ + words.add(tok.getCoveredText().toLowerCase()); + } + return words; + } +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java?rev=1480668&view=auto 
============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java Thu May 9 14:48:14 2013 @@ -0,0 +1,65 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.cleartk.classifier.Feature; +import org.uimafit.util.JCasUtil; + +public class TokenFeatureExtractor implements RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList(); + + String s1 = arg1.getCoveredText().toLowerCase(); + String s2 = arg2.getCoveredText().toLowerCase(); + + boolean dem1 = isDemonstrative(s1); + boolean dem2 = isDemonstrative(s2); + + feats.add(new Feature("TOKEN_DEM1", dem1)); + feats.add(new Feature("TOKEN_DEM2", dem2)); + feats.add(new Feature("TOKEN_DEF1", isDefinite(s1))); + feats.add(new Feature("TOKEN_DEF2", isDefinite(s2))); + feats.add(new Feature("TOKEN_NUMAGREE", + numberSingular(arg1) == numberSingular(arg2))); + return feats; + } + + public static boolean isDemonstrative (String s) { + if (s.startsWith("this") || + s.startsWith("that") || + s.startsWith("these") || + s.startsWith("those")) + return true; + else return false; + } + + public static boolean isDefinite (String s) { + return s.startsWith("the "); + } + + // FYI - old code used treebanknode types and 
found head using head rules filled in by the parser + // not sure if there is an appreciable difference... + public static boolean numberSingular(IdentifiedAnnotation arg){ + List tokens = new ArrayList(JCasUtil.selectCovered(BaseToken.class, arg)); + for (int i = tokens.size()-1; i >=0; i--){ + BaseToken t = tokens.get(i); + String pos = t.getPartOfSpeech(); + if (pos.equals("NN") || pos.equals("NNP")){ + return true; + }else if (pos.equals("NNS") || pos.equals("NNPS")){ + return false; + } + } + return true; + } +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java Thu May 9 14:48:14 2013 @@ -0,0 +1,46 @@ +package org.apache.ctakes.coreference.ae.features; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor; +import org.apache.ctakes.typesystem.type.refsem.UmlsConcept; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.cleartk.classifier.Feature; + +public class UMLSFeatureExtractor implements RelationFeaturesExtractor { + + @Override + public List extract(JCas jCas, IdentifiedAnnotation arg1, + IdentifiedAnnotation arg2) throws AnalysisEngineProcessException { + List feats = new ArrayList(); + + 
feats.add(new Feature("UMLS_ALIAS", alias(arg1, arg2))); + + return feats; + } + + public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){ + FSArray fsa = a1.getOntologyConceptArr(); + HashSet cuis = new HashSet(); + for(int i = 0; i < fsa.size(); i++){ + if(fsa.get(i) instanceof UmlsConcept){ + cuis.add(((UmlsConcept)fsa.get(i)).getCui()); + } + } + fsa = a2.getOntologyConceptArr(); + for(int i = 0; i < fsa.size(); i++){ + if(fsa.get(i) instanceof UmlsConcept){ + if(cuis.contains(((UmlsConcept)fsa.get(i)).getCui())){ + return true; + } + } + } + + return false; + } +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java Thu May 9 14:48:14 2013 @@ -0,0 +1,61 @@ +package org.apache.ctakes.coreference.eval; + +import java.io.File; +import java.util.List; + +import org.apache.ctakes.relationextractor.eval.XMIReader; +import org.apache.uima.collection.CollectionReader; +import org.cleartk.eval.AnnotationStatistics; +import org.cleartk.eval.Evaluation_ImplBase; +import org.uimafit.factory.CollectionReaderFactory; +import org.uimafit.factory.TypeSystemDescriptionFactory; + +public class EvaluationOfCoreferencePairs extends + Evaluation_ImplBase> { + + public static final String GOLD_VIEW_NAME = "GOLD_VIEW"; + + public EvaluationOfCoreferencePairs(File baseDirectory) { + super(baseDirectory); + // TODO Auto-generated constructor stub + } + + @Override + protected CollectionReader 
getCollectionReader(List items) + throws Exception { + // convert the List to a String[] + String[] paths = new String[items.size()]; + for (int i = 0; i < paths.length; ++i) { + paths[i] = items.get(i).getPath(); + } + + // return a reader that will load each of the XMI files + return CollectionReaderFactory.createCollectionReader( + XMIReader.class, + TypeSystemDescriptionFactory.createTypeSystemDescription(), + XMIReader.PARAM_FILES, + paths); + } + + @Override + protected void train(CollectionReader collectionReader, File directory) + throws Exception { + // TODO Auto-generated method stub + + } + + @Override + protected AnnotationStatistics test(CollectionReader collectionReader, + File directory) throws Exception { + // TODO Auto-generated method stub + return null; + } + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java Thu May 9 14:48:14 2013 @@ -0,0 +1,226 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.ctakes.coreference.eval; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader; +import org.apache.ctakes.core.util.DocumentIDAnnotationUtil; +import org.apache.ctakes.coreference.ae.GoldCoreferenceReader; +import org.apache.ctakes.typesystem.type.structured.DocumentID; +import org.apache.uima.UIMAException; +import org.apache.uima.UIMAFramework; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.impl.XmiCasSerializer; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.InvalidXMLException; +import org.apache.uima.util.XMLInputSource; +import org.apache.uima.util.XMLParser; +import org.apache.uima.util.XMLSerializer; +import org.cleartk.util.Options_ImplBase; +import org.cleartk.util.ViewURIUtil; +import org.cleartk.util.cr.FilesCollectionReader; +import org.kohsuke.args4j.Option; +import org.uimafit.component.JCasAnnotator_ImplBase; +import org.uimafit.component.ViewCreatorAnnotator; +import org.uimafit.descriptor.ConfigurationParameter; +import org.uimafit.factory.AggregateBuilder; +import 
org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.factory.ConfigurationParameterFactory;
import org.uimafit.pipeline.SimplePipeline;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;


/**
 * Use this to do batch preprocessing and conversion to CAS XMI files for use in experiments.
 *
 * <p>{@link #main(String[])} reads the text files, runs the preprocessing engine described by
 * <code>desc/analysis_engine/CoreferencePreprocessor.xml</code>, adds the gold annotations in a
 * separate view and writes one <code>.xmi</code> file per document.
 *
 * @author lbecker
 */
public class PreprocessAndWriteXmi {

  /** Command-line options accepted by {@link PreprocessAndWriteXmi#main(String[])}. */
  public static class Options extends Options_ImplBase {

    /** Directory holding the raw text files. */
    @Option(name = "-t",
        aliases = "--textRoot",
        usage = "specify the directory contraining the textFiles (for example /NLP/Corpus/Relations/mipacq/text/train",
        required = true)
    public File textRoot;

    /** Directory holding the gold standard annotation files. */
    @Option(name = "-g",
        aliases = "--goldRoot",
        usage = "specify the directory containing the gold standard files (for example: /NLP/Corpus/Relations/mipacq/xml/train",
        required = true)
    public File goldRoot;

    /** Directory that receives the serialized CAS XMI files. */
    @Option(name = "-o",
        aliases = "--outputRoot",
        usage = "specify the directory to write out CAS XMI files",
        required = true)
    public File outputRoot;
  }


  /**
   * Runs reader, preprocessing, gold annotation and XMI serialization as one pipeline.
   *
   * @param args command-line arguments, parsed into {@link Options}
   * @throws IOException if the preprocessing descriptor cannot be read
   * @throws UIMAException if a component cannot be created or fails while processing
   */
  public static void main(String[] args) throws IOException, UIMAException {
    Options options = new Options();
    options.parseOptions(args);

    File textRoot = options.textRoot;
    File goldRoot = options.goldRoot;
    File outputRoot = options.outputRoot;

    CollectionReader reader = CollectionReaderFactory.createCollectionReader(
        FilesCollectionReader.class,
        FilesCollectionReader.PARAM_ROOT_FILE, textRoot.getPath());

    // Relative path: the descriptor is looked up from the current working directory.
    File preprocessDescFile = new File("desc/analysis_engine/CoreferencePreprocessor.xml");
    AnalysisEngine preprocessing = createPreprocessingAE(preprocessDescFile);

    AnalysisEngine goldAnnotator = createGoldAnnotator(goldRoot);

    AnalysisEngine serializer = AnalysisEngineFactory.createPrimitive(
        PreprocessAndWriteXmi.SerializeDocumentToXMI.class,
        PreprocessAndWriteXmi.SerializeDocumentToXMI.PARAM_OUTPUT_DIRECTORY,
        outputRoot.getPath());

    SimplePipeline.runPipeline(reader, preprocessing, goldAnnotator, serializer);
  }


  /**
   * Builds the preprocessing analysis engine from an XML descriptor file.
   *
   * @param preprocessDescFile analysis engine descriptor to parse
   * @return the instantiated analysis engine
   * @throws IOException if the descriptor cannot be opened or its stream cannot be closed
   * @throws InvalidXMLException if the descriptor is not a valid analysis engine description
   * @throws ResourceInitializationException if the engine cannot be produced
   */
  public static AnalysisEngine createPreprocessingAE(File preprocessDescFile) throws IOException, InvalidXMLException, ResourceInitializationException {
    // create the pre-processing pipeline
    XMLParser parser = UIMAFramework.getXMLParser();
    XMLInputSource source = new XMLInputSource(preprocessDescFile);
    AnalysisEngineDescription desc;
    try {
      desc = parser.parseAnalysisEngineDescription(source);
    } finally {
      // The input source holds an open stream on the descriptor file; release it
      // whether or not parsing succeeded.
      source.close();
    }
    return UIMAFramework.produceAnalysisEngine(desc);
  }


  /**
   * Builds the aggregate that loads the manual annotations.
   *
   * <p>The aggregate creates the gold view, copies the document text into it, and then runs
   * {@link DocumentIDAnnotator} and {@link GoldCoreferenceReader} with
   * {@link CAS#NAME_DEFAULT_SOFA} mapped to the gold view.
   *
   * @param goldRoot directory passed to {@link GoldCoreferenceReader} as its input directory
   * @return the aggregate analysis engine
   * @throws ResourceInitializationException if a component cannot be created
   */
  public static AnalysisEngine createGoldAnnotator(File goldRoot)
      throws ResourceInitializationException {
    // pipeline to read manual annotations into the gold view, not the default view
    AggregateBuilder goldAnnotatorBuilder = new AggregateBuilder();
    goldAnnotatorBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        ViewCreatorAnnotator.class,
        ViewCreatorAnnotator.PARAM_VIEW_NAME,
        EvaluationOfCoreferencePairs.GOLD_VIEW_NAME));
    goldAnnotatorBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        CopyDocumentTextToGoldView.class));
    goldAnnotatorBuilder.add(
        AnalysisEngineFactory.createPrimitiveDescription(DocumentIDAnnotator.class),
        CAS.NAME_DEFAULT_SOFA, EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
    goldAnnotatorBuilder.add(
        AnalysisEngineFactory.createPrimitiveDescription(
            GoldCoreferenceReader.class,
            GoldCoreferenceReader.PARAM_INPUT_DIR,
            goldRoot.getPath()),
        CAS.NAME_DEFAULT_SOFA, EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
    AnalysisEngine goldAnnotator = goldAnnotatorBuilder.createAggregate();
    return goldAnnotator;
  }

  /** Copies the document text of the incoming view into the gold view. */
  public static class CopyDocumentTextToGoldView extends JCasAnnotator_ImplBase {
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      try {
        JCas goldView = jCas.getView(EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
        goldView.setDocumentText(jCas.getDocumentText());
      } catch (CASException e) {
        throw new AnalysisEngineProcessException(e);
      }
    }
  }

  /**
   * Writes each CAS to <code>&lt;outputDirectory&gt;/&lt;documentID&gt;.xmi</code>, taking the
   * document ID from the gold view.
   */
  public static class SerializeDocumentToXMI extends JCasAnnotator_ImplBase {
    public static final String PARAM_OUTPUT_DIRECTORY = ConfigurationParameterFactory
        .createConfigurationParameterName(SerializeDocumentToXMI.class, "outputDirectory");

    @ConfigurationParameter(mandatory = true, description = "Specifies the output directory in which to write xmi files")
    private File outputDirectory;

    /**
     * Ensures the output directory exists before any document is processed.
     *
     * @throws ResourceInitializationException if the directory is missing and cannot be
     *         created, or the path exists but is not a directory
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
      super.initialize(context);
      // Fail here rather than once per document in process() when the target is unusable.
      if (!this.outputDirectory.isDirectory() && !this.outputDirectory.mkdirs()) {
        throw new ResourceInitializationException(
            new IOException("Cannot create output directory: " + this.outputDirectory));
      }
    }

    /**
     * Serializes the CAS to an XMI file named after its document ID.
     *
     * @throws IllegalArgumentException if the gold view carries no document ID
     * @throws AnalysisEngineProcessException if the view lookup, serialization or file I/O fails
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      try {
        JCas goldView = jCas.getView(EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
        String documentID = DocumentIDAnnotationUtil.getDocumentID(goldView);
        if (documentID == null) {
          throw new IllegalArgumentException("No documentID for CAS:\n" + jCas);
        }
        File outFile = new File(this.outputDirectory, documentID + ".xmi");
        FileOutputStream stream = new FileOutputStream(outFile);
        try {
          ContentHandler handler = new XMLSerializer(stream).getContentHandler();
          new XmiCasSerializer(jCas.getTypeSystem()).serialize(jCas.getCas(), handler);
        } finally {
          stream.close();
        }
      } catch (UIMAException e) {
        throw new AnalysisEngineProcessException(e);
      } catch (SAXException e) {
        throw new AnalysisEngineProcessException(e);
      } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
      }
    }

  }

  /**
   * Class for adding DocumentID annotations.
   *
   * Needed because {@link FilesInDirectoryCollectionReader} creates {@link DocumentID} annotations
   * but doesn't allow specific files to be loaded, while {@link FilesCollectionReader} allows
   * specific files to be loaded but creates URIs instead of {@link DocumentID} annotations.
   */
  public static class DocumentIDAnnotator extends JCasAnnotator_ImplBase {

    /** Adds a {@link DocumentID} whose value is the file name part of the view's URI. */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      String documentID = new File(ViewURIUtil.getURI(jCas)).getName();
      DocumentID documentIDAnnotation = new DocumentID(jCas);
      documentIDAnnotation.setDocumentID(documentID);
      documentIDAnnotation.addToIndexes();
    }

  }

}
Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java Thu May 9 14:48:14 2013 @@ -0,0 +1,6 @@ +package org.apache.ctakes.coreference.util; + +public class CorefConst { + public static final int NE_DIST = 20; + public static final int TOKEN_DIST = 600; +} Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java?rev=1480668&view=auto ============================================================================== --- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java (added) +++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java Thu May 9
14:48:14 2013 @@ -0,0 +1,153 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.ctakes.coreference.util; + +public class Span { + + int[][] s; + int length; + + public Span () {} + public Span (int[] i) { + if (i.length % 2 == 0) { + length = 0; + s = new int[i.length/2][2]; + for (int j = 0; j < i.length; j+=2) { + s[j/2][0] = i[j]; + s[j/2][1] = i[j+1]; + length += i[j+1] - i[j]; + } + } + } + + public Span (String str) { + String[] ss = str.split("[-:]"); + if (ss.length % 2 == 0) { + s = new int[ss.length/2][2]; + for (int i = 0; i < ss.length; i+=2) { + s[i/2][0] = Integer.parseInt(ss[i]); + s[i/2][1] = Integer.parseInt(ss[i+1]); + length += s[i/2][1] - s[i/2][0]; + } + } + } + + public int size () { return s.length; } + public int length () { return length; } + public int[] get (int i) { return s[i]; } + + public Span tail () { + if (s.length==1) return new Span(); + int[] ret = new int[(s.length-1)*2]; + for (int i = 1; i < s.length; i++) { + ret[(i-1)*2] = s[i][0]; + ret[i*2-1] = s[i][1]; + } + return new Span(ret); + } + + // 2 * intersect / (length of s1 + length of s2) + public static double score (Span s1, Span s2) { + double a = 0; + double b = 0; + // there is a more 
efficient way + for (int i = 0; i < s1.size(); i++) + for (int j = 0; j < s2.size(); j++) + a += overlap(s1.get(i), s2.get(j)); + for (int i = 0; i < s1.size(); i++) + b += s1.get(i)[1] - s1.get(i)[0]; + for (int i = 0; i < s2.size(); i++) + b += s2.get(i)[1] - s2.get(i)[0]; + return a==0 ? -1 : a/b; + } + + private static int overlap (int[] a, int[] b) { + int ret; + if (a[0] >= b[0]) + ret = (a[1]>b[1] ? b[1] : a[1]) - a[0]; + else + ret = (a[1] { + + @Override + public int compare(Span o1, Span o2) { + int ret; + ret = o1.get(0)[0] - o2.get(0)[0]; + if (ret!=0) return ret; + else { + ret = o1.get(0)[1] - o2.get(0)[1]; + if (ret!=0) return ret; + else { + int s1 = o1.size(); + int s2 = o2.size(); + if (s1==1 && s2>1) return -1; + else if (s1>1 && s2==1) return 1; + else if (s1==1 && s2==1) return 0; + else return compare(o1.tail(), o2.tail()); + } + } + } + +}