ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1480668 - in /ctakes/sandbox/ctakes-coref-cleartk: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/ctakes/ src/main/java/org/apache/ctakes/coreference/ src/main/java/org/apache/ctakes/...
Date Thu, 09 May 2013 14:48:15 GMT
Author: tmill
Date: Thu May  9 14:48:14 2013
New Revision: 1480668

URL: http://svn.apache.org/r1480668
Log:
First checkin of cleartk-based coreference resolution module.

Added:
    ctakes/sandbox/ctakes-coref-cleartk/.classpath
    ctakes/sandbox/ctakes-coref-cleartk/.project
    ctakes/sandbox/ctakes-coref-cleartk/pom.xml
    ctakes/sandbox/ctakes-coref-cleartk/src/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/SpanOffsetComparator.java

Added: ctakes/sandbox/ctakes-coref-cleartk/.classpath
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/.classpath?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/.classpath (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/.classpath Thu May  9 14:48:14 2013
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" output="target/classes" path="src/main/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>

Added: ctakes/sandbox/ctakes-coref-cleartk/.project
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/.project?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/.project (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/.project Thu May  9 14:48:14 2013
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ctakes-coref-cleartk</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.apache.uima.pear.UimaNature</nature>
+	</natures>
+</projectDescription>

Added: ctakes/sandbox/ctakes-coref-cleartk/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/pom.xml?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/pom.xml (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/pom.xml Thu May  9 14:48:14 2013
@@ -0,0 +1,34 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <artifactId>new-coref</artifactId>
+  <parent>
+  	<groupId>org.apache.ctakes</groupId>
+  	<artifactId>ctakes</artifactId>
+  	<version>3.1.0-SNAPSHOT</version>
+  </parent>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-type-system</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-utils</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-relation-extractor</artifactId>
+    </dependency>
+    <dependency>
+    	<groupId>org.cleartk</groupId>
+    	<artifactId>cleartk</artifactId>
+    	<version>0.5.2-SNAPSHOT</version>
+    	<type>pom</type>
+    </dependency>
+    <dependency>
+    	<groupId>org.cleartk</groupId>
+    	<artifactId>cleartk-ml</artifactId>
+    	<version>1.2.1</version>
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,91 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.coreference.util.Span;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
+
+public class GoldCoreferenceReader extends JCasAnnotator_ImplBase {
+
+	public static final String PARAM_INPUT_DIR = "inputDirectory";
+
+	@ConfigurationParameter(
+			name = PARAM_INPUT_DIR,
+			description = "Directory at which the gold standard is located",
+			mandatory = true
+			)
+	protected String goldDir = "/home/tmill";
+
+	
+	@Override
+	public void initialize(UimaContext aContext)
+			throws ResourceInitializationException {
+		super.initialize(aContext);
+	}
+	
+	@Override
+	public void process(JCas jcas) throws AnalysisEngineProcessException {
+		HashMap<String, Integer> goldSpan2id = new HashMap<String, Integer>();
+		ArrayList<Span> goldSpans = new ArrayList<Span>();
+		
+		String docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
+		File f = new File(goldDir + File.separator + docId);
+		int id = 0;
+		BufferedReader br = null;
+		try{
+			br = new BufferedReader(new FileReader(f));
+			String l;
+			while ((l = br.readLine())!=null) {
+				String[] spanPair = l.split("\\t");
+				IdentifiedAnnotation anteMention = new IdentifiedAnnotation(jcas);
+				if (!goldSpan2id.containsKey(spanPair[0])){
+					goldSpan2id.put(spanPair[0], ++id);
+					String[] s = spanPair[0].split("[-:]");
+					int[] a = new int[s.length];
+					for (int i = 0; i < s.length; i++)
+						a[i] = Integer.parseInt(s[i]);
+//					goldSpans.add(new Span(a));
+					anteMention.setBegin(a[0]);
+					anteMention.setEnd(a[a.length-1]);
+				}
+				IdentifiedAnnotation anaMention = new IdentifiedAnnotation(jcas);
+				if (!goldSpan2id.containsKey(spanPair[1])){
+					goldSpan2id.put(spanPair[1], ++id);
+					String[] s = spanPair[1].split("[-:]");
+					int[] a = new int[s.length];
+					for (int i = 0; i < s.length; i++)
+						a[i] = Integer.parseInt(s[i]);
+					goldSpans.add(new Span(a));
+					anaMention.setBegin(a[0]);
+					anaMention.setEnd(a[a.length-1]);
+				}
+				RelationArgument arg1 = new RelationArgument(jcas);
+				arg1.setArgument(anteMention);
+				RelationArgument arg2 = new RelationArgument(jcas);
+				CoreferenceRelation rel = new CoreferenceRelation(jcas);
+				rel.setArg1(arg1);
+				rel.setArg2(arg2);
+				// TODO - continue along this line...
+//				goldPairs.add(new int[]{goldSpan2id.get(spanPair[0]), goldSpan2id.get(spanPair[1])});
+				//			ppt_arr.add(new int[]{span2id.get(p[0]), span2id.get(p[1])});
+			}
+			br.close();
+		}catch(IOException e){
+			throw new AnalysisEngineProcessException(e);
+		}
+	}
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,81 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.ctakes.coreference.ae.features.DistanceFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor;
+import org.apache.ctakes.coreference.util.CorefConst;
+import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Relation-extractor-based annotator that proposes pairs of entity mentions as
+ * coreference candidates and scores them with distance, string-matching, token
+ * and UMLS feature extractors.
+ */
+public class NamedEntityCoreferenceResolver extends RelationExtractorAnnotator {
+
+	/**
+	 * @return the feature extractors applied to every candidate pair
+	 */
+	@Override
+	protected List<RelationFeaturesExtractor> getFeatureExtractors() {
+		List<RelationFeaturesExtractor> featureExtractors = new ArrayList<RelationFeaturesExtractor>();
+		featureExtractors.add(new DistanceFeatureExtractor());
+		featureExtractors.add(new StringMatchingFeatureExtractor());
+		featureExtractors.add(new TokenFeatureExtractor());
+		featureExtractors.add(new UMLSFeatureExtractor());
+		return featureExtractors;
+	}
+
+	/**
+	 * Pairs every markable with each later markable in the list. The inner scan stops
+	 * as soon as the sentence distance exceeds {@code CorefConst.NE_DIST}; pairs in
+	 * which one span lies inside the other are skipped.
+	 */
+	@Override
+	protected List<IdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+			JCas jcas, Annotation coveringAnnotation) {
+		List<IdentifiedAnnotationPair> candidates = new ArrayList<IdentifiedAnnotationPair>();
+		List<IdentifiedAnnotation> markables = getDocumentMarkables(jcas, coveringAnnotation);
+
+		for (int first = 0; first < markables.size(); first++) {
+			IdentifiedAnnotation antecedent = markables.get(first);
+			for (int second = first + 1; second < markables.size(); second++) {
+				IdentifiedAnnotation anaphor = markables.get(second);
+				if (sentDist(jcas, antecedent, anaphor) > CorefConst.NE_DIST) {
+					break;
+				}
+				// contains() already tests both nesting directions, so one call suffices.
+				if (!contains(antecedent, anaphor)) {
+					candidates.add(new IdentifiedAnnotationPair(antecedent, anaphor));
+				}
+			}
+		}
+		return candidates;
+	}
+
+	/**
+	 * @return true when either annotation's span lies within the other's span
+	 */
+	private boolean contains(Annotation arg1,
+			Annotation arg2) {
+		boolean firstInsideSecond =
+				arg1.getBegin() >= arg2.getBegin() && arg1.getEnd() <= arg2.getEnd();
+		boolean secondInsideFirst =
+				arg2.getBegin() >= arg1.getBegin() && arg2.getEnd() <= arg1.getEnd();
+		return firstInsideSecond || secondInsideFirst;
+	}
+
+	/**
+	 * @return the number of Sentence annotations covered by the span running from the
+	 *         begin of {@code arg1} to the end of {@code arg2}
+	 */
+	private int sentDist(JCas jcas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) {
+		return JCasUtil.selectCovered(jcas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size();
+	}
+
+	/**
+	 * Collects all EntityMention annotations in the CAS as markables; the covering
+	 * annotation is currently not consulted.
+	 */
+	private List<IdentifiedAnnotation> getDocumentMarkables(JCas jcas, Annotation coveringAnnotation) {
+		// Planned refinements, not yet implemented: expandToNP, mergeNP, elevateAdjectives.
+		Collection<EntityMention> entityMentions = JCasUtil.select(jcas, EntityMention.class);
+		return new ArrayList<IdentifiedAnnotation>(entityMentions);
+	}
+
+	@Override
+	protected Class<? extends Annotation> getCoveringClass() {
+		return DocumentAnnotation.class;
+	}
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,29 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.util.CorefConst;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.classifier.Feature;
+import org.uimafit.util.JCasUtil;
+
+public class DistanceFeatureExtractor implements RelationFeaturesExtractor {
+
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> feats = new ArrayList<Feature>();
+		feats.add(new Feature("TOK_DIST",
+				  JCasUtil.selectCovered(jCas, BaseToken.class, arg1.getBegin(), arg2.getEnd()).size()
/ CorefConst.TOKEN_DIST));
+		feats.add(new Feature("SENT_DIST",
+				JCasUtil.selectCovered(jCas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size() /
CorefConst.NE_DIST));
+		return feats;
+	}
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,99 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Computes string-similarity features between the covered text of two mentions:
+ * exact match, matching first/last chunk, match after stripping a leading
+ * determiner, and token overlap.
+ */
+public class StringMatchingFeatureExtractor implements
+		RelationFeaturesExtractor {
+
+	/** Leading determiners removed by {@link #nonDetSubstr(String)}, tried in this order. */
+	private static final String[] DETERMINERS = {
+		"the ", "a ", "this ", "that ", "these ", "those "
+	};
+
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		String text1 = arg1.getCoveredText();
+		String text2 = arg2.getCoveredText();
+		Set<String> tokens1 = contentWords(arg1);
+		Set<String> tokens2 = contentWords(arg2);
+
+		List<Feature> matchFeatures = new ArrayList<Feature>();
+		matchFeatures.add(new Feature("MATCH_EXACT", text1.equalsIgnoreCase(text2)));
+		matchFeatures.add(new Feature("MATCH_START", startMatch(text1, text2)));
+		matchFeatures.add(new Feature("MATCH_END", endMatch(text1, text2)));
+		matchFeatures.add(new Feature("MATCH_SOON", soonMatch(text1, text2)));
+		matchFeatures.add(new Feature("MATCH_OVERLAP", wordOverlap(tokens1, tokens2)));
+		matchFeatures.add(new Feature("MATCH_SUBSTRING", wordSubstring(tokens1, tokens2)));
+		return matchFeatures;
+	}
+
+	/**
+	 * Compares the leading chunk of both strings, ignoring case. The chunk is the text
+	 * before the first space, or the first five characters (or fewer) if there is no space.
+	 */
+	public static boolean startMatch (String a, String b) {
+		return leadingChunk(a).equalsIgnoreCase(leadingChunk(b));
+	}
+
+	/**
+	 * Compares the trailing chunk of both strings, ignoring case. The chunk is the text
+	 * from the last space onward, or the last five characters (or fewer) if there is no space.
+	 */
+	public static boolean endMatch (String a, String b) {
+		return trailingChunk(a).equalsIgnoreCase(trailingChunk(b));
+	}
+
+	/** Text before the first space, else at most the first five characters. */
+	private static String leadingChunk(String s) {
+		int space = s.indexOf(" ");
+		if (space != -1) {
+			return s.substring(0, space);
+		}
+		return s.substring(0, Math.min(5, s.length()));
+	}
+
+	/** Text from the last space onward, else at most the last five characters. */
+	private static String trailingChunk(String s) {
+		int space = s.lastIndexOf(" ");
+		if (space != -1) {
+			return s.substring(space);
+		}
+		return s.substring(Math.max(0, s.length() - 5));
+	}
+
+	/**
+	 * @return true when the lower-cased strings are equal once a leading determiner
+	 *         has been removed from each
+	 */
+	public static boolean soonMatch (String s1, String s2) {
+		return nonDetSubstr(s1.toLowerCase()).equals(nonDetSubstr(s2.toLowerCase()));
+	}
+
+	/**
+	 * Removes one leading determiner ("the ", "a ", "this ", "that ", "these ", "those ")
+	 * if present; otherwise returns the string unchanged.
+	 */
+	public static String nonDetSubstr (String s) {
+		for (int i = 0; i < DETERMINERS.length; i++) {
+			if (s.startsWith(DETERMINERS[i])) {
+				return s.substring(DETERMINERS[i].length());
+			}
+		}
+		return s;
+	}
+
+	/**
+	 * @return true when at least one element of {@code t2} is also in {@code t1}
+	 */
+	public static boolean wordOverlap(Set<String> t1, Set<String> t2) {
+		boolean shared = false;
+		for (String word : t2) {
+			if (t1.contains(word)) {
+				shared = true;
+				break;
+			}
+		}
+		return shared;
+	}
+
+	/**
+	 * Not implemented yet; always returns false.
+	 */
+	public static boolean wordSubstring(Set<String> t1, Set<String> t2){
+		// TODO
+		return false;
+	}
+
+	/**
+	 * @return the lower-cased covered text of every BaseToken covered by {@code a1}
+	 */
+	public static Set<String> contentWords(Annotation a1){
+		Set<String> lowered = new HashSet<String>();
+		for (BaseToken token : JCasUtil.selectCovered(BaseToken.class, a1)) {
+			String tokenText = token.getCoveredText();
+			lowered.add(tokenText.toLowerCase());
+		}
+		return lowered;
+	}
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,65 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.classifier.Feature;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Extracts surface-level features for a candidate pair: whether each mention starts
+ * with a demonstrative or with the definite article, and whether both mentions
+ * agree in grammatical number.
+ */
+public class TokenFeatureExtractor implements RelationFeaturesExtractor {
+
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> feats = new ArrayList<Feature>();
+
+		String s1 = arg1.getCoveredText().toLowerCase();
+		String s2 = arg2.getCoveredText().toLowerCase();
+
+		feats.add(new Feature("TOKEN_DEM1", isDemonstrative(s1)));
+		feats.add(new Feature("TOKEN_DEM2", isDemonstrative(s2)));
+		feats.add(new Feature("TOKEN_DEF1", isDefinite(s1)));
+		feats.add(new Feature("TOKEN_DEF2", isDefinite(s2)));
+		feats.add(new Feature("TOKEN_NUMAGREE",
+				numberSingular(arg1) == numberSingular(arg2)));
+		return feats;
+	}
+
+	/**
+	 * @param s mention text; the caller in this class passes it lower-cased
+	 * @return true when {@code s} starts with "this", "that", "these" or "those"
+	 */
+	public static boolean isDemonstrative (String s) {
+		return s.startsWith("this")
+				|| s.startsWith("that")
+				|| s.startsWith("these")
+				|| s.startsWith("those");
+	}
+
+	/**
+	 * @param s mention text; the caller in this class passes it lower-cased
+	 * @return true when {@code s} starts with "the " (including the trailing space)
+	 */
+	public static boolean isDefinite (String s) {
+		return s.startsWith("the ");
+	}
+
+	// FYI - old code used treebanknode types and found head using head rules filled in by the parser
+	// not sure if there is an appreciable difference...
+	/**
+	 * Decides whether a mention is singular by scanning its covered tokens from right
+	 * to left and looking at the first noun tag found: NN/NNP means singular,
+	 * NNS/NNPS means plural. Tokens without a part-of-speech tag are skipped.
+	 *
+	 * @param arg the mention to inspect
+	 * @return false when the right-most noun is tagged NNS or NNPS; true otherwise,
+	 *         including when no noun tag is found
+	 */
+	public static boolean numberSingular(IdentifiedAnnotation arg){
+		List<BaseToken> tokens = new ArrayList<BaseToken>(JCasUtil.selectCovered(BaseToken.class, arg));
+		for (int i = tokens.size()-1; i >=0; i--){
+			String pos = tokens.get(i).getPartOfSpeech();
+			// Constant-first equals keeps an untagged token (null POS) from throwing.
+			if ("NN".equals(pos) || "NNP".equals(pos)){
+				return true;
+			}else if ("NNS".equals(pos) || "NNPS".equals(pos)){
+				return false;
+			}
+		}
+		return true;
+	}
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,46 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.cleartk.classifier.Feature;
+
+/**
+ * Emits a single feature, UMLS_ALIAS, telling whether the two mentions share at
+ * least one UMLS concept unique identifier (CUI).
+ */
+public class UMLSFeatureExtractor implements RelationFeaturesExtractor {
+
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> feats = new ArrayList<Feature>();
+
+		feats.add(new Feature("UMLS_ALIAS", alias(arg1, arg2)));
+
+		return feats;
+	}
+
+	/**
+	 * Checks whether any UmlsConcept attached to {@code a2} has a CUI that is also the
+	 * CUI of a UmlsConcept attached to {@code a1}.
+	 *
+	 * @param a1 first mention
+	 * @param a2 second mention
+	 * @return true when the mentions share a CUI; false otherwise, including when
+	 *         either mention has no ontology concept array
+	 */
+	public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+		FSArray concepts1 = a1.getOntologyConceptArr();
+		FSArray concepts2 = a2.getOntologyConceptArr();
+		// A mention without any ontology concepts cannot be an alias of anything.
+		if(concepts1 == null || concepts2 == null){
+			return false;
+		}
+
+		HashSet<String> cuis = new HashSet<String>();
+		for(int i = 0; i < concepts1.size(); i++){
+			if(concepts1.get(i) instanceof UmlsConcept){
+				String cui = ((UmlsConcept)concepts1.get(i)).getCui();
+				// Skip concepts without a CUI so two missing CUIs never count as a match.
+				if(cui != null){
+					cuis.add(cui);
+				}
+			}
+		}
+		for(int i = 0; i < concepts2.size(); i++){
+			if(concepts2.get(i) instanceof UmlsConcept){
+				String cui = ((UmlsConcept)concepts2.get(i)).getCui();
+				if(cui != null && cuis.contains(cui)){
+					return true;
+				}
+			}
+		}
+
+		return false;
+	}
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,61 @@
+package org.apache.ctakes.coreference.eval;
+
+import java.io.File;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.eval.XMIReader;
+import org.apache.uima.collection.CollectionReader;
+import org.cleartk.eval.AnnotationStatistics;
+import org.cleartk.eval.Evaluation_ImplBase;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+
+/**
+ * Skeleton of the evaluation harness for coreference pairs. Only the collection
+ * reader is wired up so far; training, testing and the command-line entry point
+ * are still empty.
+ */
+public class EvaluationOfCoreferencePairs extends
+		Evaluation_ImplBase<File, AnnotationStatistics<String>> {
+
+	public static final String GOLD_VIEW_NAME = "GOLD_VIEW";
+
+	/**
+	 * @param baseDirectory directory handed to the Evaluation_ImplBase constructor
+	 */
+	public EvaluationOfCoreferencePairs(File baseDirectory) {
+		super(baseDirectory);
+	}
+
+	/**
+	 * Builds an XMIReader over the given files.
+	 *
+	 * @param items the XMI files to read
+	 * @return a collection reader configured with the path of every file
+	 */
+	@Override
+	protected CollectionReader getCollectionReader(List<File> items)
+			throws Exception {
+		// XMIReader is configured with an array of path strings rather than File objects.
+		String[] xmiPaths = new String[items.size()];
+		int next = 0;
+		for (File item : items) {
+			xmiPaths[next++] = item.getPath();
+		}
+
+		return CollectionReaderFactory.createCollectionReader(
+				XMIReader.class,
+				TypeSystemDescriptionFactory.createTypeSystemDescription(),
+				XMIReader.PARAM_FILES,
+				xmiPaths);
+	}
+
+	/** Not implemented yet; does nothing. */
+	@Override
+	protected void train(CollectionReader collectionReader, File directory)
+			throws Exception {
+		// TODO implement training
+	}
+
+	/** Not implemented yet; always returns null. */
+	@Override
+	protected AnnotationStatistics<String> test(CollectionReader collectionReader,
+			File directory) throws Exception {
+		// TODO implement testing
+		return null;
+	}
+
+	/**
+	 * Command-line entry point; not implemented yet.
+	 *
+	 * @param args command-line arguments (currently unused)
+	 */
+	public static void main(String[] args) {
+		// TODO implement the evaluation driver
+	}
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,226 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.coreference.eval;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.coreference.ae.GoldCoreferenceReader;
+import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.uima.UIMAException;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XMLParser;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.Options_ImplBase;
+import org.cleartk.util.ViewURIUtil;
+import org.cleartk.util.cr.FilesCollectionReader;
+import org.kohsuke.args4j.Option;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.ViewCreatorAnnotator;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.factory.ConfigurationParameterFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Use this to do batch preprocessing and conversion to CAS XMI files for use in experiments.
+ * @author lbecker
+ *
+ */
+public class PreprocessAndWriteXmi {
+
+	/** Command-line options; all three directories are required. */
+	public static class Options extends Options_ImplBase {
+
+		@Option(name = "-t",
+				aliases = "--textRoot",
+				usage = "specify the directory containing the text files (for example: /NLP/Corpus/Relations/mipacq/text/train)",
+				required = true)
+		public File textRoot;
+
+		@Option(name = "-g",
+				aliases = "--goldRoot",
+				usage = "specify the directory containing the gold standard files (for example: /NLP/Corpus/Relations/mipacq/xml/train)",
+				required = true)
+		public File goldRoot;
+
+		@Option(name = "-o",
+				aliases = "--outputRoot",
+				usage = "specify the directory to write out CAS XMI files",
+				required = true)
+		public File outputRoot;
+	}
+
+	/**
+	 * Runs the pipeline: read the text files, apply the preprocessing
+	 * descriptor, add the gold annotations, and write one XMI file per document.
+	 *
+	 * @param args command-line arguments, see {@link Options}
+	 */
+	public static void main(String[] args) throws IOException, UIMAException {
+		Options options = new Options();
+		options.parseOptions(args);
+
+		CollectionReader reader = CollectionReaderFactory.createCollectionReader(
+				FilesCollectionReader.class,
+				FilesCollectionReader.PARAM_ROOT_FILE,
+				options.textRoot.getPath());
+
+		// the descriptor path is resolved against the current working directory
+		File preprocessDescFile = new File("desc/analysis_engine/CoreferencePreprocessor.xml");
+		AnalysisEngine preprocessing = createPreprocessingAE(preprocessDescFile);
+
+		AnalysisEngine goldAnnotator = createGoldAnnotator(options.goldRoot);
+
+		AnalysisEngine serializer = AnalysisEngineFactory.createPrimitive(
+				SerializeDocumentToXMI.class,
+				SerializeDocumentToXMI.PARAM_OUTPUT_DIRECTORY,
+				options.outputRoot.getPath());
+
+		SimplePipeline.runPipeline(reader, preprocessing, goldAnnotator, serializer);
+	}
+
+	/**
+	 * Creates the preprocessing analysis engine from an XML descriptor file.
+	 *
+	 * @param preprocessDescFile the analysis engine descriptor to parse
+	 * @return the analysis engine produced from that descriptor
+	 * @throws IOException if the descriptor file cannot be opened or closed
+	 * @throws InvalidXMLException if the descriptor cannot be parsed
+	 * @throws ResourceInitializationException if the engine cannot be produced
+	 */
+	public static AnalysisEngine createPreprocessingAE(File preprocessDescFile)
+			throws IOException, InvalidXMLException, ResourceInitializationException {
+		XMLInputSource source = new XMLInputSource(preprocessDescFile);
+		try {
+			XMLParser parser = UIMAFramework.getXMLParser();
+			AnalysisEngineDescription desc = parser.parseAnalysisEngineDescription(source);
+			return UIMAFramework.produceAnalysisEngine(desc);
+		} finally {
+			// release the stream opened on the descriptor file
+			source.close();
+		}
+	}
+
+	/**
+	 * Creates the aggregate that reads manual annotations into the gold view,
+	 * not the default view.
+	 *
+	 * @param goldRoot directory passed to {@link GoldCoreferenceReader} as its input directory
+	 * @return the aggregate analysis engine
+	 * @throws ResourceInitializationException if a component cannot be created
+	 */
+	public static AnalysisEngine createGoldAnnotator(File goldRoot)
+			throws ResourceInitializationException {
+		AggregateBuilder goldAnnotatorBuilder = new AggregateBuilder();
+
+		// 1. create the gold view
+		goldAnnotatorBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+				ViewCreatorAnnotator.class,
+				ViewCreatorAnnotator.PARAM_VIEW_NAME,
+				EvaluationOfCoreferencePairs.GOLD_VIEW_NAME));
+
+		// 2. give it the same document text as the default view
+		goldAnnotatorBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+				CopyDocumentTextToGoldView.class));
+
+		// 3. and 4. are added with the default sofa name mapped to the gold view
+		goldAnnotatorBuilder.add(
+				AnalysisEngineFactory.createPrimitiveDescription(DocumentIDAnnotator.class),
+				CAS.NAME_DEFAULT_SOFA, EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
+		goldAnnotatorBuilder.add(
+				AnalysisEngineFactory.createPrimitiveDescription(
+						GoldCoreferenceReader.class,
+						GoldCoreferenceReader.PARAM_INPUT_DIR,
+						goldRoot.getPath()),
+				CAS.NAME_DEFAULT_SOFA, EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
+
+		return goldAnnotatorBuilder.createAggregate();
+	}
+
+	/** Copies the document text of the incoming view into the gold view. */
+	public static class CopyDocumentTextToGoldView extends JCasAnnotator_ImplBase {
+		@Override
+		public void process(JCas jCas) throws AnalysisEngineProcessException {
+			try {
+				JCas goldView = jCas.getView(EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
+				goldView.setDocumentText(jCas.getDocumentText());
+			} catch (CASException e) {
+				throw new AnalysisEngineProcessException(e);
+			}
+		}
+	}
+
+	/**
+	 * Writes each CAS to {@code <outputDirectory>/<documentID>.xmi}, where the
+	 * document ID is looked up in the gold view.
+	 */
+	public static class SerializeDocumentToXMI extends JCasAnnotator_ImplBase {
+		public static final String PARAM_OUTPUT_DIRECTORY = ConfigurationParameterFactory
+				.createConfigurationParameterName(SerializeDocumentToXMI.class, "outputDirectory");
+
+		@ConfigurationParameter(mandatory = true, description = "Specifies the output directory in which to write xmi files")
+		private File outputDirectory;
+
+		/**
+		 * Creates the output directory if it does not exist yet.
+		 *
+		 * @throws ResourceInitializationException if the directory cannot be created
+		 */
+		@Override
+		public void initialize(UimaContext context) throws ResourceInitializationException {
+			super.initialize(context);
+			// mkdirs() reports failure through its return value only; the second
+			// isDirectory() check tolerates another process creating it meanwhile
+			if (!this.outputDirectory.isDirectory()
+					&& !this.outputDirectory.mkdirs()
+					&& !this.outputDirectory.isDirectory()) {
+				throw new ResourceInitializationException(new IOException(
+						"Could not create output directory: " + this.outputDirectory));
+			}
+		}
+
+		/**
+		 * Serializes the CAS to an XMI file named after its document ID.
+		 *
+		 * @throws IllegalArgumentException if the gold view has no document ID
+		 * @throws AnalysisEngineProcessException wrapping any UIMA, SAX or I/O failure
+		 */
+		@Override
+		public void process(JCas jCas) throws AnalysisEngineProcessException {
+			try {
+				JCas goldView = jCas.getView(EvaluationOfCoreferencePairs.GOLD_VIEW_NAME);
+				String documentID = DocumentIDAnnotationUtil.getDocumentID(goldView);
+				if (documentID == null) {
+					throw new IllegalArgumentException("No documentID for CAS:\n" + jCas);
+				}
+				File outFile = new File(this.outputDirectory, documentID + ".xmi");
+				FileOutputStream stream = new FileOutputStream(outFile);
+				try {
+					ContentHandler handler = new XMLSerializer(stream).getContentHandler();
+					new XmiCasSerializer(jCas.getTypeSystem()).serialize(jCas.getCas(), handler);
+				} finally {
+					stream.close();
+				}
+			} catch (UIMAException e) {
+				throw new AnalysisEngineProcessException(e);
+			} catch (SAXException e) {
+				throw new AnalysisEngineProcessException(e);
+			} catch (IOException e) {
+				throw new AnalysisEngineProcessException(e);
+			}
+		}
+
+	}
+
+	/**
+	 * Class for adding DocumentID annotations.
+	 *
+	 * Needed because {@link FilesInDirectoryCollectionReader} creates {@link DocumentID} annotations
+	 * but doesn't allow specific files to be loaded, while {@link FilesCollectionReader} allows
+	 * specific files to be loaded but creates URIs instead of {@link DocumentID} annotations.
+	 */
+	public static class DocumentIDAnnotator extends JCasAnnotator_ImplBase {
+
+		/** Adds a {@link DocumentID} whose value is the file name taken from the view URI. */
+		@Override
+		public void process(JCas jCas) throws AnalysisEngineProcessException {
+			String documentID = new File(ViewURIUtil.getURI(jCas)).getName();
+			DocumentID documentIDAnnotation = new DocumentID(jCas);
+			documentIDAnnotation.setDocumentID(documentID);
+			documentIDAnnotation.addToIndexes();
+		}
+
+	}
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/CorefConst.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,6 @@
+package org.apache.ctakes.coreference.util;
+
+/**
+ * Integer constants kept in the coreference utility package.
+ */
+public class CorefConst {
+	// NOTE(review): presumably a distance limit counted in tokens -- confirm against callers
+	public static final int TOKEN_DIST = 600;
+
+	// NOTE(review): presumably a distance limit counted in named entities -- confirm against callers
+	public static final int NE_DIST = 20;
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/Span.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.coreference.util;
+
+/**
+ * A possibly discontinuous text span, stored as an ordered list of
+ * {@code [start, end]} offset pairs ("segments").
+ *
+ * The textual form is {@code start-end} per segment, with segments joined
+ * by {@code :}, for example {@code 1-5:10-20}.
+ */
+public class Span {
+
+	// One row per segment: s[k][0] is the start, s[k][1] the end.
+	// Starts out empty (never null) so that a default-constructed span, or one
+	// built from malformed input, can still be queried, printed and compared.
+	int[][] s = new int[0][2];
+	// Sum of (end - start) over all segments.
+	int length;
+
+	/** Creates an empty span with no segments. */
+	public Span () {}
+
+	/**
+	 * Creates a span from a flat array of offsets
+	 * {@code start0, end0, start1, end1, ...}.
+	 *
+	 * @param i the offsets; an array of odd length cannot be paired up and
+	 *          yields an empty span
+	 */
+	public Span (int[] i) {
+		if (i.length % 2 != 0) {
+			return;
+		}
+		s = new int[i.length/2][2];
+		for (int j = 0; j < i.length; j+=2) {
+			s[j/2][0] = i[j];
+			s[j/2][1] = i[j+1];
+			length += i[j+1] - i[j];
+		}
+	}
+
+	/**
+	 * Parses a span from text, splitting on {@code -} and {@code :}
+	 * (for example {@code "1-5:10-20"}).
+	 *
+	 * @param str the text to parse; an odd number of fields yields an empty span
+	 * @throws NumberFormatException if a field is not an integer
+	 */
+	public Span (String str) {
+		String[] ss = str.split("[-:]");
+		if (ss.length % 2 != 0) {
+			return;
+		}
+		s = new int[ss.length/2][2];
+		for (int i = 0; i < ss.length; i+=2) {
+			s[i/2][0] = Integer.parseInt(ss[i]);
+			s[i/2][1] = Integer.parseInt(ss[i+1]);
+			length += s[i/2][1] - s[i/2][0];
+		}
+	}
+
+	/** @return the number of segments */
+	public int size () { return s.length; }
+
+	/** @return the sum of {@code end - start} over all segments */
+	public int length () { return length; }
+
+	/**
+	 * @param i segment index
+	 * @return the {@code [start, end]} pair of that segment (the internal array, not a copy)
+	 */
+	public int[] get (int i) { return s[i]; }
+
+	/**
+	 * @return a new span holding every segment except the first; an empty span
+	 *         when this span has at most one segment
+	 */
+	public Span tail () {
+		if (s.length <= 1) {
+			return new Span();
+		}
+		int[] ret = new int[(s.length-1)*2];
+		for (int i = 1; i < s.length; i++) {
+			ret[(i-1)*2] = s[i][0];
+			ret[i*2-1] = s[i][1];
+		}
+		return new Span(ret);
+	}
+
+	/**
+	 * Overlap score of two spans: 2 * intersect / (length of s1 + length of s2).
+	 *
+	 * @return the score, or -1 when the spans do not overlap at all
+	 */
+	public static double score (Span s1, Span s2) {
+		double a = 0;
+		double b = 0;
+		// every segment pair is checked; there is a more efficient way
+		for (int i = 0; i < s1.size(); i++) {
+			for (int j = 0; j < s2.size(); j++) {
+				a += overlap(s1.get(i), s2.get(j));
+			}
+		}
+		for (int i = 0; i < s1.size(); i++) {
+			b += s1.get(i)[1] - s1.get(i)[0];
+		}
+		for (int i = 0; i < s2.size(); i++) {
+			b += s2.get(i)[1] - s2.get(i)[0];
+		}
+		// a > 0 implies at least one non-empty segment, so b is non-zero here
+		return a==0 ? -1 : a/b;
+	}
+
+	/**
+	 * @return twice the number of offsets shared by the two segments,
+	 *         or 0 when they do not overlap
+	 */
+	private static int overlap (int[] a, int[] b) {
+		int shared = Math.min(a[1], b[1]) - Math.max(a[0], b[0]);
+		return shared > 0 ? 2 * shared : 0;
+	}
+
+	/** @return always 0 */
+	public double gap () {
+		return 0;
+	}
+
+	/**
+	 * @return the segments as {@code start-end} joined by {@code :};
+	 *         the empty string for an empty span
+	 */
+	@Override
+	public String toString () {
+		StringBuilder sb = new StringBuilder();
+		for (int[] seg : s) {
+			if (sb.length() > 0) {
+				sb.append(":");
+			}
+			sb.append(seg[0]).append("-").append(seg[1]);
+		}
+		return sb.toString();
+	}
+
+	/** Two spans are equal when they have the same segments in the same order. */
+	@Override
+	public boolean equals (Object o) {
+		if (this == o) return true;
+		if (! (o instanceof Span)) return false;
+		return java.util.Arrays.deepEquals(s, ((Span) o).s);
+	}
+
+	/** Hash of the textual form, so equal spans hash alike. */
+	@Override
+	public int hashCode () {
+		return toString().hashCode();
+	}
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/SpanOffsetComparator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/SpanOffsetComparator.java?rev=1480668&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/SpanOffsetComparator.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/SpanOffsetComparator.java
Thu May  9 14:48:14 2013
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.coreference.util;
+
+import java.util.Comparator;
+
+/**
+ * Orders spans segment by segment: first by start offset, then by end
+ * offset. When one span is a prefix of the other, the span with fewer
+ * segments comes first.
+ */
+public class SpanOffsetComparator implements Comparator<Span> {
+
+	/**
+	 * @return -1, 0 or 1 as {@code o1} sorts before, equal to, or after {@code o2}
+	 */
+	@Override
+	public int compare(Span o1, Span o2) {
+		int n1 = o1.size();
+		int n2 = o2.size();
+		int shared = Math.min(n1, n2);
+		// walk the segments in place instead of recursing over tail() copies
+		for (int k = 0; k < shared; k++) {
+			int[] a = o1.get(k);
+			int[] b = o2.get(k);
+			int c = compareInts(a[0], b[0]);
+			if (c == 0) {
+				c = compareInts(a[1], b[1]);
+			}
+			if (c != 0) {
+				return c;
+			}
+		}
+		// all shared segments are equal: the shorter span sorts first
+		return compareInts(n1, n2);
+	}
+
+	// Explicit comparison rather than subtraction, which can overflow
+	// and flip the sign for widely separated values.
+	private static int compareInts(int x, int y) {
+		return x < y ? -1 : (x == y ? 0 : 1);
+	}
+
+}



Mime
View raw message