ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1583702 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration: ./ EventDurationDistribution.java
Date Tue, 01 Apr 2014 15:08:53 GMT
Author: dligach
Date: Tue Apr  1 15:08:53 2014
New Revision: 1583702

URL: http://svn.apache.org/r1583702
Log:
created new package for duration-related stuff

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventDurationDistribution.java
  (with props)

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventDurationDistribution.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventDurationDistribution.java?rev=1583702&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventDurationDistribution.java
(added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventDurationDistribution.java
Tue Apr  1 15:08:53 2014
@@ -0,0 +1,251 @@
+package org.apache.ctakes.temporal.duration;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.relationextractor.eval.XMIReader;
+import org.apache.ctakes.temporal.ae.feature.duration.Utils;
+import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Multiset;
+
+/**
+ * Extract durations of event mentions (e.g. sign/symptom or disease/disorder).
+ * 
+ * @author dmitriy dligach
+ */
+public class EventDurationDistribution {
+
+  private static Class<? extends EventMention> targetClass = SignSymptomMention.class;
+  
+  /**
+   * Command-line options; main() fills an instance of this class
+   * by handing it to an args4j CmdLineParser.
+   */
+  public static class Options  {
+
+    // directory whose files are listed in main() and passed to the collection reader
+    @Option(
+        name = "--input-dir",
+        usage = "specify the path to the directory containing the xmi files",
+        required = true)
+    public File inputDirectory;
+  }
+  
+	public static void main(String[] args) throws Exception {
+		
+	  Options options = new Options();
+	  CmdLineParser parser = new CmdLineParser(options);
+	  parser.parseArgument(args);
+	  
+	  
+		List<File> trainFiles = Arrays.asList(options.inputDirectory.listFiles());
+    CollectionReader collectionReader = getCollectionReader(trainFiles);
+		
+    AnalysisEngine temporalDurationExtractor = AnalysisEngineFactory.createPrimitive(
+    		TemporalDurationExtractor.class);
+    		
+		SimplePipeline.runPipeline(collectionReader, temporalDurationExtractor);
+	}
+  
+  public static class TemporalDurationExtractor extends JCasAnnotator_ImplBase {
+    
+    // regular expression to match temporal durations in time mention annotations
+    private final static String REGEX = "(sec|min|hour|hrs|day|week|wk|month|year|yr|decade)";
+    
+    // mapping between temporal durations and their normalized forms
+    private final static Map<String, String> MAPPING = ImmutableMap.<String, String>builder()
+        .put("sec", "second")
+        .put("min", "minute")
+        .put("hour", "hour")
+        .put("hrs", "hour")
+        .put("day", "day")
+        .put("week", "week")
+        .put("wk", "week")
+        .put("month", "month")
+        .put("year", "year")
+        .put("yr", "year")
+        .put("decade", "decade")
+        .build(); 
+    
+    // unique temporal bins; all time mentions will be classified into one of them
+    private final static List<String> BINS = Arrays.asList(
+        "second",
+        "minute",
+        "hour",
+        "day",
+        "week",
+        "month",
+        "year",
+        "decade");
+    
+    // max distance between an event and the time mention that defines the event's duration
+    private final static int MAXDISTANCE = 2;
+
+    // regex to match different time granularities (e.g. 'day', 'month')
+    Pattern pattern = Pattern.compile(REGEX, Pattern.CASE_INSENSITIVE);
+    
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+      Collection<DocumentID> ids = JCasUtil.select(jCas, DocumentID.class);
+      String fileName = ids.iterator().next().getDocumentID();
+      String mentionText = fileName.split("\\.")[0]; // e.g. "smoker.txt"
+
+      // counts of different time granularities for this sign/symptom
+      Multiset<String> durationDistribution = HashMultiset.create();
+
+      for(EventMention mention : JCasUtil.select(jCas, targetClass)) {
+        if(mention.getCoveredText().equals(mentionText)) {
+          if(isNegated(jCas, mention) || isMedicationPattern(jCas, mention)) {
+            continue;
+          }
+          
+          TimeMention nearestTimeMention = getNearestTimeMention(jCas, mention);
+          if(nearestTimeMention != null) {
+            Matcher matcher = pattern.matcher(nearestTimeMention.getCoveredText());
+
+            // need the loop to handle things like 'several days/weeks'
+            while(matcher.find()) {
+              String matchedDuration = matcher.group(); // e.g. "wks"
+              String normalizedDuration = MAPPING.get(matchedDuration);
+              durationDistribution.add(normalizedDuration);
+            }
+          }
+        }
+      }
+
+      if(durationDistribution.size() > 0) { 
+        System.out.println(Utils.formatDistribution(mentionText, durationDistribution, ",
", true) + "[" + durationDistribution.size() + " instances]");
+      }else{
+        System.out.println(mentionText + ": No duration information found.");
+      }
+    }
+    
+    /**
+     * Return true if sign/symptom is negated.
+     * TODO: using rules for now; switch to using a negation module
+     */
+    private static boolean isNegated(JCas jCas, EventMention mention) {
+      
+      for(BaseToken token : JCasUtil.selectPreceding(jCas, BaseToken.class, mention, 3))
{
+        if(token.getCoveredText().equals("no")) {
+          return true;
+        }
+      }
+      
+      return false;
+    }
+
+    /**
+     * Return true of this is a medication pattern. 
+     * E.g. five (5) ml po qid  (4 times a day) as needed for heartburn for 2 weeks.
+     */
+    private static boolean isMedicationPattern(JCas jCas, EventMention mention) {
+      
+      for(BaseToken token : JCasUtil.selectPreceding(jCas, BaseToken.class, mention, 1))
{
+        if(token.getCoveredText().equals("for")) {
+          return true;
+        }
+      }
+           
+      return false;
+    }
+    
+    /**
+     * Find nearest time mention that is within allowable distance. 
+     * Return null if none found.
+     */
+    private static TimeMention getNearestTimeMention(JCas jCas, EventMention mention) {
+
+      List<TimeMention> timeMentions = JCasUtil.selectFollowing(jCas, TimeMention.class,
mention, 1);
+      if(timeMentions.size() < 1) {
+        return null;
+      }
+      
+      assert timeMentions.size() == 1;
+      
+      TimeMention nearestTimeMention = timeMentions.get(0);
+      int distance = JCasUtil.selectBetween(jCas, BaseToken.class, mention, nearestTimeMention).size();
+      if(distance > MAXDISTANCE) {
+        return null;
+      }
+      
+      return nearestTimeMention;
+    }
+    
+    @SuppressWarnings("unused")
+    private static String getAnnotationContext(Annotation annotation, int maxContextWindowSize)
{
+      
+      String text = annotation.getCAS().getDocumentText();
+      int begin = Math.max(0, annotation.getBegin() - maxContextWindowSize);
+      int end = Math.min(text.length(), annotation.getEnd() + maxContextWindowSize);
+      
+      return text.substring(begin, end).replaceAll("[\r\n]", " ");
+    }
+    
+    @SuppressWarnings("unused")
+    private static String formatDistribution(Multiset<String> durationDistribution)
{
+      
+      List<String> durationBins = Arrays.asList("second", "minute", "hour", "day",
"week", "month", "year", "decade");
+      List<Integer> durationValues = new LinkedList<Integer>();
+      
+      for(String durationBin : durationBins) {
+        durationValues.add(durationDistribution.count(durationBin));
+      }
+
+      Joiner joiner = Joiner.on(',');
+      return joiner.join(durationValues);
+    }
+  }
+  
+  /**
+   * Create an XMI collection reader over the given files, ordered by
+   * ascending file size. The list passed in is left untouched.
+   *
+   * @param items files to read
+   * @return a collection reader configured with the paths of the files
+   * @throws Exception if the reader cannot be created
+   */
+  private static CollectionReader getCollectionReader(List<File> items) throws Exception {
+
+    // sort a private copy: sorting the argument itself would reorder the
+    // caller's list and fail on an unmodifiable one
+    File[] sortedItems = items.toArray(new File[items.size()]);
+    Arrays.sort(sortedItems, new FileSizeComparator());
+
+    String[] paths = new String[sortedItems.length];
+    for (int i = 0; i < paths.length; ++i) {
+      paths[i] = sortedItems[i].getPath();
+    }
+
+    return CollectionReaderFactory.createCollectionReader(
+        XMIReader.class,
+        XMIReader.PARAM_FILES,
+        paths);
+  }
+  
+  public static class FileSizeComparator implements Comparator<File> {
+
+    @Override
+    public int compare(File o1, File o2) {
+      if(o1.length() > o2.length()){
+        return 1;
+      }else if(o1.length() < o2.length()){
+        return -1;
+      }else{
+        return 0;
+      }
+    } 
+  }
+}

Propchange: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventDurationDistribution.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message