ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From stevenbeth...@apache.org
Subject svn commit: r1486371 - in /ctakes/trunk/ctakes-temporal/src/main: java/org/apache/ctakes/temporal/ae/TimeAnnotator.java java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java resources/org/apache/ctakes/temporal/time_word_types.txt
Date Sat, 25 May 2013 22:03:45 GMT
Author: stevenbethard
Date: Sat May 25 22:03:44 2013
New Revision: 1486371

URL: http://svn.apache.org/r1486371
Log:
Adds features based on gazetteer of time expression words to TimeAnnotator. Gazetteer was
derived from normalization grammar: https://github.com/bethard/timenorm/blob/master/src/main/resources/info/bethard/timenorm/en.grammar

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java
  (with props)
    ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/time_word_types.txt
  (with props)
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java?rev=1486371&r1=1486370&r2=1486371&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java	(original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java	Sat May 25 22:03:44 2013
@@ -22,6 +22,7 @@ import java.io.File;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.ctakes.temporal.ae.feature.TimeWordTypeExractor;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.textsem.TimeMention;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
@@ -93,7 +94,8 @@ public class TimeAnnotator extends Tempo
         new CoveredTextExtractor(),
         new CharacterCategoryPatternExtractor(PatternType.REPEATS_MERGED),
         new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
-        new TypePathExtractor(BaseToken.class, "partOfSpeech"));
+        new TypePathExtractor(BaseToken.class, "partOfSpeech"),
+        new TimeWordTypeExractor());
 
     this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
     this.tokenFeatureExtractors.add(allExtractors);

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java?rev=1486371&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java	(added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java	Sat May 25 22:03:44 2013
@@ -0,0 +1,57 @@
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.timeml.util.TimeWordsExtractor;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.Maps;
+import com.google.common.io.Resources;
+
+public class TimeWordTypeExractor implements SimpleFeatureExtractor {
+  
+  private static final String FEATURE_NAME = "TimeWordType";
+  
+  private static final String LOOKUP_PATH = "/org/apache/ctakes/temporal/time_word_types.txt";
+  
+  private Map<String, String> wordTypes;
+  
+  public TimeWordTypeExractor() throws ResourceInitializationException {
+    this.wordTypes = Maps.newHashMap();
+    URL url = TimeWordsExtractor.class.getResource(LOOKUP_PATH);
+    try {
+      for (String line : Resources.readLines(url, Charsets.US_ASCII)) {
+        String[] typeAndWord = line.split("\\s+");
+        if (typeAndWord.length != 2) {
+          throw new IllegalArgumentException("Expected '<type> <word>', found: " + line);
+        }
+        this.wordTypes.put(typeAndWord[1], typeAndWord[0]);
+      }
+    } catch (IOException e) {
+      throw new ResourceInitializationException(e);
+    }
+  }
+
+  @Override
+  public List<Feature> extract(JCas view, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+    String type = this.wordTypes.get(focusAnnotation.getCoveredText().toLowerCase());
+    List<Feature> features;
+    if (type == null) {
+      features = Collections.emptyList();
+    } else {
+      features = Collections.singletonList(new Feature(FEATURE_NAME, type));
+    }
+    return features;
+  }
+}

Propchange: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeWordTypeExractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/time_word_types.txt
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/time_word_types.txt?rev=1486371&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/time_word_types.txt	(added)
+++ ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/time_word_types.txt	Sat May 25 22:03:44 2013
@@ -0,0 +1,177 @@
+Number one
+Number two
+Number three
+Number four
+Number five
+Number six
+Number seven
+Number eight
+Number nine
+Number ten
+Number eleven
+Number twelve
+Number thirteen
+Number fourteen
+Number fifteen
+Number sixteen
+Number seventeen
+Number eighteen
+Number nineteen
+Number twenty
+Number thirty
+Number forty
+Number fifty
+Number sixty
+Number seventy
+Number eighty
+Number ninety
+Number hundred
+Number thousand
+Number couple
+Unit second
+Unit minute
+Unit hour
+Unit day
+Unit week
+Unit month
+Unit period
+Unit quarter
+Unit year
+Unit decade
+Unit century
+Unit seconds
+Unit minutes
+Unit hours
+Unit days
+Unit weeks
+Unit months
+Unit quarters
+Unit years
+Unit decades
+Unit centuries
+PartOfDay morning
+PartOfDay afternoon
+PartOfDay evening
+PartOfDay night
+PartOfDay overnight
+DayOfWeek monday
+DayOfWeek tuesday
+DayOfWeek wednesday
+DayOfWeek thursday
+DayOfWeek friday
+DayOfWeek saturday
+DayOfWeek sunday
+DayOfWeek mon
+DayOfWeek tue
+DayOfWeek tues
+DayOfWeek wed
+DayOfWeek thu
+DayOfWeek thur
+DayOfWeek thurs
+DayOfWeek fri
+DayOfWeek sat
+DayOfWeek sun
+WeekendOfWeek weekend
+MonthOfYear january
+MonthOfYear february
+MonthOfYear march
+MonthOfYear april
+MonthOfYear may
+MonthOfYear june
+MonthOfYear july
+MonthOfYear august
+MonthOfYear september
+MonthOfYear october
+MonthOfYear november
+MonthOfYear december
+MonthOfYear jan
+MonthOfYear feb
+MonthOfYear mar
+MonthOfYear apr
+MonthOfYear may
+MonthOfYear jun
+MonthOfYear jul
+MonthOfYear aug
+MonthOfYear sep
+MonthOfYear sept
+MonthOfYear oct
+MonthOfYear nov
+MonthOfYear dec
+SeasonOfYear spring
+SeasonOfYear summer
+SeasonOfYear fall
+SeasonOfYear autumn
+SeasonOfYear winter
+DecadeOfCentury twenties
+DecadeOfCentury thirties
+DecadeOfCentury forties
+DecadeOfCentury fifties
+DecadeOfCentury sixties
+DecadeOfCentury seventies
+DecadeOfCentury eighties
+DecadeOfCentury nineties
+Time now
+Time today
+Time tonight
+Time yesterday
+Time tomorrow
+Time noon
+Time midday
+TimeReference previous
+TimeReference previously
+TimeReference recent
+TimeReference recently
+TimeReference current
+TimeReference currently
+TimeReference already
+TimeReference yet
+TimeReference future
+TimeReference soon
+Frequency every
+Frequency each
+Frequency hourly
+Frequency daily
+Frequency weekly
+Frequency monthly
+Frequency quarterly
+Frequency yearly
+Frequency annually
+Frequency mornings
+Frequency afternoons
+Frequency evenings
+Frequency nights
+Frequency springs
+Frequency summers
+Frequency falls
+Frequency autumns
+Frequency winters
+Adjuster last
+Adjuster past
+Adjuster previous
+Adjuster preceding
+Adjuster latest
+Adjuster earlier
+Adjuster ago
+Adjuster next
+Adjuster coming
+Adjuster following
+Modifier almost
+Modifier about
+Modifier around
+Modifier less
+Modifier than
+Modifier nearly
+Modifier more
+Modifier over
+Modifier least
+Modifier end
+Modifier start
+Modifier beginning
+Modifier early
+Modifier earlier
+Modifier mid
+Modifier middle
+Modifier late
+Modifier later
+Modifier part
+Modifier post

Propchange: ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/time_word_types.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/time_word_types.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message