ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1703425 - in /ctakes/trunk: ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ae/ ctakes-core/src/main/java/org/apache/ctakes/core/cc/ ctakes-core/src/main/java/org/apache/ctakes/core/util/ ctakes-temporal/src/main/...
Date Wed, 16 Sep 2015 15:21:22 GMT
Author: seanfinan
Date: Wed Sep 16 15:21:20 2015
New Revision: 1703425

URL: http://svn.apache.org/r1703425
Log:
CTAKES-376 Created WordTokenUtil with a getCanonicalForm method.  It returns the canonical form if
present, otherwise the covered text, otherwise the constant "MISSING_WORDTOKEN_TEXT".
Refactored code to use WordTokenUtil.getCanonicalForm, plus a little boy-scout cleanup.

Added:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/WordTokenUtil.java
Modified:
    ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ae/ExtractionPrepAnnotator.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/NormalizedFilesInDirectoryCasConsumer.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TemporalRelationRuleAnnotator.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/OverlappedHeadFeaturesExtractor.java

Modified: ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ae/ExtractionPrepAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ae/ExtractionPrepAnnotator.java?rev=1703425&r1=1703424&r2=1703425&view=diff
==============================================================================
--- ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ae/ExtractionPrepAnnotator.java
(original)
+++ ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ae/ExtractionPrepAnnotator.java
Wed Sep 16 15:21:20 2015
@@ -18,189 +18,174 @@
  */
 package org.apache.ctakes.clinicalpipeline.ae;
 
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.JFSIndexRepository;
-import org.apache.uima.jcas.cas.FSArray;
-import org.apache.uima.jcas.cas.TOP;
-import org.apache.uima.resource.ResourceInitializationException;
-
-import org.apache.ctakes.typesystem.type.refsem.OntologyConcept;
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.WordToken;
-import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
-import org.apache.ctakes.typesystem.type.textspan.Segment;
-import org.apache.ctakes.typesystem.type.util.Pair;
-import org.apache.ctakes.typesystem.type.util.Pairs;
+import org.apache.ctakes.core.util.WordTokenUtil;
+import org.apache.ctakes.typesystem.type.refsem.OntologyConcept;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.util.Pair;
+import org.apache.ctakes.typesystem.type.util.Pairs;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JFSIndexRepository;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
 
 /**
  * UIMA annotator that prepares the CAS for output - performs
  * some (final) updates to the CAS
- * 
+ *
  * @author Mayo Clinic
  */
 public class ExtractionPrepAnnotator extends JCasAnnotator_ImplBase {
-	private String iv_annotVerPropKey;
-	private int iv_annotVer;
+   private String iv_annotVerPropKey;
+   private int iv_annotVer;
 
-	/**
-	 * Method invoked by UIMA framework to initialize this annotator
-	 */
-	public void initialize(UimaContext aCtx)
-			throws ResourceInitializationException {
-		
-		super.initialize(aCtx);
-
-		try {
-			iv_annotVer = ((Integer) aCtx.getConfigParameterValue("AnnotationVersion")).intValue();
-			iv_annotVerPropKey = (String) aCtx.getConfigParameterValue("AnnotationVersionPropKey");
-		} catch (Exception e) {
-			throw new ResourceInitializationException(e);
-		}
-	
-	}
-
-	/**
-	 * Method invoked by UIMA framework to process a document
-	 */
-	public void process(JCas jcas)
-			throws AnalysisEngineProcessException {
-		generateUidValues(jcas);
-		generateTokenNormForms(jcas);
-		assignNamedEntityFeats(jcas);
-		storeAnnotationVersion(jcas);
-	}
-
-	
-	/**
-	 * Stores annotation version as a property JCas object.
-	 * 
-	 * @param jcas
-	 */
-	private void storeAnnotationVersion(JCas jcas) {
-	 	FSIterator<TOP> itr = jcas.getJFSIndexRepository().getAllIndexedFS(Pairs.type);
-		if (itr == null || !itr.hasNext())
-			return;
-
-		Pairs props = (Pairs) itr.next(); 
-
-		// create a new property array that is one item bigger
-		FSArray propArr = props.getPairs();
-		FSArray newPropArr = new FSArray(jcas, propArr.size() + 1);
-		for (int i = 0; i < propArr.size(); i++) {
-			newPropArr.set(i, propArr.get(i));
-		}
-
-		Pair annotVerProp = new Pair(jcas);    		
-		annotVerProp.setAttribute(iv_annotVerPropKey);
-		annotVerProp.setValue(String.valueOf(iv_annotVer));
-
-		// add annotation version prop as last item in array
-		newPropArr.set(newPropArr.size() - 1, annotVerProp);
-		props.setPairs(newPropArr);
-	}
-
-	/**
-	 * Generates UID values for all IdentifiedAnnotation objects.
-	 * This is just a numeric identifier, assigned sequentially.
-	 */
-	private void generateUidValues(JCas jcas) {
-		int uid = 0;
-		Iterator itr = jcas.getJFSIndexRepository().getAnnotationIndex(
-				IdentifiedAnnotation.type).iterator();
-		while (itr.hasNext()) {
-			IdentifiedAnnotation idAnnot = (IdentifiedAnnotation) itr.next();
-			idAnnot.setId(uid);
-			uid++;
-		}
-	}
-
-	/**
-	 * Generates normalized form for each token annotation.
-	 * Considers whether it is a <code>WordToken</code> with a canonical form
-	 */
-	private void generateTokenNormForms(JCas jcas) {
-		JFSIndexRepository indexes = jcas.getJFSIndexRepository();
-
-		// Determine and set the normalized form for each <code>BaseToken</code>
-		Iterator btaItr = indexes.getAnnotationIndex(BaseToken.type).iterator();
-		while (btaItr.hasNext()) {
-			BaseToken bta = (BaseToken) btaItr.next();
-			String normForm = null;
-			if (!(bta instanceof WordToken)) {
-				normForm = bta.getCoveredText();
-			} else {
-				WordToken wta = (WordToken) bta;
-				String canonicalForm = wta.getCanonicalForm();
-
-
-				// The norm form is the canonical form, if there is one
-				// Otherwise the norm form is the token's text.
-				if ((canonicalForm != null)	&& (canonicalForm.length() > 0)) {
-					normForm = canonicalForm;
-				} else {
-					normForm = wta.getCoveredText();
-				}
-			}
-			bta.setNormalizedForm(normForm);
-		}
-	}
-
-	/**
-	 * Assigns OID and segmentID values to NamedEntities
-	 */
-	private void assignNamedEntityFeats(JCas jcas) {
-		JFSIndexRepository indexes = jcas.getJFSIndexRepository();
-		// Set keySet = new HashSet();
-		// List dupList = new ArrayList();
-
-		Set segmentSet = new HashSet();
-		Iterator segmentItr = indexes.getAnnotationIndex(Segment.type).iterator();
-		while (segmentItr.hasNext()) {
-			segmentSet.add(segmentItr.next());
-		}
-
-		// For each NE, assign segment ID and assign ontology concept OIDs if applicable
-		Iterator neItr = indexes.getAnnotationIndex(IdentifiedAnnotation.type).iterator();
-		while (neItr.hasNext()) {
-			
-			IdentifiedAnnotation neAnnot = (IdentifiedAnnotation) neItr.next();
-
-			// assign segment ID
-			Iterator segItr = segmentSet.iterator();
-			while (segItr.hasNext()) {
-				Segment seg = (Segment) segItr.next();
-				// see if NE is inside this segment
-				if ((neAnnot.getBegin() >= seg.getBegin())
-						&& (neAnnot.getEnd() <= seg.getEnd())) {
-					// found segment for this NE
-					neAnnot.setSegmentID(seg.getId());
-					break;
-				}
-			}
-
-			// assign ontology concept OID values
-			FSArray ocArr = neAnnot.getOntologyConceptArr();
-			if (ocArr != null) {
-				for (int i = 0; i < ocArr.size(); i++) {
-					OntologyConcept oc = (OntologyConcept) ocArr.get(i);
-					String code = oc.getCode();
-					String scheme = oc.getCodingScheme();
-
-					StringBuffer oid = new StringBuffer();
-					oid.append(code);
-					oid.append("#");
-					oid.append(scheme);
-					oc.setOid(oid.toString());
-				}
-			}
-		}
-	}
+   /**
+    * Method invoked by UIMA framework to initialize this annotator
+    */
+   public void initialize( UimaContext aCtx )
+         throws ResourceInitializationException {
+
+      super.initialize( aCtx );
+
+      try {
+         iv_annotVer = ((Integer)aCtx.getConfigParameterValue( "AnnotationVersion" )).intValue();
+         iv_annotVerPropKey = (String)aCtx.getConfigParameterValue( "AnnotationVersionPropKey"
);
+      } catch ( Exception e ) {
+         throw new ResourceInitializationException( e );
+      }
+
+   }
+
+   /**
+    * Method invoked by UIMA framework to process a document
+    */
+   public void process( JCas jcas )
+         throws AnalysisEngineProcessException {
+      generateUidValues( jcas );
+      generateTokenNormForms( jcas );
+      assignNamedEntityFeats( jcas );
+      storeAnnotationVersion( jcas );
+   }
+
+
+   /**
+    * Stores annotation version as a property JCas object.
+    *
+    * @param jcas
+    */
+   private void storeAnnotationVersion( JCas jcas ) {
+      FSIterator<TOP> itr = jcas.getJFSIndexRepository().getAllIndexedFS( Pairs.type
);
+      if ( itr == null || !itr.hasNext() ) {
+         return;
+      }
+
+      Pairs props = (Pairs)itr.next();
+
+      // create a new property array that is one item bigger
+      FSArray propArr = props.getPairs();
+      FSArray newPropArr = new FSArray( jcas, propArr.size() + 1 );
+      for ( int i = 0; i < propArr.size(); i++ ) {
+         newPropArr.set( i, propArr.get( i ) );
+      }
+
+      Pair annotVerProp = new Pair( jcas );
+      annotVerProp.setAttribute( iv_annotVerPropKey );
+      annotVerProp.setValue( String.valueOf( iv_annotVer ) );
+
+      // add annotation version prop as last item in array
+      newPropArr.set( newPropArr.size() - 1, annotVerProp );
+      props.setPairs( newPropArr );
+   }
+
+   /**
+    * Generates UID values for all IdentifiedAnnotation objects.
+    * This is just a numeric identifier, assigned sequentially.
+    */
+   private void generateUidValues( JCas jcas ) {
+      int uid = 0;
+      Iterator itr = jcas.getJFSIndexRepository().getAnnotationIndex(
+            IdentifiedAnnotation.type ).iterator();
+      while ( itr.hasNext() ) {
+         IdentifiedAnnotation idAnnot = (IdentifiedAnnotation)itr.next();
+         idAnnot.setId( uid );
+         uid++;
+      }
+   }
+
+   /**
+    * Generates normalized form for each token annotation.
+    * Considers whether it is a <code>WordToken</code> with a canonical form
+    */
+   private void generateTokenNormForms( final JCas jcas ) {
+      final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
+      // Determine and set the normalized form for each <code>BaseToken</code>
+      for ( Annotation annotation : indexes.getAnnotationIndex( BaseToken.type ) ) {
+         if ( annotation instanceof WordToken ) {
+            ((WordToken)annotation).setNormalizedForm( WordTokenUtil.getCanonicalForm( (WordToken)annotation
) );
+         }
+      }
+   }
+
+   /**
+    * Assigns OID and segmentID values to NamedEntities
+    */
+   private void assignNamedEntityFeats( JCas jcas ) {
+      JFSIndexRepository indexes = jcas.getJFSIndexRepository();
+      // Set keySet = new HashSet();
+      // List dupList = new ArrayList();
+
+      Set segmentSet = new HashSet();
+      Iterator segmentItr = indexes.getAnnotationIndex( Segment.type ).iterator();
+      while ( segmentItr.hasNext() ) {
+         segmentSet.add( segmentItr.next() );
+      }
+
+      // For each NE, assign segment ID and assign ontology concept OIDs if applicable
+      Iterator neItr = indexes.getAnnotationIndex( IdentifiedAnnotation.type ).iterator();
+      while ( neItr.hasNext() ) {
+
+         IdentifiedAnnotation neAnnot = (IdentifiedAnnotation)neItr.next();
+
+         // assign segment ID
+         Iterator segItr = segmentSet.iterator();
+         while ( segItr.hasNext() ) {
+            Segment seg = (Segment)segItr.next();
+            // see if NE is inside this segment
+            if ( (neAnnot.getBegin() >= seg.getBegin())
+                 && (neAnnot.getEnd() <= seg.getEnd()) ) {
+               // found segment for this NE
+               neAnnot.setSegmentID( seg.getId() );
+               break;
+            }
+         }
+
+         // assign ontology concept OID values
+         FSArray ocArr = neAnnot.getOntologyConceptArr();
+         if ( ocArr != null ) {
+            for ( int i = 0; i < ocArr.size(); i++ ) {
+               OntologyConcept oc = (OntologyConcept)ocArr.get( i );
+               String code = oc.getCode();
+               String scheme = oc.getCodingScheme();
+
+               StringBuffer oid = new StringBuffer();
+               oid.append( code );
+               oid.append( "#" );
+               oid.append( scheme );
+               oc.setOid( oid.toString() );
+            }
+         }
+      }
+   }
 }
\ No newline at end of file

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/NormalizedFilesInDirectoryCasConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/NormalizedFilesInDirectoryCasConsumer.java?rev=1703425&r1=1703424&r2=1703425&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/NormalizedFilesInDirectoryCasConsumer.java
(original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/NormalizedFilesInDirectoryCasConsumer.java
Wed Sep 16 15:21:20 2015
@@ -18,83 +18,76 @@
  */
 package org.apache.ctakes.core.cc;
 
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.Iterator;
-
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.core.util.WordTokenUtil;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
 import org.apache.uima.collection.CasConsumer_ImplBase;
-import org.apache.uima.jcas.JFSIndexRepository;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JFSIndexRepository;
+import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.ResourceProcessException;
 
-import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
-import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import java.io.*;
 
 /**
- * For each CAS a local file with the document text is written to a directory specifed by
a parameter.  
- * This CAS consumer does not make use of any annotation information in the cas except for
the document 
- * id specified the CommonTypeSystem.xml descriptor.  The document id will be the name of
the file written 
- * for each CAS.  
- * 
- * This CAS consumer may be useful if you want to write the results of a collection reader
and/or CAS 
- * initializer to the local file system.  For example, a JDBC Collection Reader may read
XML documents 
- * from a database and a specialized cas initializer may convert the XML to plain text. 
The 
+ * For each CAS a local file with the document text is written to a directory specifed by
a parameter.
+ * This CAS consumer does not make use of any annotation information in the cas except for
the document
+ * id specified the CommonTypeSystem.xml descriptor.  The document id will be the name of
the file written
+ * for each CAS.
+ * <p/>
+ * This CAS consumer may be useful if you want to write the results of a collection reader
and/or CAS
+ * initializer to the local file system.  For example, a JDBC Collection Reader may read
XML documents
+ * from a database and a specialized cas initializer may convert the XML to plain text. 
The
  * FilesInDirectoryCasConsumer can now be used to write the plain text to local plain text
files.
  */
 
 public class NormalizedFilesInDirectoryCasConsumer extends CasConsumer_ImplBase {
 
-	public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+   public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+
+   File iv_outputDirectory;
 
-	File iv_outputDirectory;
-	
-	public void initialize() throws ResourceInitializationException 
-	{
-	    String outputDirectoryName = (String)getConfigParameterValue(PARAM_OUTPUTDIR);
-	    iv_outputDirectory = new File(outputDirectoryName);
-	    if(!iv_outputDirectory.exists() || !iv_outputDirectory.isDirectory())
-	    	throw new ResourceInitializationException(
-	    			new Exception("Parameter setting 'OutputDirectory' does not point to an existing
directory."));
-	}
-	
-	public void processCas(CAS cas) throws ResourceProcessException 
-	{
-		try 
-		{
-			JCas jcas;
-			jcas = cas.getJCas();
-		
-			StringBuffer normalizedText = new StringBuffer();
-			
-			JFSIndexRepository indexes = jcas.getJFSIndexRepository();
-	        Iterator<?> tokenItr = indexes.getAnnotationIndex(WordToken.type).iterator();
-	        while (tokenItr.hasNext())
-	        {
-	        	WordToken token = (WordToken) tokenItr.next();
-	        	String tokenNormText = token.getCanonicalForm();
-	        	normalizedText.append(tokenNormText+" ");	        
-	        }	        	
-			String documentID = DocumentIDAnnotationUtil.getDocumentID(jcas);
-			writeToFile(documentID, normalizedText.toString());
-		}
-		catch(Exception e)
-		{
-			throw new ResourceProcessException(e);
-		}
-	}
-	
-	private void writeToFile(String documentID, String documentText) throws IOException
-	{
-		File outputFile = new File(iv_outputDirectory, documentID);
-		outputFile.createNewFile();
-		OutputStream out = new BufferedOutputStream(new FileOutputStream(outputFile));
-		out.write(documentText.getBytes());
-		out.flush();
-		out.close();
-	}
+   public void initialize() throws ResourceInitializationException {
+      String outputDirectoryName = (String)getConfigParameterValue( PARAM_OUTPUTDIR );
+      iv_outputDirectory = new File( outputDirectoryName );
+      if ( !iv_outputDirectory.exists() || !iv_outputDirectory.isDirectory() ) {
+         throw new ResourceInitializationException(
+               new Exception( "Parameter setting 'OutputDirectory' does not point to an existing
directory." ) );
+      }
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void processCas( final CAS cas ) throws ResourceProcessException {
+      try {
+         final JCas jcas = cas.getJCas();
+         final StringBuilder normalizedText = new StringBuilder();
+         final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
+         // Determine and set the normalized form for each <code>BaseToken</code>
+         for ( Annotation annotation : indexes.getAnnotationIndex( WordToken.type ) ) {
+            if ( annotation instanceof WordToken ) {
+               normalizedText.append( WordTokenUtil.getCanonicalForm( (WordToken)annotation
) );
+               normalizedText.append( " " );
+            }
+         }
+         final String documentID = DocumentIDAnnotationUtil.getDocumentID( jcas );
+         writeToFile( documentID, normalizedText.toString() );
+      } catch ( CASException | IOException multE ) {
+         throw new ResourceProcessException( multE );
+      }
+   }
+
+   private void writeToFile( String documentID, String documentText ) throws IOException
{
+      File outputFile = new File( iv_outputDirectory, documentID );
+      outputFile.createNewFile();
+      OutputStream out = new BufferedOutputStream( new FileOutputStream( outputFile ) );
+      out.write( documentText.getBytes() );
+      out.flush();
+      out.close();
+   }
 }

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/WordTokenUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/WordTokenUtil.java?rev=1703425&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/WordTokenUtil.java
(added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/WordTokenUtil.java
Wed Sep 16 15:21:20 2015
@@ -0,0 +1,45 @@
+package org.apache.ctakes.core.util;
+
+
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.log4j.Logger;
+
+import javax.annotation.concurrent.Immutable;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/16/2015
+ */
+@Immutable
+final public class WordTokenUtil {
+
+   static private final Logger LOGGER = Logger.getLogger( "WordTokenUtil" );
+
+   static private final String MISSING_WORDTOKEN_TEXT = "MISSING_WORDTOKEN_TEXT";
+
+   private WordTokenUtil() {
+   }
+
+
+   /**
+    * In some pipelines LVG is not run, hence a canonical form does not exist.
+    * In order to prevent NPEs, this method checks for null values of canonical form and
covered text
+    *
+    * @param wordToken of interest
+    * @return The first non-null of the word token's canonical form, covered text or {@link
#MISSING_WORDTOKEN_TEXT}.
+    */
+   static public String getCanonicalForm( final WordToken wordToken ) {
+      final String canonicalForm = wordToken.getCanonicalForm();
+      if ( canonicalForm != null && !canonicalForm.isEmpty() ) {
+         return canonicalForm;
+      }
+      final String coveredText = wordToken.getCoveredText();
+      if ( coveredText == null ) {
+         return MISSING_WORDTOKEN_TEXT;
+      }
+      return coveredText;
+   }
+
+
+}

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TemporalRelationRuleAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TemporalRelationRuleAnnotator.java?rev=1703425&r1=1703424&r2=1703425&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TemporalRelationRuleAnnotator.java
(original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TemporalRelationRuleAnnotator.java
Wed Sep 16 15:21:20 2015
@@ -1,114 +1,115 @@
 package org.apache.ctakes.temporal.ae;
 
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.ctakes.temporal.ae.feature.DependencyParseUtils;
+import com.google.common.collect.Lists;
+import org.apache.ctakes.core.util.WordTokenUtil;
 import org.apache.ctakes.typesystem.type.relation.RelationArgument;
 import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.TimeMention;
-//import org.apache.ctakes.typesystem.type.textspan.Segment;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.jcas.JCas;
 import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
 import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
 
-import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+//import org.apache.ctakes.typesystem.type.textspan.Segment;
 
 public class TemporalRelationRuleAnnotator extends JCasAnnotator_ImplBase {
 
-	@SuppressWarnings("null")
-	@Override
-	public void process(JCas jCas) throws AnalysisEngineProcessException {
-
-		//1: linking E0-T0, E1-T1:
-		Collection<Sentence> sents = JCasUtil.select(jCas, Sentence.class);
-		List<Sentence> sentList = Lists.newArrayList();
-		sentList.addAll(sents);
-		EventMention admission = null;
-		//		EventMention discharge = null;
-		//		TimeMention  admissionDate = null;
-		//		TimeMention  dischargeDate = null;
-		int sentListLength 		= sentList.size();
-		if( sentListLength >=4 ){//the first 4 sentences are discharge date and admission date
-			for (int i=0; i<4; i+=2){
-				Sentence currentSent = sentList.get(i);
-				Sentence nextSent	 = sentList.get(i+1);
-				List<EventMention> currentEvents = JCasUtil.selectCovered(jCas, EventMention.class,
currentSent);
-				List<TimeMention>  nextTimes  = JCasUtil.selectCovered(jCas, TimeMention.class,
nextSent);
-
-				int currentSize = currentEvents == null ? 0 : currentEvents.size();
-				int nextTimeSize = nextTimes == null? 0 : nextTimes.size();
-
-				if(currentSize==0 || nextTimeSize ==0) continue;
-
-				EventMention currentEvent = currentEvents.get(0);
-				TimeMention  nextTime = nextTimes.get(0);
-
-				if(i == 0){
-					admission = currentEvent;
-					//					admissionDate = nextTime;
-					//				}else{
-					//					discharge = currentEvent;
-					//					dischargeDate = nextTime;
-				}
-
-				createRelation(jCas, currentEvent, nextTime, "OVERLAP");
-			}
-		}
-
-		//rule 3: link Timexes with the same strings
-		Collection<TimeMention> times = JCasUtil.select(jCas, TimeMention.class);
-		List<TimeMention> allTimes = Lists.newArrayList();
-		allTimes.addAll(times);
-		int timeNum = allTimes.size();
-		if(timeNum > 2){
-			for(int i=0; i<timeNum-1; i++){
-				TimeMention firstTime = allTimes.get(i);
-				for(int j=i+1;j<timeNum; j++){
-					TimeMention secondTime = allTimes.get(j);
-					if(sameTime(jCas, firstTime, secondTime)){
-						createRelation(jCas, secondTime, firstTime, "OVERLAP");
-					}
-				}
-			}
-		}
-
-		//2: linking coreferent event pairs, lift section restriction
-		Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
-
-		Collection<EventMention> allEvents = JCasUtil.select(jCas, EventMention.class);
-		List<EventMention> realEvents = new ArrayList<>();
-		//filtering events
-		for(EventMention event : allEvents){
-			// filter out ctakes events
-			if(event.getClass().equals(EventMention.class)){
-				realEvents.add(event);
-			}
-		}
-		allEvents = realEvents;
-
-		for(Sentence sent : sentences){
-			List<EventMention> currentEvents = JCasUtil.selectCovered(jCas, EventMention.class,
sent);
-			//filter out ctakes events
-			realEvents = new ArrayList<>();
-			for(EventMention event : currentEvents){
-				// filter out ctakes events
-				if(event.getClass().equals(EventMention.class)){
-					realEvents.add(event);
-				}
-			}
-			currentEvents = realEvents;
+   @SuppressWarnings( "null" )
+   @Override
+   public void process( JCas jCas ) throws AnalysisEngineProcessException {
+
+      //1: linking E0-T0, E1-T1:
+      Collection<Sentence> sents = JCasUtil.select( jCas, Sentence.class );
+      List<Sentence> sentList = Lists.newArrayList();
+      sentList.addAll( sents );
+      EventMention admission = null;
+      //		EventMention discharge = null;
+      //		TimeMention  admissionDate = null;
+      //		TimeMention  dischargeDate = null;
+      int sentListLength = sentList.size();
+      if ( sentListLength >= 4 ) {//the first 4 sentences are discharge date and admission
date
+         for ( int i = 0; i < 4; i += 2 ) {
+            Sentence currentSent = sentList.get( i );
+            Sentence nextSent = sentList.get( i + 1 );
+            List<EventMention> currentEvents = JCasUtil.selectCovered( jCas, EventMention.class,
currentSent );
+            List<TimeMention> nextTimes = JCasUtil.selectCovered( jCas, TimeMention.class,
nextSent );
+
+            int currentSize = currentEvents == null ? 0 : currentEvents.size();
+            int nextTimeSize = nextTimes == null ? 0 : nextTimes.size();
+
+            if ( currentSize == 0 || nextTimeSize == 0 ) {
+               continue;
+            }
+
+            EventMention currentEvent = currentEvents.get( 0 );
+            TimeMention nextTime = nextTimes.get( 0 );
+
+            if ( i == 0 ) {
+               admission = currentEvent;
+               //					admissionDate = nextTime;
+               //				}else{
+               //					discharge = currentEvent;
+               //					dischargeDate = nextTime;
+            }
+
+            createRelation( jCas, currentEvent, nextTime, "OVERLAP" );
+         }
+      }
+
+      //rule 3: link Timexes with the same strings
+      Collection<TimeMention> times = JCasUtil.select( jCas, TimeMention.class );
+      List<TimeMention> allTimes = Lists.newArrayList();
+      allTimes.addAll( times );
+      int timeNum = allTimes.size();
+      if ( timeNum > 2 ) {
+         for ( int i = 0; i < timeNum - 1; i++ ) {
+            TimeMention firstTime = allTimes.get( i );
+            for ( int j = i + 1; j < timeNum; j++ ) {
+               TimeMention secondTime = allTimes.get( j );
+               if ( sameTime( jCas, firstTime, secondTime ) ) {
+                  createRelation( jCas, secondTime, firstTime, "OVERLAP" );
+               }
+            }
+         }
+      }
+
+      //2: linking coreferent event pairs, lift section restriction
+      Collection<Sentence> sentences = JCasUtil.select( jCas, Sentence.class );
+
+      Collection<EventMention> allEvents = JCasUtil.select( jCas, EventMention.class
);
+      List<EventMention> realEvents = new ArrayList<>();
+      //filtering events
+      for ( EventMention event : allEvents ) {
+         // filter out ctakes events
+         if ( event.getClass().equals( EventMention.class ) ) {
+            realEvents.add( event );
+         }
+      }
+      allEvents = realEvents;
+
+      for ( Sentence sent : sentences ) {
+         List<EventMention> currentEvents = JCasUtil.selectCovered( jCas, EventMention.class,
sent );
+         //filter out ctakes events
+         realEvents = new ArrayList<>();
+         for ( EventMention event : currentEvents ) {
+            // filter out ctakes events
+            if ( event.getClass().equals( EventMention.class ) ) {
+               realEvents.add( event );
+            }
+         }
+         currentEvents = realEvents;
 
-			//get dependent pairs:
+         //get dependent pairs:
 //			int eventNum = currentEvents.size();
 //			if(eventNum >= 4){
 //				EventMention first = currentEvents.get(0);
@@ -135,110 +136,115 @@ public class TemporalRelationRuleAnnotat
 //				}
 //			}
 
-			//remove current Events from allEvents:
-			for(EventMention event:currentEvents){
-				allEvents.remove(event);
-				//check if current event is the admission event
-				if(admission != null && event!=admission && event.getCoveredText().toLowerCase().startsWith("admitted")){
-					createRelation(jCas, event, admission, "OVERLAP");
-				}
-			}
-
-			for(EventMention arg1: currentEvents){
-				for(EventMention arg2: allEvents){
-					if(hasOverlapNNs(jCas, arg1, arg2)){//hasSameSemanticType(jCas, arg1, arg2) &&

-						createRelation(jCas, arg2, arg1, "OVERLAP");
-					}
-				}
-			}
-
-		}
-	}
-
-	//	private static boolean hasSameSemanticType(JCas jCas, EventMention arg1,
-	//			EventMention arg2) {
-	//		List<EventMention> arg1Events = JCasUtil.selectCovered(jCas, EventMention.class,
arg1);
-	//		List<EventMention> arg2Events = JCasUtil.selectCovered(jCas, EventMention.class,
arg2);
-	//		for (EventMention event1 : arg1Events){
-	//			if(!event1.getClass().equals(EventMention.class)){//&& event1.getBegin()==arg1.getBegin()
&& event1.getEnd()==arg1.getEnd()){
-	//				for (EventMention event2 : arg2Events){
-	//					if(!event2.getClass().equals(EventMention.class)){// && event2.getBegin()==arg2.getBegin()
&& event2.getEnd()==arg2.getEnd()){
-	//						if(event1.getClass().equals(event2.getClass())){
-	//							return true;
-	//						}
-	//					}
-	//				}
-	//			}
-	//		}
-	//		return false;
-	//	}
-
-	private static boolean sameTime(JCas jCas, TimeMention firstTime,
-			TimeMention secondTime) {
-		List<BaseToken> currentTokens = JCasUtil.selectCovered(jCas, BaseToken.class, firstTime);
-		List<BaseToken> nextTokens    = JCasUtil.selectCovered(jCas, BaseToken.class, secondTime);
-		int tokenSize = currentTokens.size();
-		if(tokenSize != nextTokens.size()){
-			return false;
-		}
-		for(int i=0; i<tokenSize; i++){
-			if(!currentTokens.get(i).getCoveredText().equals(nextTokens.get(i).getCoveredText())){
-				return false;
-			}
-		}
-		return true;
-	}
-
-	private static void createRelation(JCas jCas, IdentifiedAnnotation arg1,
-			IdentifiedAnnotation arg2, String cagegory) {
-		RelationArgument relArg1 = new RelationArgument(jCas);
-		relArg1.setArgument(arg1);
-		relArg1.setRole("Arg1");
-		relArg1.addToIndexes();
-		RelationArgument relArg2 = new RelationArgument(jCas);
-		relArg2.setArgument(arg2);
-		relArg2.setRole("Arg2");
-		relArg2.addToIndexes();
-		TemporalTextRelation relation = new TemporalTextRelation(jCas);
-		relation.setArg1(relArg1);
-		relation.setArg2(relArg2);
-		relation.setCategory(cagegory);
-		relation.addToIndexes();
-	}
-
-	/**
-	 * Method for checking if two arguments share some common NNs ob VBs.
-	 * @param jCas
-	 * @param event1
-	 * @param event2
-	 * @return
-	 */
-	private static boolean hasOverlapNNs(JCas jCas, EventMention event1, EventMention event2)
{
-		List<WordToken> currentTokens = JCasUtil.selectCovered(jCas, WordToken.class, event1);
-		List<WordToken> nextTokens = JCasUtil.selectCovered(jCas, WordToken.class, event2);
-
-		int NNSize1 = 0;
-		int NNSize2 = 0;
-		int matches = 0;
-		for(WordToken t1: currentTokens){
-			if(t1.getPartOfSpeech().startsWith("NN")||t1.getPartOfSpeech().startsWith("VB")){
-				NNSize1 ++;
-				for(WordToken t2: nextTokens){
-					if(t2.getPartOfSpeech().startsWith("NN")||t2.getPartOfSpeech().startsWith("VB")){
-						NNSize2 ++;
-						if(t1.getCanonicalForm().equals(t2.getCanonicalForm())){
-							matches++;
-						}						
-					}
-				}
-
-			}
-		}
-		int NNSize = Math.min(NNSize1, NNSize2);
-		if (NNSize == 0) return false;
-		float matchRatio = (float)matches/NNSize;
-		if( matchRatio == 1)
-			return true;
-		return false;
-	}
+         //remove current Events from allEvents:
+         for ( EventMention event : currentEvents ) {
+            allEvents.remove( event );
+            //check if current event is the admission event
+            if ( admission != null && event != admission &&
+                 event.getCoveredText().toLowerCase().startsWith( "admitted" ) ) {
+               createRelation( jCas, event, admission, "OVERLAP" );
+            }
+         }
+
+         for ( EventMention arg1 : currentEvents ) {
+            for ( EventMention arg2 : allEvents ) {
+               if ( hasOverlapNNs( jCas, arg1, arg2 ) ) {//hasSameSemanticType(jCas, arg1,
arg2) &&
+                  createRelation( jCas, arg2, arg1, "OVERLAP" );
+               }
+            }
+         }
+
+      }
+   }
+
+   //	private static boolean hasSameSemanticType(JCas jCas, EventMention arg1,
+   //			EventMention arg2) {
+   //		List<EventMention> arg1Events = JCasUtil.selectCovered(jCas, EventMention.class,
arg1);
+   //		List<EventMention> arg2Events = JCasUtil.selectCovered(jCas, EventMention.class,
arg2);
+   //		for (EventMention event1 : arg1Events){
+   //			if(!event1.getClass().equals(EventMention.class)){//&& event1.getBegin()==arg1.getBegin()
&& event1.getEnd()==arg1.getEnd()){
+   //				for (EventMention event2 : arg2Events){
+   //					if(!event2.getClass().equals(EventMention.class)){// && event2.getBegin()==arg2.getBegin()
&& event2.getEnd()==arg2.getEnd()){
+   //						if(event1.getClass().equals(event2.getClass())){
+   //							return true;
+   //						}
+   //					}
+   //				}
+   //			}
+   //		}
+   //		return false;
+   //	}
+
+   private static boolean sameTime( JCas jCas, TimeMention firstTime,
+                                    TimeMention secondTime ) { // true when both mentions cover token-for-token identical text
+      List<BaseToken> currentTokens = JCasUtil.selectCovered( jCas, BaseToken.class,
firstTime );
+      List<BaseToken> nextTokens = JCasUtil.selectCovered( jCas, BaseToken.class, secondTime
);
+      int tokenSize = currentTokens.size();
+      if ( tokenSize != nextTokens.size() ) { // different token counts can never match
+         return false;
+      }
+      for ( int i = 0; i < tokenSize; i++ ) { // compare covered text position by position (case-sensitive equals)
+         if ( !currentTokens.get( i ).getCoveredText().equals( nextTokens.get( i ).getCoveredText()
) ) {
+            return false;
+         }
+      }
+      return true; // also reached when both mentions cover zero BaseTokens
+   }
+
+   private static void createRelation( JCas jCas, IdentifiedAnnotation arg1,
+                                       IdentifiedAnnotation arg2, String cagegory ) { // "cagegory" (sic) is the relation category; callers in this file pass "OVERLAP"
+      RelationArgument relArg1 = new RelationArgument( jCas ); // wrap arg1 with role "Arg1"
+      relArg1.setArgument( arg1 );
+      relArg1.setRole( "Arg1" );
+      relArg1.addToIndexes();
+      RelationArgument relArg2 = new RelationArgument( jCas ); // wrap arg2 with role "Arg2"
+      relArg2.setArgument( arg2 );
+      relArg2.setRole( "Arg2" );
+      relArg2.addToIndexes();
+      TemporalTextRelation relation = new TemporalTextRelation( jCas ); // relation linking the two wrapped arguments
+      relation.setArg1( relArg1 );
+      relation.setArg2( relArg2 );
+      relation.setCategory( cagegory );
+      relation.addToIndexes(); // both arguments and the relation itself are added to the CAS indexes
+   }
+
+   /**
+    * Checks whether two events share their noun or verb tokens, i.e. WordTokens whose
+    * part-of-speech tag starts with "NN" or "VB".
+    * <p>
+    * Every NN/VB token of event1 is paired with every NN/VB token of event2, and a pair counts
+    * as a match when WordTokenUtil.getCanonicalForm returns equal strings for both tokens.
+    *
+    * @param jCas   CAS from which the WordTokens covered by each event are selected
+    * @param event1 first event; its covered WordTokens drive the outer loop
+    * @param event2 second event; its covered WordTokens drive the inner loop
+    * @return true only when matches divided by min(NNSize1, NNSize2) is exactly 1; false otherwise,
+    *         including when either event covers no WordToken or that minimum is 0
+    */
+   private static boolean hasOverlapNNs( final JCas jCas, final EventMention event1, final
EventMention event2 ) {
+      final Collection<WordToken> currentTokens = JCasUtil.selectCovered( jCas, WordToken.class,
event1 );
+      final Collection<WordToken> nextTokens = JCasUtil.selectCovered( jCas, WordToken.class,
event2 );
+      if ( currentTokens == null || currentTokens.isEmpty() || nextTokens == null || nextTokens.isEmpty()
) { // nothing to compare
+         return false;
+      }
+      int NNSize1 = 0;
+      int NNSize2 = 0;
+      int matches = 0;
+      for ( WordToken t1 : currentTokens ) {
+         if ( t1.getPartOfSpeech().startsWith( "NN" ) || t1.getPartOfSpeech().startsWith(
"VB" ) ) { // NOTE(review): assumes getPartOfSpeech() is never null - confirm
+            NNSize1++;
+            for ( WordToken t2 : nextTokens ) {
+               if ( t2.getPartOfSpeech().startsWith( "NN" ) || t2.getPartOfSpeech().startsWith(
"VB" ) ) {
+                  NNSize2++; // NOTE(review): runs once per (t1, t2) pair, so it accumulates across outer iterations instead of counting event2's NN/VB tokens once - confirm intended
+                  if ( WordTokenUtil.getCanonicalForm( t1 ).equals( WordTokenUtil.getCanonicalForm(
t2 ) ) ) {
+                     matches++;
+                  }
+               }
+            }
+         }
+      }
+      final int NNSize = Math.min( NNSize1, NNSize2 );
+      if ( NNSize == 0 ) { // no NN/VB pair was counted; also guards the division below
+         return false;
+      }
+      final float matchRatio = (float)matches / NNSize;
+      // Try to avoid [float1] == [float2] primitive comparison
+      return Float.compare( matchRatio, 1f ) == 0;
+   }
+
 }

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/OverlappedHeadFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/OverlappedHeadFeaturesExtractor.java?rev=1703425&r1=1703424&r2=1703425&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/OverlappedHeadFeaturesExtractor.java
(original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/OverlappedHeadFeaturesExtractor.java
Wed Sep 16 15:21:20 2015
@@ -1,79 +1,87 @@
 package org.apache.ctakes.temporal.ae.feature;
 
-import java.util.ArrayList;
-import java.util.List;
-
 import org.apache.ctakes.relationextractor.ae.features.TokenFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
 import org.cleartk.ml.Feature;
-import org.apache.uima.fit.util.JCasUtil;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
 
 /**
  * Extract the overlapping head words of two arguments. Head words: the NNs of NP + the VBs
of VP
- * @author CH151862
  *
+ * @author CH151862
  */
 public class OverlappedHeadFeaturesExtractor extends TokenFeaturesExtractor {
 
-	@Override
-	public List<Feature> extract(JCas jCas, IdentifiedAnnotation mention1, IdentifiedAnnotation
mention2)
-			throws AnalysisEngineProcessException {
-		List<Feature> features = new ArrayList<>();
-		Annotation arg1 = mention1;
-		Annotation arg2 = mention2;
-		
-		String featName = "overlappingHeadTerms";
-
-		//iterate through the tokens of two arguments
-		List<WordToken> currentTokens = JCasUtil.selectCovered(jCas, WordToken.class, arg1);
-		List<WordToken> nextTokens = JCasUtil.selectCovered(jCas, WordToken.class, arg2);
-		
-		int headSize1 = 0;
-		int headSize2 = 0;
-		int headSize  = 0;
-		int longHeadSize = 0;
-		int matches = 0;
-		for(WordToken t1: currentTokens){
-			String t1_pos = t1.getPartOfSpeech();
-			if(t1_pos.startsWith("NN")||t1_pos.startsWith("VB")){
-				headSize1 ++;
-				for(WordToken t2: nextTokens){
-					String t2_pos = t2.getPartOfSpeech();
-					if(t2_pos.startsWith("NN")||t2_pos.startsWith("VB")){
-						headSize2 ++;
-						String t1str = t1.getCanonicalForm();
-						String t2str = t2.getCanonicalForm();
-						if(t1str.equals(t2str)){
-							features.add(new Feature(featName+"_CanoticalForm", t1str));
-							features.add(new Feature(featName+"_length", t1str.length()));
-							features.add(new Feature(featName+"_POS", t1_pos));
-							matches++;
-						}
-					}
-				}
-			}
-		}
-		if(matches > 0){
-			headSize = Math.min(headSize1, headSize2);
-			longHeadSize = Math.max(headSize1, headSize2);
-			
-			//feature of counting times of matches
-			features.add(new Feature(featName+"_count", matches));
-			
-			//ratio of the count of matches to the shorter length of tokens between the two arguments
-			float matchShortRatio = (float)matches/headSize;
-			features.add(new Feature(featName+"_shortRatio", matchShortRatio));
-			
-			//ratio of the count of matches to the longer length of tokens between the two arguments
-			float matchLongRatio  = (float)matches/longHeadSize;
-			features.add(new Feature(featName+"_longRatio", matchLongRatio));
-		}
-		
-		return features;
-	}
+   static private final String FEATURE_NAME_ROOT = "overlappingHeadTerms";
+   static private final String NOT_NN_VB_POS = "NOT_NN_VB_POS";
+
+   /**
+    * {@inheritDoc}
+    * <p>
+    * Builds features from the overlapping head words of the two mentions, where a head word is a
+    * covered WordToken whose part-of-speech tag starts with "NN" or "VB".
+    * <p>
+    * For every pair of head words with equal, non-null canonical forms three features are added:
+    * overlappingHeadTerms_CanonicalForm, overlappingHeadTerms_length and overlappingHeadTerms_POS
+    * (the POS of the token from mention1). When at least one pair matched, overlappingHeadTerms_count,
+    * overlappingHeadTerms_shortRatio and overlappingHeadTerms_longRatio are added as well.
+    *
+    * @param jCas     CAS from which the WordTokens covered by each mention are selected
+    * @param mention1 first argument; its tokens drive the outer loop
+    * @param mention2 second argument; its tokens drive the inner loop
+    * @return the collected features; an empty list when either mention covers no WordToken
+    *         or when no pair of head words matched
+    */
+   @Override
+   public List<Feature> extract( final JCas jCas,
+                                 final IdentifiedAnnotation mention1,
+                                 final IdentifiedAnnotation mention2 ) throws AnalysisEngineProcessException
{
+      final Collection<WordToken> currentTokens = JCasUtil.selectCovered( jCas, WordToken.class,
mention1 );
+      final Collection<WordToken> nextTokens = JCasUtil.selectCovered( jCas, WordToken.class,
mention2 );
+      if ( currentTokens == null || currentTokens.isEmpty() || nextTokens == null || nextTokens.isEmpty()
) { // nothing to compare
+         return Collections.emptyList();
+      }
+      final List<Feature> features = new ArrayList<>();
+      //iterate through the tokens of two arguments
+      int headSize1 = 0;
+      int headSize2 = 0;
+      int matches = 0;
+      for ( WordToken t1 : currentTokens ) {
+         final String t1_pos = getNnVbPos( t1 );
+         if ( !t1_pos.equals( NOT_NN_VB_POS ) ) { // t1 is an NN/VB head word
+            headSize1++;
+            for ( WordToken t2 : nextTokens ) {
+               if ( !getNnVbPos( t2 ).equals( NOT_NN_VB_POS ) ) {
+                  headSize2++; // NOTE(review): runs once per (t1, t2) pair, so it accumulates across outer iterations - confirm intended
+                  final String t1str = t1.getCanonicalForm();
+                  if ( t1str != null && t1str.equals( t2.getCanonicalForm() ) ) { // a null canonical form on t1 never matches
+                     features.add( createFeature( "CanonicalForm", t1str ) ); // NOTE(review): the removed code spelled this suffix "_CanoticalForm"; anything keyed on the old feature name may need updating - confirm
+                     features.add( createFeature( "length", t1str.length() ) );
+                     features.add( createFeature( "POS", t1_pos ) );
+                     matches++;
+                  }
+               }
+            }
+         }
+      }
+      if ( matches > 0 ) { // a match implies both head counts are at least 1, so the divisions below are safe
+         //feature of counting times of matches
+         features.add( createFeature( "count", matches ) );
+         //ratio of the count of matches to the shorter length of tokens between the two
arguments
+         final float matchShortRatio = (float)matches / (float)Math.min( headSize1, headSize2
);
+         features.add( createFeature( "shortRatio", matchShortRatio ) );
+         //ratio of the count of matches to the longer length of tokens between the two arguments
+         final float matchLongRatio = (float)matches / (float)Math.max( headSize1, headSize2
);
+         features.add( createFeature( "longRatio", matchLongRatio ) );
+      }
+      return features;
+   }
+
+   static private String getNnVbPos( final BaseToken baseToken ) { // returns the token's POS tag when it starts with "NN" or "VB", otherwise the NOT_NN_VB_POS sentinel
+      final String pos = baseToken.getPartOfSpeech(); // NOTE(review): dereferenced below without a null check - confirm POS is always set
+      if ( pos.startsWith( "NN" ) || pos.startsWith( "VB" ) ) {
+         return pos;
+      }
+      return NOT_NN_VB_POS; // callers compare against this constant with equals()
+   }
+
+   static private Feature createFeature( final String suffix, final Object value ) { // builds a Feature named FEATURE_NAME_ROOT + "_" + suffix holding the given value
+      return new Feature( FEATURE_NAME_ROOT + "_" + suffix, value );
+   }
 
 }



Mime
View raw message