ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1562132 - /ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java
Date Tue, 28 Jan 2014 18:04:34 GMT
Author: dligach
Date: Tue Jan 28 18:04:34 2014
New Revision: 1562132

URL: http://svn.apache.org/r1562132
Log:
outputting stats about polysemy wrt semantic type

Modified:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java?rev=1562132&r1=1562131&r2=1562132&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java Tue Jan 28 18:04:34 2014
@@ -9,6 +9,7 @@ import java.util.Set;
 import org.apache.ctakes.core.cr.XMIReader;
 import org.apache.ctakes.typesystem.type.refsem.OntologyConcept;
 import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.uima.analysis_engine.AnalysisEngine;
@@ -30,10 +31,7 @@ import com.google.common.collect.Lists;
 
 /**
  * 
- * Read XMI files and apply a consumer that extracts relation features for a downstream component.
- * 
  * @author dmitriy dligach
- *
  */
 public class Analyze {
 
@@ -49,7 +47,7 @@ public class Analyze {
 
     @Option(
         name = "--output-dir",
-        usage = "specify the path to the directory where the training data will be placed",
+        usage = "specify the path to the output directory",
         required = false)
     public File outputDirectory;
   }
@@ -59,10 +57,10 @@ public class Analyze {
     Options options = new Options();
     options.parseOptions(args);
 
-    List<File> trainFiles = Arrays.asList(options.inputDirectory.listFiles());
-    String[] paths = new String[trainFiles.size()];
+    List<File> xmiFiles = Arrays.asList(options.inputDirectory.listFiles());
+    String[] paths = new String[xmiFiles.size()];
     for (int i = 0; i < paths.length; ++i) {
-      paths[i] = trainFiles.get(i).getPath();
+      paths[i] = xmiFiles.get(i).getPath();
     }
 
     CollectionReader xmiCollectionReader = CollectionReaderFactory.createCollectionReader(
@@ -70,12 +68,12 @@ public class Analyze {
         XMIReader.PARAM_FILES,
         paths);
     
-    AnalysisEngine featureExtractorAe = AnalysisEngineFactory.createPrimitive(DoSomething.class);
+    AnalysisEngine consumer = AnalysisEngineFactory.createPrimitive(Consumer.class);
         
-    SimplePipeline.runPipeline(xmiCollectionReader, featureExtractorAe);
+    SimplePipeline.runPipeline(xmiCollectionReader, consumer);
   }
   
-  public static class DoSomething extends JCasAnnotator_ImplBase{
+  public static class Consumer extends JCasAnnotator_ImplBase {
 
     @Override
     public void process(JCas jCas) throws AnalysisEngineProcessException {
@@ -88,14 +86,25 @@ public class Analyze {
       }   
       
       for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) {
+        // for some reason in gold begin offset for some mentions is a huge number
+        if(mention.getBegin() > jCas.getDocumentText().length()) {
+          continue;
+        }
         
+        String text = mention.getCoveredText().toLowerCase();
+        String semanticType = mention.getClass().getSimpleName();
+        System.out.format("%s|%s\n", text, semanticType);
+      }
+      
+      for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
         // avoid weird crashes
-        if(mention.getBegin() == 2147483647) {
+        if(mention.getBegin() > jCas.getDocumentText().length()) {
           continue;
         }
         
-        Set<String> codes = getOntologyConceptCodes(mention);
-        System.out.println(mention.getCoveredText().toLowerCase() + "," + codes.size());
+        String text = mention.getCoveredText().toLowerCase();
+        String semanticType = mention.getClass().getSimpleName();
+        System.out.format("%s|%s\n", text, semanticType);
       }
     }
     



Mime
View raw message