Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 0677D200B51 for ; Mon, 1 Aug 2016 19:53:50 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 04F90160A6C; Mon, 1 Aug 2016 17:53:50 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 2225E160A66 for ; Mon, 1 Aug 2016 19:53:48 +0200 (CEST) Received: (qmail 9180 invoked by uid 500); 1 Aug 2016 17:53:48 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 9170 invoked by uid 99); 1 Aug 2016 17:53:48 -0000 Received: from pnap-us-west-generic-nat.apache.org (HELO spamd4-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Aug 2016 17:53:48 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd4-us-west.apache.org (ASF Mail Server at spamd4-us-west.apache.org) with ESMTP id CFDC1C0B9A for ; Mon, 1 Aug 2016 17:53:47 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd4-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: -0.426 X-Spam-Level: X-Spam-Status: No, score=-0.426 tagged_above=-999 required=6.31 tests=[KAM_LAZY_DOMAIN_SECURITY=1, RP_MATCHES_RCVD=-1.426] autolearn=disabled Received: from mx2-lw-us.apache.org ([10.40.0.8]) by localhost (spamd4-us-west.apache.org [10.40.0.11]) (amavisd-new, port 10024) with ESMTP id 2uP0Ee65Y7Gw for ; Mon, 1 Aug 2016 17:53:46 +0000 (UTC) Received: from mailrelay1-us-west.apache.org (mailrelay1-us-west.apache.org [209.188.14.139]) by mx2-lw-us.apache.org (ASF Mail Server at mx2-lw-us.apache.org) with ESMTP id C56875F613 for ; Mon, 1 
Aug 2016 17:53:45 +0000 (UTC) Received: from svn01-us-west.apache.org (svn.apache.org [10.41.0.6]) by mailrelay1-us-west.apache.org (ASF Mail Server at mailrelay1-us-west.apache.org) with ESMTP id 03A56E002B for ; Mon, 1 Aug 2016 17:53:43 +0000 (UTC) Received: from svn01-us-west.apache.org (localhost [127.0.0.1]) by svn01-us-west.apache.org (ASF Mail Server at svn01-us-west.apache.org) with ESMTP id A1D1D3A0046 for ; Mon, 1 Aug 2016 17:53:43 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1754783 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/GoldEventPrinterWithLabels.java Date: Mon, 01 Aug 2016 17:53:43 -0000 To: commits@ctakes.apache.org From: dligach@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20160801175343.A1D1D3A0046@svn01-us-west.apache.org> archived-at: Mon, 01 Aug 2016 17:53:50 -0000 Author: dligach Date: Mon Aug 1 17:53:42 2016 New Revision: 1754783 URL: http://svn.apache.org/viewvc?rev=1754783&view=rev Log: printe event data / single label per word Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/GoldEventPrinterWithLabels.java Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/GoldEventPrinterWithLabels.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/GoldEventPrinterWithLabels.java?rev=1754783&view=auto ============================================================================== --- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/GoldEventPrinterWithLabels.java (added) +++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/GoldEventPrinterWithLabels.java Mon Aug 1 17:53:42 2016 @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.ctakes.temporal.nn; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.ctakes.temporal.duration.Utils; +import org.apache.ctakes.temporal.eval.CommandLine; +import org.apache.ctakes.temporal.eval.THYMEData; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.textsem.EventMention; +import org.apache.ctakes.typesystem.type.textspan.Sentence; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CASException; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.pipeline.SimplePipeline; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; + +import com.lexicalscope.jewel.cli.CliFactory; +import com.lexicalscope.jewel.cli.Option; + +/** + * Read cTAKES annotations from 
XMI files.
 *
 * <p>For the train and dev patient sets, every sentence of every document is
 * written as one line of the form {@code <labels>|<tokens>}, where labels and
 * tokens are space-separated and each label is {@code 1} if the token is
 * covered by a gold {@link EventMention} and {@code 0} otherwise.
 *
 * @author dmitriy dligach
 */
public class GoldEventPrinterWithLabels {

  /**
   * Command-line options, parsed in {@link #main(String[])}.
   */
  static interface Options {

    /** Directory that holds the input XMI files. */
    @Option(longName = "xmi-dir")
    public File getInputDirectory();

    /** Patient sets to process. */
    @Option(longName = "patients")
    public CommandLine.IntegerRanges getPatients();

    /**
     * Output location for the training data. Despite the getter name,
     * {@link #main(String[])} treats this as a regular file that is
     * deleted and recreated on every run.
     */
    @Option(longName = "output-train")
    public File getTrainOutputDirectory();

    /**
     * Output location for the dev data. Despite the getter name,
     * {@link #main(String[])} treats this as a regular file that is
     * deleted and recreated on every run.
     */
    @Option(longName = "output-test")
    public File getTestOutputDirectory();
  }

  /**
   * Entry point: recreates both output files, then runs one pipeline over
   * the training documents and one over the dev documents.
   *
   * @param args command-line arguments described by {@link Options}
   * @throws Exception if option parsing, file handling, or a pipeline fails
   */
  public static void main(String[] args) throws Exception {

    Options options = CliFactory.parseArguments(Options.class, args);

    File trainFile = options.getTrainOutputDirectory();
    resetOutputFile(trainFile);
    File devFile = options.getTestOutputDirectory();
    resetOutputFile(devFile);

    List<Integer> patientSets = options.getPatients().getList();
    List<Integer> trainItems = THYMEData.getPatientSets(patientSets, THYMEData.TRAIN_REMAINDERS);
    List<Integer> devItems = THYMEData.getPatientSets(patientSets, THYMEData.DEV_REMAINDERS);

    List<File> trainFiles = Utils.getFilesFor(trainItems, options.getInputDirectory());
    List<File> devFiles = Utils.getFilesFor(devItems, options.getInputDirectory());

    // sort both file lists to eliminate platform specific dir listings
    Collections.sort(trainFiles);
    Collections.sort(devFiles);

    // write training data to file
    writeEvents(trainFiles, trainFile);

    // write dev data to file
    writeEvents(devFiles, devFile);
  }

  /**
   * Replaces {@code file} with a new empty file. The printer opens the file
   * in append mode, so leftovers from an earlier run must be removed first;
   * unlike {@link File#delete()}, a failure here raises an exception
   * instead of being silently ignored.
   */
  private static void resetOutputFile(File file) throws IOException {
    Files.deleteIfExists(file.toPath());
    Files.createFile(file.toPath());
  }

  /**
   * Runs an {@link EventPrinter} over {@code xmiFiles}, appending its
   * output to {@code outputFile}.
   */
  private static void writeEvents(List<File> xmiFiles, File outputFile) throws Exception {
    CollectionReader collectionReader = Utils.getCollectionReader(xmiFiles);
    // the parameter is declared as a String, so always pass the path (not a File)
    AnalysisEngine dataWriter = AnalysisEngineFactory.createEngine(
        EventPrinter.class,
        EventPrinter.PARAM_OUTPUT_FILE,
        outputFile.getAbsolutePath());
    SimplePipeline.runPipeline(collectionReader, dataWriter);
  }

  /**
   * Print one line per sentence: a 0/1 event label for each token, a
   * {@code |} separator, and the normalized tokens.
   *
   * @author dmitriy dligach
   */
  public static class EventPrinter extends JCasAnnotator_ImplBase {

    /** Name of the configuration parameter that carries the output path. */
    public static final String PARAM_OUTPUT_FILE = "OutputFile";

    @ConfigurationParameter(
        name = PARAM_OUTPUT_FILE,
        mandatory = true,
        description = "path to the output file")
    private String outputFile;

    /**
     * Appends one labeled line per sentence of {@code jCas} to the output file.
     *
     * @param jCas the document to print
     * @throws AnalysisEngineProcessException if a required view is missing
     *         or the output file cannot be written
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {

      // gold EventMention(s) are all in gold view;
      // system view has sentence segmentation, tokens, and dictionary lookup events
      JCas goldView;
      JCas systemView;
      try {
        goldView = jCas.getView("GoldView");
        systemView = jCas.getView("_InitialView");
      } catch (CASException e) {
        throw new AnalysisEngineProcessException(e);
      }

      List<String> labelsAndTokens = new ArrayList<>();
      for (Sentence sentence : JCasUtil.select(systemView, Sentence.class)) {
        List<String> sentenceTokens = new ArrayList<>(); // tokens in this sentence
        List<String> sentenceLabels = new ArrayList<>(); // label for each token in this sentence

        for (BaseToken baseToken : JCasUtil.selectCovered(systemView, BaseToken.class, sentence)) {
          sentenceTokens.add(tokenToString(baseToken));
          List<EventMention> events = JCasUtil.selectCovering(
              goldView, EventMention.class, baseToken.getBegin(), baseToken.getEnd());
          // "1" marks a token covered by a gold event, "0" any other token
          sentenceLabels.add(events.isEmpty() ? "0" : "1");
        }

        // line breaks inside a token would split the record across lines
        String sentenceAsString = String.join(" ", sentenceTokens).replaceAll("[\r\n]", " ");
        String labelsAsString = String.join(" ", sentenceLabels);
        labelsAndTokens.add(labelsAsString + "|" + sentenceAsString);
      }

      try {
        Files.write(Paths.get(outputFile), labelsAndTokens, StandardOpenOption.APPEND);
      } catch (IOException e) {
        // a failed write would silently produce incomplete data; abort instead
        throw new AnalysisEngineProcessException(e);
      }
    }
  }

  /**
   * Converts a token to the string that is written to the output file.
   * Make sure this matches how data was pre-processed for word2vec.
   *
   * <p>Newline tokens become the empty string, number tokens become
   * {@code number_token}, and contraction, punctuation, symbol, and word
   * tokens become their lower-cased covered text.
   *
   * @param token the token to convert
   * @return the normalized token string
   * @throws IllegalArgumentException if the token class is not one of the
   *         handled token types
   */
  public static String tokenToString(BaseToken token) {

    String tokenType = token.getClass().getSimpleName();
    // Locale.ROOT keeps the result independent of the JVM's default locale
    String tokenText = token.getCoveredText().toLowerCase(java.util.Locale.ROOT);

    switch (tokenType) {
      case "NewlineToken":
        return "";
      case "NumToken":
        return "number_token";
      case "ContractionToken":
      case "PunctuationToken":
      case "SymbolToken":
      case "WordToken":
        return tokenText;
      default:
        throw new IllegalArgumentException("Invalid token type: " + tokenType);
    }
  }
}