Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 7E50310278 for ; Wed, 14 Aug 2013 21:03:28 +0000 (UTC) Received: (qmail 53694 invoked by uid 500); 14 Aug 2013 21:03:28 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 53662 invoked by uid 500); 14 Aug 2013 21:03:28 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 53654 invoked by uid 99); 14 Aug 2013 21:03:28 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 14 Aug 2013 21:03:28 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 14 Aug 2013 21:03:26 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id EE32C2388831; Wed, 14 Aug 2013 21:03:05 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1514049 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion: cr/NegExAnnotation.java cr/NegExCorpusReader.java util/AssertionConst.java Date: Wed, 14 Aug 2013 21:03:05 -0000 To: commits@ctakes.apache.org From: james-masanz@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20130814210305.EE32C2388831@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: james-masanz Date: Wed Aug 14 21:03:05 2013 New Revision: 1514049 URL: http://svn.apache.org/r1514049 Log: 1st pass at a reader for negex gold standard. Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java (with props) ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java (with props) Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java?rev=1514049&view=auto ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java (added) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java Wed Aug 14 21:03:05 2013 @@ -0,0 +1,135 @@ +package org.apache.ctakes.assertion.cr; + +import java.util.Date; + +import org.apache.log4j.Logger; + +/** + * parses a line of data from the negex gold standard + * For the few instances where there is somethign wrong with the gold standard, corrects/rejects + * some mistakes in gold standard (e.g. where entity is longer than the sentence!) + */ +public class NegExAnnotation { + static Logger LOGGER = Logger.getLogger(NegExAnnotation.class); + + /** + * Each line consist of following fields + * line number + * Condition (from within sentence) + * sentence + * negation_status (Negated, Affirmed) also handles if possible) + * + * + * @param args + */ + + String lineNumber; + String entityCoveredText; + String sentenceText; + String polarity; // -1 means negated. 1 means not negated. Note, shares field with possible + String possible; // 1 means possible. 0 = either negated or affirmed. shares field with negated/polarity + String temporality; + String experiencer; + String begin; + String end; + + public NegExAnnotation(String lineWithAnnotation) { + + String s = lineWithAnnotation.trim(); + if (s.length()==0) throw new RuntimeException("no annotation or sentence data found"); + + String [] fields = lineWithAnnotation.split("\t"); + int numRequiredFields = 4; + if (fields.length < numRequiredFields) { + throw new RuntimeException("Not enough fields on line '" + lineWithAnnotation + "', need at least " + numRequiredFields + "fields."); + } + + lineNumber = fields[0].trim(); + + String INCORRECT_LINE1 = "OSTEOCHONDRAL IRREGULARITY WITHIN THE 45 DEGREE FLEXION ZONE OF THE LATERAL FEMORAL CONDYLE COMPATIBLE WITH OSTEOCHONDRAL LESION. INCREASED SCLEROSIS WITHIN THIS REGION"; + String CORRECTED_LINE1 = "OSTEOCHONDRAL IRREGULARITY WITHIN THE 45 DEGREE FLEXION ZONE OF THE LATERAL FEMORAL CONDYLE COMPATIBLE WITH OSTEOCHONDRAL LESION."; + + + entityCoveredText = fields[1].trim(); + if (entityCoveredText.toLowerCase().equals(INCORRECT_LINE1.toLowerCase())) { // correct an error in the gold standard + entityCoveredText = CORRECTED_LINE1; + } + if (entityCoveredText.length()<1) throw new RuntimeException("Error parsing entityCoveredText from line '" + lineWithAnnotation + "'"); + + if (entityCoveredText.startsWith("Pharynx good.")) entityCoveredText = "Pharynx good."; + if (entityCoveredText.toLowerCase().startsWith("neck: supple.")) entityCoveredText = entityCoveredText.substring(0,"NECK: Supple.".length()); + + String INCORRECT_LINE3 = "RIGHT THYROID: SATISFACTORY FOR INTERPRETATION. NEGATIVE FOR MALIGNANT CELLS. COLLOID NODULE"; + String CORRECTED_LINE3 = "RIGHT THYROID: SATISFACTORY FOR INTERPRETATION."; + if (entityCoveredText.toLowerCase().equals(INCORRECT_LINE3.toLowerCase())) { // correct an error in the gold standard + entityCoveredText = CORRECTED_LINE3; + } + + if (entityCoveredText.toLowerCase().equals("tolerating p.o. intake")) { + //1290 tolerating p.o. intake intake and voiding without difficulty and ambulating independently. Affirmed + LOGGER.warn("Unable to handle at this time because gold standard is incorrect"); + throw new RuntimeException("Skip this one as gold standard has a problem"); + } + sentenceText = fields[2].trim(); + String INCORRECT_LINE2 = "The patient states that she was able to tolerate some p.o."; + String CORRECTED_LINE2 = "The patient states that she was able to tolerate some p.o. fluids"; + if (sentenceText.equals(INCORRECT_LINE2)) sentenceText = CORRECTED_LINE2; // correct an error in the gold standard + + int position = sentenceText.toLowerCase().indexOf(entityCoveredText.toLowerCase()); + char DQUOTE = '"'; + if (position<0) { + if (entityCoveredText.charAt(0)==DQUOTE) entityCoveredText = entityCoveredText.substring(1); + int last = entityCoveredText.length()-1; + if (entityCoveredText.charAt(last)==DQUOTE) entityCoveredText = entityCoveredText.substring(0, last); + position = sentenceText.toLowerCase().indexOf(entityCoveredText.toLowerCase()); + if (position<0) { + throw new RuntimeException("Did not find entity text '" + entityCoveredText + "' within sentence '" + sentenceText + "'"); + } + } + String rest = sentenceText.substring(position+1); + if (rest.contains(entityCoveredText)) { + LOGGER.error("Assuming 2nd occurrence is correct occurenence of '" + entityCoveredText + "'."); + position = sentenceText.toLowerCase().indexOf(entityCoveredText.toLowerCase(), position+1); + //throw new RuntimeException("Unable to handle two occurences of entity within sentence"); + } + begin = position + ""; + end = (position + entityCoveredText.length()) + ""; + + String field3LowerCase = fields[3].trim().toLowerCase(); + + if (field3LowerCase.equals("possible")) { + polarity = "1"; + possible = "1"; + } else if (field3LowerCase.equals("affirmed")) { + polarity = "1"; + possible = "0"; + } else if (field3LowerCase.equals("negated")) { + polarity = "-1"; + possible = "0"; + } + + if (fields.length > 4 && fields[4]!=null && fields[4].length()>0) throw new RuntimeException("Does not support temporality yet"); + if (fields.length > 5 && fields[5]!=null && fields[5].length()>0) throw new RuntimeException("Does not support experiencer yet"); + + } + + public String toString() { + + String s = entityCoveredText + " (" + begin + ", " + end + ") polarity=" + polarity + " possible=" + possible; + s = s + "\n" + "in '" + sentenceText + "'"; + return s; + + } + /** + * test a single line + * @param args + */ + public static void main(String[] args) { + String line = "2 pulmonic regurgitation There is trace PULMONIC REGURGITATION. Affirmed"; + NegExAnnotation anno = new NegExAnnotation(line); + System.out.println("Was able to create NegExAnnotation successfully at " + new Date()); + System.out.println(anno.toString()); + + } + +} Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java?rev=1514049&view=auto ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java (added) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java Wed Aug 14 21:03:05 2013 @@ -0,0 +1,339 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.ctakes.assertion.cr; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.ctakes.assertion.util.AssertionConst; +import org.apache.ctakes.core.knowtator.KnowtatorAnnotation; +import org.apache.ctakes.core.knowtator.KnowtatorXMLParser; +import org.apache.ctakes.core.util.CtakesFileNamer; +import org.apache.ctakes.core.util.SHARPKnowtatorXMLDefaults; +import org.apache.ctakes.typesystem.type.constants.CONST; +import org.apache.ctakes.typesystem.type.refsem.BodyLaterality; +import org.apache.ctakes.typesystem.type.refsem.BodySide; +import org.apache.ctakes.typesystem.type.refsem.Course; +import org.apache.ctakes.typesystem.type.refsem.Date; +import org.apache.ctakes.typesystem.type.refsem.Event; +import org.apache.ctakes.typesystem.type.refsem.EventProperties; +import org.apache.ctakes.typesystem.type.refsem.LabReferenceRange; +import org.apache.ctakes.typesystem.type.refsem.LabValue; +import org.apache.ctakes.typesystem.type.refsem.MedicationDosage; +import org.apache.ctakes.typesystem.type.refsem.MedicationDuration; +import org.apache.ctakes.typesystem.type.refsem.MedicationForm; +import org.apache.ctakes.typesystem.type.refsem.MedicationFrequency; +import org.apache.ctakes.typesystem.type.refsem.MedicationRoute; +import org.apache.ctakes.typesystem.type.refsem.MedicationStatusChange; +import org.apache.ctakes.typesystem.type.refsem.MedicationStrength; +import org.apache.ctakes.typesystem.type.refsem.OntologyConcept; +import org.apache.ctakes.typesystem.type.refsem.ProcedureDevice; +import org.apache.ctakes.typesystem.type.refsem.ProcedureMethod; +import org.apache.ctakes.typesystem.type.refsem.Severity; +import org.apache.ctakes.typesystem.type.refsem.UmlsConcept; +import org.apache.ctakes.typesystem.type.relation.AffectsTextRelation; +import org.apache.ctakes.typesystem.type.relation.AspectualTextRelation; +import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation; +import org.apache.ctakes.typesystem.type.relation.ComplicatesDisruptsTextRelation; +import org.apache.ctakes.typesystem.type.relation.DegreeOfTextRelation; +import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation; +import org.apache.ctakes.typesystem.type.relation.ManagesTreatsTextRelation; +import org.apache.ctakes.typesystem.type.relation.ManifestationOfTextRelation; +import org.apache.ctakes.typesystem.type.relation.RelationArgument; +import org.apache.ctakes.typesystem.type.relation.ResultOfTextRelation; +import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation; +import org.apache.ctakes.typesystem.type.structured.DocumentID; +import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention; +import org.apache.ctakes.typesystem.type.textsem.BodyLateralityModifier; +import org.apache.ctakes.typesystem.type.textsem.BodySideModifier; +import org.apache.ctakes.typesystem.type.textsem.ConditionalModifier; +import org.apache.ctakes.typesystem.type.textsem.CourseModifier; +import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention; +import org.apache.ctakes.typesystem.type.textsem.EntityMention; +import org.apache.ctakes.typesystem.type.textsem.EventMention; +import org.apache.ctakes.typesystem.type.textsem.GenericModifier; +import org.apache.ctakes.typesystem.type.textsem.HistoryOfModifier; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textsem.LabEstimatedModifier; +import org.apache.ctakes.typesystem.type.textsem.LabInterpretationModifier; +import org.apache.ctakes.typesystem.type.textsem.LabMention; +import org.apache.ctakes.typesystem.type.textsem.LabReferenceRangeModifier; +import org.apache.ctakes.typesystem.type.textsem.LabValueModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationAllergyModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationDosageModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationDurationModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationFormModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationFrequencyModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationMention; +import org.apache.ctakes.typesystem.type.textsem.MedicationRouteModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationStatusChangeModifier; +import org.apache.ctakes.typesystem.type.textsem.MedicationStrengthModifier; +import org.apache.ctakes.typesystem.type.textsem.Modifier; +import org.apache.ctakes.typesystem.type.textsem.PolarityModifier; +import org.apache.ctakes.typesystem.type.textsem.ProcedureDeviceModifier; +import org.apache.ctakes.typesystem.type.textsem.ProcedureMention; +import org.apache.ctakes.typesystem.type.textsem.ProcedureMethodModifier; +import org.apache.ctakes.typesystem.type.textsem.SeverityModifier; +import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention; +import org.apache.ctakes.typesystem.type.textsem.SubjectModifier; +import org.apache.ctakes.typesystem.type.textsem.TimeMention; +import org.apache.ctakes.typesystem.type.textsem.UncertaintyModifier; +import org.apache.log4j.Logger; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.TOP; +import org.apache.uima.jcas.tcas.Annotation; +import org.jdom2.JDOMException; +import org.uimafit.component.JCasAnnotator_ImplBase; +import org.uimafit.component.xwriter.XWriter; +import org.uimafit.descriptor.ConfigurationParameter; +import org.uimafit.factory.AnalysisEngineFactory; +import org.uimafit.util.JCasUtil; + +import com.google.common.base.Charsets; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.google.common.io.Files; + +/** + * assumes knowtator xml files are in "exported-xml" subdirectory + * and the original plaintext files are in "text" subdirectory + * + */ +public class NegExCorpusReader extends JCasAnnotator_ImplBase { + static Logger LOGGER = Logger.getLogger(NegExCorpusReader.class); + + public static final String PARAM_TEXT_DIRECTORY = "TextDirectory"; + @ConfigurationParameter( + name = PARAM_TEXT_DIRECTORY, + description = "directory containing the text files (if DocumentIDs are just filenames); " + + "defaults to assuming that DocumentIDs are full file paths") + private File textDirectory; + + public static final String PARAM_SET_DEFAULTS = "SetDefaults"; + @ConfigurationParameter( + name = PARAM_SET_DEFAULTS, + description = "whether or not to set default attribute values if no annotation is present") + private boolean setDefaults; + + private static final Map SUBJECT_KNOWTATOR_TO_UIMA_MAP; + static { + SUBJECT_KNOWTATOR_TO_UIMA_MAP = Maps.newHashMap(); + SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("C0030705", CONST.ATTR_SUBJECT_PATIENT); + SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("patient", CONST.ATTR_SUBJECT_PATIENT); + SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("family_member", CONST.ATTR_SUBJECT_FAMILY_MEMBER); + SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_family_member", CONST.ATTR_SUBJECT_DONOR_FAMILY_MEMBER); + SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_other", CONST.ATTR_SUBJECT_DONOR_OTHER); + SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("other", CONST.ATTR_SUBJECT_OTHER); + } + + /** + * Get the URI that the text in this class was loaded from + */ + protected URI getTextURI(JCas jCas) throws AnalysisEngineProcessException { + + String textPath = JCasUtil.selectSingle(jCas, DocumentID.class).getDocumentID(); + if (this.textDirectory != null) { + textPath = this.textDirectory + File.separator + textPath; + } + + URI uri; + try { + uri = new URI(textPath); + } catch (URISyntaxException e) { + throw new AnalysisEngineProcessException(e); + } + + //LOGGER.info("textPath = " + textPath); + //LOGGER.info("uri = " + uri); + + + + + //File tmpFile = new File(textPath); // Note this does not work with something like "file:/C:/usr/data/MiPACQ/1/xml/0054074073-0.xml" + //LOGGER.info("tmpFile = " + tmpFile); + //URI answer = tmpFile.toURI(); + //LOGGER.info("answer = " + answer); + + return uri; + + } + + + +/** + * Returns the names of the annotators in the Knowtator files that represent the gold standard + */ + protected static String[] getAnnotatorNames() { + return new String[] { "cTAKES , Mayo Clinic", "CU annotator ,", "consensus set annotator team" , "cons annotator team", "cons team", "team" }; // these three are what are used by MiPACQ gold standard + } + + + private static List getDiseaseDisorderKnowtatorClasses() { + return Arrays.asList(new String [] {"Disorders"}); + } + + + private static List getSignSymptomKnowtatorClasses() { + return Arrays.asList(new String [] {"Sign_Symptom", "Finding"}); + } + + private static List getProcedureKnowtatorClasses() { + return Arrays.asList(new String [] { + "Diagnostic_procedure", + "Laboratory_procedure", + "Procedures", + "Therapeutic_or_preventive_procedure", + "Intervention", + "Health_care_activity", + "Research_activity"}); + } + + private static List getMedicationKnowtatorClasses() { + return Arrays.asList(new String [] {"Chemicals_and_drugs", "Pharmacologic_substance"}); + } + + private static List getAnatomyKnowtatorClasses() { + return Arrays.asList(new String [] {"Anatomy"}); + } + + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + // + } + + static String format(Annotation ann) { + String result; + if (ann.getEnd() == Integer.MIN_VALUE || ann.getBegin() == Integer.MAX_VALUE) { + result = ""; + } else { + result = String.format("\"%s\"[%d,%d]", ann.getCoveredText(), ann.getBegin(), ann.getEnd()); + } + return String.format("%s(%s)", ann.getClass().getSimpleName(), result); + } + + + + + + /** + * This main method is only for testing purposes. It runs the reader on Knowtator directories. + * args[0] = "/usr/data/MiPACQ/copies-of-just-clinical-knowtator-xml-and-text/"; + * should have a child directory called "text" + * should have a child directory called "exported-xml" + * files in knowtator xml directory should have files that end with .xml + */ + public static void main(String[] args) throws Exception { + + String filename; + if (args.length != 0) { + filename = args[0]; + } else { + try { + LOGGER.warn(String.format( + "usage: java %s path/to/negex/file ", + NegExCorpusReader.class.getName())); + } catch (IllegalArgumentException e) { + e.printStackTrace(); + } + Exception e = new RuntimeException("Going to continue with default values"); + LOGGER.warn(e.getLocalizedMessage()); + filename = AssertionConst.NEGEX_CORPUS; + } + + AnalysisEngine negexReader = AnalysisEngineFactory.createPrimitive(NegExCorpusReader.class); + + AnalysisEngine xWriter = AnalysisEngineFactory.createPrimitive( + XWriter.class, + XWriter.PARAM_OUTPUT_DIRECTORY_NAME, + AssertionConst.NEGEX_CORPUS_PREPROCESSED, + XWriter.PARAM_FILE_NAMER_CLASS_NAME, + CtakesFileNamer.class.getName() + ); + + // For each line of data in the file that contains the negex corpus, parse the line and process the data. + String [] lines = readNonWhiteSpaceLines(filename); + int n = lines.length; + LOGGER.info("Processing " + n + " lines from the negex file, treating each line as a document."); + + for (String data : lines) { + LOGGER.info("Processing line '" + data + "'."); + try { + NegExAnnotation a = new NegExAnnotation(data); + JCas jCas = negexReader.newJCas(); + jCas.setDocumentText(a.sentenceText); + DocumentID documentID = new DocumentID(jCas); + documentID.setDocumentID("doc" + a.lineNumber); + documentID.addToIndexes(); + IdentifiedAnnotation ia = new IdentifiedAnnotation(jCas); + ia.setBegin(Integer.parseInt(a.begin)); + ia.setEnd(Integer.parseInt(a.end)); + ia.setPolarity(Integer.parseInt(a.polarity)); + ia.addToIndexes(); + xWriter.process(jCas); + } catch (RuntimeException e) { + LOGGER.warn("Skipping this one because of RuntimeException"); + } + } + + } + + + + private static String[] readNonWhiteSpaceLines(String filename) { + List lines = new ArrayList(); + BufferedReader br = null; + try { + br = new BufferedReader(new FileReader(filename)); + String line; + while ((line=br.readLine())!=null) { + if (line.trim().length()>0) { + lines.add(line); + } + } + } catch (Exception e) { + // + } finally { + if (br!=null) + try { + br.close(); + } catch (IOException e1) { + e1.printStackTrace(); + } + } + return lines.toArray(new String[0]); + + } +} Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java?rev=1514049&r1=1514048&r2=1514049&view=diff ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java (original) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java Wed Aug 14 21:03:05 2013 @@ -34,10 +34,14 @@ public class AssertionConst { // expects subdirectories called exported-xml and text public static final String MiPACQ_CORPUS = DATA_DIR + "gold_standard/copies-of-just-clinical-knowtator-xml-and-text/"; + public static final String NEGEX_CORPUS = DATA_DIR + "gold_standard/negex/Annotations-1-120-random.txt"; + public static final String NEGEX_CORPUS_PREPROCESSED = DATA_DIR + "preprocessed_data/negex/"; + // Just plaintext files, which will be run through cTAKES, to generate XMI - attributes will then be judged // This in input for cTAKES; the output (evalOutputDir) can then be the input of the judge step. - public static final String CORPUS_WO_GOLD_STD_TO_RUN_THROUGH_CTAKES = DATA_DIR + "ActiveLearning/plaintext"; - + public static final String CORPUS_WO_GOLD_STD_TO_RUN_THROUGH_CTAKES = DATA_DIR + "ActiveLearning/plaintext"; + + // specify the model to write (train/crossvalidate) or read (test/crossvalidate). // please rename for different configurations of training data public static String modelDirectory = "../ctakes-assertion-res/resources/model/sharp-sprint-train";