Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 8F21B108B6 for ; Thu, 19 Feb 2015 18:06:26 +0000 (UTC) Received: (qmail 79543 invoked by uid 500); 19 Feb 2015 18:06:21 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 79465 invoked by uid 500); 19 Feb 2015 18:06:21 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 78917 invoked by uid 99); 19 Feb 2015 18:06:21 -0000 Received: from eris.apache.org (HELO hades.apache.org) (140.211.11.105) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 19 Feb 2015 18:06:21 +0000 Received: from hades.apache.org (localhost [127.0.0.1]) by hades.apache.org (ASF Mail Server at hades.apache.org) with ESMTP id 0A7D2AC04EC for ; Thu, 19 Feb 2015 18:06:21 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1660963 [19/19] - in /ctakes/sandbox/timelanes: META-INF/ edu/ edu/mayo/ edu/mayo/bmi/ edu/mayo/bmi/annotation/ edu/mayo/bmi/annotation/knowtator/ org/ org/chboston/ org/chboston/cnlp/ org/chboston/cnlp/anafora/ org/chboston/cnlp/anafora/a... 
Date: Thu, 19 Feb 2015 18:06:17 -0000 To: commits@ctakes.apache.org From: seanfinan@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20150219180621.0A7D2AC04EC@hades.apache.org> Added: ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlus.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlus.java?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlus.java (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlus.java Thu Feb 19 18:06:13 2015 @@ -0,0 +1,181 @@ +package org.chboston.cnlp.timeline.timespan.plus; + +import net.jcip.annotations.Immutable; +import org.chboston.cnlp.timeline.timespan.AbstractTimeSpan; +import org.chboston.cnlp.timeline.timespan.EndPointer; + +/** + * Author: SPF + * Affiliation: CHIP-NLP + * Date: 7/30/13 + */ +@Immutable +final public class TimeSpanPlus extends AbstractTimeSpan implements PointedTimeSpan { + + static public final TimeSpanPlus UNKNOWN_TIMESPAN_PLUS + = new TimeSpanPlus( TimeEndPoint.NULL_END_POINT, TimeEndPoint.NULL_END_POINT ); + + + final private TimeEndPoint _startTime; + final private TimeEndPoint _stopTime; + + public TimeSpanPlus( final TimeEndPoint startTime, final TimeEndPoint stopTime ) { + if ( startTime.getMillis() > stopTime.getMillis() ) { + // Hopefully this never happens ... 
+ _startTime = new TimeEndPoint( EndPointer.OVERLAP, stopTime.getMillis(), stopTime.isFuzzy() ); + _stopTime = new TimeEndPoint( EndPointer.OVERLAP, startTime.getMillis(), startTime.isFuzzy() ); + return; + } + _startTime = startTime; + _stopTime = stopTime; + } + + public TimeEndPoint getStartTime() { + return _startTime; + } + + public TimeEndPoint getStopTime() { + return _stopTime; + } + + + /** + * {@inheritDoc} + */ + @Override + public long getStartMillis() { + return _startTime.getMillis(); + } + + /** + * {@inheritDoc} + */ + @Override + public long getStopMillis() { + return _stopTime.getMillis(); + } + + + /** + * {@inheritDoc} + */ + @Override + public boolean isFuzzyDate() { + return _startTime.isFuzzy() || _stopTime.isFuzzy(); + } + + public String getRelationText() { + if ( this.equals( UNKNOWN_TIMESPAN_PLUS ) ) { + return "Unknown"; + } + final EndPointer startPointer = _startTime.getPointer(); + final EndPointer stopPointer = _stopTime.getPointer(); + String prefix = null; + if ( isSingleDate() ) { + if ( startPointer == EndPointer.BEFORE + && stopPointer == EndPointer.BEFORE ) { + prefix = "Occurs Before"; + } else if ( startPointer == EndPointer.AFTER + && stopPointer == EndPointer.AFTER ) { + prefix = "Occurs After"; + } else if ( startPointer == EndPointer.BEFORE + && stopPointer == EndPointer.EQUAL ) { + prefix = "Ends on"; + } else if ( startPointer == EndPointer.EQUAL + && stopPointer == EndPointer.AFTER ) { + prefix = "Begins on"; + } + } + if ( prefix == null || prefix.isEmpty() ) { + if ( startPointer == EndPointer.BEFORE + && stopPointer == EndPointer.BEFORE ) { + prefix = "Starts before, ends within"; + } else if ( startPointer == EndPointer.BEFORE + && stopPointer == EndPointer.EQUAL ) { + prefix = "Starts before, ends with"; + } else if ( startPointer == EndPointer.BEFORE + && stopPointer == EndPointer.OVERLAP ) { + prefix = "Starts before, overlaps"; + } else if ( startPointer == EndPointer.BEFORE + && stopPointer == EndPointer.AFTER 
) { + prefix = "Starts before, ends after"; + + } else if ( startPointer == EndPointer.AFTER + && stopPointer == EndPointer.BEFORE ) { + prefix = "Is Within"; + } else if ( startPointer == EndPointer.AFTER + && stopPointer == EndPointer.EQUAL ) { + prefix = "Starts within, ends with"; + } else if ( startPointer == EndPointer.AFTER + && stopPointer == EndPointer.OVERLAP ) { + prefix = "Starts within, overlaps"; + } else if ( startPointer == EndPointer.AFTER + && stopPointer == EndPointer.AFTER ) { + prefix = "Starts within, ends after"; + + } else if ( startPointer == EndPointer.EQUAL + && stopPointer == EndPointer.BEFORE ) { + prefix = "Starts with, ends before"; + } else if ( startPointer == EndPointer.EQUAL + && stopPointer == EndPointer.EQUAL ) { + prefix = "Starts with, ends with"; + } else if ( startPointer == EndPointer.EQUAL + && stopPointer == EndPointer.OVERLAP ) { + prefix = "Starts with, overlaps"; + } else if ( startPointer == EndPointer.EQUAL + && stopPointer == EndPointer.AFTER ) { + prefix = "Starts with, ends after"; + + } else if ( startPointer == EndPointer.OVERLAP + && stopPointer == EndPointer.BEFORE ) { + prefix = "Overlaps, ends within"; + } else if ( startPointer == EndPointer.OVERLAP + && stopPointer == EndPointer.EQUAL ) { + prefix = "Overlaps, ends with"; + } else if ( startPointer == EndPointer.OVERLAP + && stopPointer == EndPointer.OVERLAP ) { + prefix = "Overlaps"; + } else if ( startPointer == EndPointer.OVERLAP + && stopPointer == EndPointer.AFTER ) { + prefix = "Overlaps, ends after"; + } + } + return prefix; + } + + public String getSpanText() { + if ( this.equals( UNKNOWN_TIMESPAN_PLUS ) ) { + return ""; + } + return super.toString(); + } + + /** + * {@inheritDoc} + */ + @Override + public String toString() { + return getRelationText() + " " + getSpanText(); + } + + + /** + * {@inheritDoc} + */ + @Override + public int hashCode() { + return _startTime.hashCode() + 3 * _stopTime.hashCode(); + } + + /** + * {@inheritDoc} + */ + 
@Override + public boolean equals( final Object object ) { + return object instanceof TimeSpanPlus + && ((TimeSpanPlus)object)._startTime.equals( _startTime ) + && ((TimeSpanPlus)object)._stopTime.equals( _stopTime ); + } + + +} Added: ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlusComparator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlusComparator.java?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlusComparator.java (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/timespan/plus/TimeSpanPlusComparator.java Thu Feb 19 18:06:13 2015 @@ -0,0 +1,56 @@ +package org.chboston.cnlp.timeline.timespan.plus; + + +import java.util.Comparator; + +/** + * Author: SPF + * Affiliation: CHIP-NLP + * Date: 8/2/13 + */ +public enum TimeSpanPlusComparator implements Comparator { + INSTANCE; + + static public TimeSpanPlusComparator getInstance() { + return INSTANCE; + } + + /** + * {@inheritDoc} + */ + @Override + public int compare( final PointedTimeSpan timeSpan1, final PointedTimeSpan timeSpan2 ) { + final int startComparison = TimeEndPointComparator.getInstance().compare( timeSpan1.getStartTime(), + timeSpan2.getStartTime() ); + if ( startComparison != 0 ) { + return startComparison; + } + return TimeEndPointComparator.getInstance().compare( timeSpan1.getStopTime(), timeSpan2.getStopTime() ); + } + + static private enum TimeEndPointComparator implements Comparator { + INSTANCE; + + static public TimeEndPointComparator getInstance() { + return INSTANCE; + } + + /** + * {@inheritDoc} + */ + @Override + public int compare( final TimeEndPoint endPoint1, final TimeEndPoint endPoint2 ) { + final long millis1 = endPoint1.getMillis(); + final long millis2 = endPoint2.getMillis(); + if ( millis1 < millis2 ) { + return -1; + } else if ( millis2 
< millis1 ) { + return 1; + } + return endPoint1.getPointer().getOrder() - endPoint2.getPointer().getOrder(); + } + + } + + +} Added: ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/todo.txt URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/todo.txt?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/todo.txt (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/todo.txt Thu Feb 19 18:06:13 2015 @@ -0,0 +1,27 @@ +ok 1. UmlsEvent to use RelativeTimeSpan instead of its own TimexRel enum +1. Create UmlsEvent by Relation parse with RelativeTimeSpan +2. Modify TimeSpanRenderer Before and After +3. Get rid of Linked Scrollers +4. Add to Semantic Type collection on Search +5. Add remove button to left of each event lane + +4. Add button to expand / collapse semantic type + +10. Fix update on Header for Events + + + + + +X?- Colors on dates for UMLS types +X?- Add I2B2 Color Scheme +- Get Semantic Types for Events +- Cull / Combine by coreference +- "Lifeline Date" header listing dates w/o overlap that change with zoom +- "&" and "|" searches +?- Sort by "criticality" of event +- Link timexRel events across timespans : AFTER> - - - entities = annotationStore.getNamedEntities(); + try ( Writer writer = new BufferedWriter( new FileWriter( outputFile ) ) ) { + for ( Entity entity : entities ) { + String cui = entity.getAttributeValue( DefinedAttributeType.CUI ); + if ( cui == null || cui.isEmpty() ) { + cui = "UNKNOWN"; + } + String tui = entity.getAttributeValue( DefinedAttributeType.TUI ); + if ( tui == null || tui.isEmpty() ) { + tui = "UNKNOWN"; + } + writer.write( entity.getTextSpan().getStartIndex() + "," + entity.getTextSpan().getEndIndex() + + " " + cui + "_" + tui + " " + entity.getSpannedText() + "\n" ); + } + writer.write( "Total Words: " + annotationStore.getWordCount() + "\n"); + writer.write( "Total 
Annotations: " + entities.size() + "\n" ); + } catch (IOException ioE ) { + LOGGER.severe( ioE.getMessage() ); + } + } + + + static private void copyGoldEntityXmls( final File inputDir, final File outputDir ) { + final String[] fileNames = inputDir.list(); + if ( fileNames == null ) { + return; + } + File bestXml = null; + long longestLength = 0; + for ( String fileName : fileNames ) { + if ( fileName.endsWith( ".UMLS-Entity.gold.completed.xml" ) ) { + bestXml = new File( inputDir, fileName ); + break; + } + if ( fileName.contains( ".UMLS-Entity" ) ) { + final File entityXml = new File( inputDir, fileName ); + if ( entityXml.length() > longestLength ) { + bestXml = entityXml; + longestLength = entityXml.length(); + } + } + } + if ( bestXml == null ) { + return; + } + final AnnotationsParser parser = new AnaforaXmlParser(); + parser.setDocumentTextFile( new File( inputDir, inputDir.getName() + ".txt" ) ); + parser.parseFile( bestXml.getPath() ); + final AnnotationStore annotationStore = parser.getAnnotationStore(); + final File outputFile = new File( outputDir, bestXml.getName() + ".out" ); + writeCuis( outputFile, annotationStore ); + } + + + public static void main( String... 
args ) { + final String inputParentPath = "C:\\Spiffy\\prj_thyme\\data\\internal\\annotations\\release_gold\\ColonCancer";//args[0]; + final String outputDirPath = "C:\\Spiffy\\prj_thyme\\output\\temp\\release_gold_cuis";//args[1]; + final File outputDir = new File( outputDirPath ); + + final File inputParentDir = new File( inputParentPath ); + final File[] inputDirs = inputParentDir.listFiles(); + for ( File inputDir : inputDirs ) { + copyGoldEntityXmls( inputDir, outputDir ); + } + } + + +} Added: ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/GoldSerializer.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/GoldSerializer.java?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/GoldSerializer.java (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/GoldSerializer.java Thu Feb 19 18:06:13 2015 @@ -0,0 +1,149 @@
package org.chboston.cnlp.timeline.util;

import org.chboston.cnlp.nlp.annotation.annotation.store.AnnotationStore;
import org.chboston.cnlp.nlp.annotation.annotation.store.AnnotationStoreFactory;
import org.chboston.cnlp.timeline.gui.qaclipper.TimelineAnaforaWriter5;
import org.chboston.cnlp.timeline.timeline.Timeline;
import org.chboston.cnlp.timeline.timeline.TimelineFactory;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Collection;
import java.util.HashSet;

/**
 * Command-line utility that reads annotation stores with {@link AnnotationStoreFactory}, builds a
 * {@link Timeline} for every store that has time relations, and writes it with
 * {@link TimelineAnaforaWriter5}.  It also holds two one-off helpers that arrange input files
 * into one directory per document set.
 *
 * Author: SPF
 * Affiliation: CHIP-NLP
 * Date: 8/15/14
 */
final public class GoldSerializer {

   private GoldSerializer() {
   }

   static private final File ROOT_DIR = new File( "C:/Spiffy/Data/IAA/THYME5/IaaInput_ColonCancer/a_ux" );
   static private final File IAA_IN_DIR = new File( "C:/Spiffy/Data/IAA/THYME5/IaaInput_ColonCancer/a_gold_ux" );

   static private final String SERIALIZATIONS = "C:/Spiffy/Output/Timeline/Serialized/Gold/ColonCancer";

   //   static private final File ROOT_IN_DIR = new File( "C:\\Spiffy\\prj_thyme\\data\\external\\extrinsic\\ColonCancer" );
//   static private final File ROOT_OUT_DIR = new File( "C:\\Spiffy\\prj_thyme\\output\\permanent\\extrinsic\\ColonCancer" );
   static private final File ROOT_IN_DIR
         = new File( "C:\\Spiffy\\prj_thyme\\data\\internal\\corpus\\colon_cancer\\processed_test" );
   static private final File ROOT_OUT_DIR
         = new File( "C:\\Spiffy\\prj_thyme\\output\\permanent\\extrinsic\\colon_cancer\\from_xmi" );


   /**
    * Serializes the timelines of the files in the input directory.
    *
    * @param args optional; args[0] replaces the default input directory and
    *             args[1] replaces the default output directory
    */
   public static void main( final String[] args ) {
//      makeCopies();
//      copyNotes();
//      serializeTimelines();
//      serializeTimelines( ROOT_IN_DIR, ROOT_OUT_DIR );
      final File rootInDir = args.length > 0 ? new File( args[ 0 ] ) : ROOT_IN_DIR;
      final File rootOutDir = args.length > 1 ? new File( args[ 1 ] ) : ROOT_OUT_DIR;
      serializeXmiTimelines( rootInDir, rootOutDir );
   }

   /**
    * For each entry of rootInDir, creates an annotation store from the entry and its
    * "&lt;entry name&gt;.txt" text file and writes the timeline to
    * rootOutDir/&lt;entry name&gt;/&lt;entry name&gt;.  Stores without time relations are skipped.
    *
    * @param rootInDir  directory holding one sub-directory per document
    * @param rootOutDir directory under which one output sub-directory per document is created
    */
   static private void serializeTimelines( final File rootInDir, final File rootOutDir ) {
      final File[] subDirs = rootInDir.listFiles();
      if ( subDirs == null ) {
         return;
      }
      for ( File subDir : subDirs ) {
         final AnnotationStore annotationStore
               = AnnotationStoreFactory
               .createAnnotationCollection2( subDir, new File( subDir, subDir.getName() + ".txt" ) );
         if ( annotationStore.getTimeRelations().isEmpty() ) {
            continue;
         }
         System.out.println( "Serializing " + subDir.getName() );
         final Timeline timeline = TimelineFactory.createTimeline( subDir.getName(), annotationStore );
         final File outSubDir = new File( rootOutDir, subDir.getName() );
         outSubDir.mkdirs();
         TimelineAnaforaWriter5.writeTimeline( outSubDir.getPath() + "/" + outSubDir.getName(), timeline );
      }
   }

   /**
    * For each entry of rootInDir, creates an annotation store from the entry's path and writes the
    * timeline to rootOutDir/&lt;entry name&gt;.  Stores without time relations are skipped.
    *
    * @param rootInDir  directory holding the files to read
    * @param rootOutDir directory in which the timelines are written; created on first write
    */
   static private void serializeXmiTimelines( final File rootInDir, final File rootOutDir ) {
      final File[] xmiFiles = rootInDir.listFiles();
      if ( xmiFiles == null ) {
         return;
      }
      for ( File xmiFile : xmiFiles ) {
         final AnnotationStore annotationStore
               = AnnotationStoreFactory.createAnnotationCollection( xmiFile.getPath() );
         if ( annotationStore.getTimeRelations().isEmpty() ) {
            continue;
         }
         System.out.println( "Serializing " + xmiFile.getName() );
         final Timeline timeline = TimelineFactory.createTimeline( xmiFile.getName(), annotationStore );
         rootOutDir.mkdirs();
         TimelineAnaforaWriter5.writeTimeline( rootOutDir.getPath() + "/" + xmiFile.getName(), timeline );
      }
   }


   /**
    * Same as {@link #serializeTimelines(File, File)} for the hard-coded IAA input directory,
    * writing each timeline to SERIALIZATIONS/&lt;sub-directory name&gt;.
    */
   static private void serializeTimelines() {
      final File[] subDirs = IAA_IN_DIR.listFiles();
      if ( subDirs == null ) {
         return;
      }
      for ( File subDir : subDirs ) {
         final AnnotationStore annotationStore
               = AnnotationStoreFactory.createAnnotationCollection2( subDir.getPath(), subDir.getName() );
         if ( annotationStore.getTimeRelations().isEmpty() ) {
            continue;
         }
         System.out.println( "Serializing " + subDir.getName() );
         final Timeline timeline = TimelineFactory.createTimeline( subDir.getName(), annotationStore );
         final String outputPath = SERIALIZATIONS + "/" + subDir.getName();
         TimelineAnaforaWriter5.writeTimeline( outputPath, timeline );
      }
   }


   /**
    * @param fileName name of a file in ROOT_DIR
    * @return the part of the name before its first '.', or the whole name when it has no '.'
    */
   static private String getSetName( final String fileName ) {
      final int dotIndex = fileName.indexOf( '.' );
      return dotIndex < 0 ? fileName : fileName.substring( 0, dotIndex );
   }

   /**
    * Copies every file of ROOT_DIR into IAA_IN_DIR/&lt;set name&gt;, where the set name is the part of
    * the file name before its first '.'.  Each file goes to the directory of exactly its own set,
    * so a set "ID1" never receives the files of a set "ID10".
    */
   static private void makeCopies() {
      final String[] fileNames = ROOT_DIR.list();
      if ( fileNames == null ) {
         System.err.println( "Cannot list " + ROOT_DIR.getPath() );
         return;
      }
      final Collection<String> setNames = new HashSet<>();
      for ( String fileName : fileNames ) {
         final String setName = getSetName( fileName );
         final File setDir = new File( IAA_IN_DIR, setName );
         if ( setNames.add( setName ) ) {
            // first file of this set: make sure its directory exists
            setDir.mkdirs();
         }
         final File inputFile = new File( ROOT_DIR, fileName );
         final File outputFile = new File( setDir, fileName );
         System.out.println( inputFile.getPath() + " > " + outputFile.getPath() );
         try {
            Files.copy( inputFile.toPath(), outputFile.toPath() );
         } catch ( IOException ioE ) {
            System.err.println( "Could not copy " + inputFile.getPath() + " : " + ioE );
         }
      }
   }


   /**
    * For each sub-directory of the hard-coded notes directory, copies the file that has the same
    * name as the sub-directory into IAA_IN_DIR/&lt;sub-directory name&gt;.  Sub-directories without
    * such a file are reported and skipped.
    */
   static private void copyNotes() {
      final File rootDir = new File( "C:/Spiffy/Data/IAA/THYME5/ColonCancer" );
      final File[] subDirs = rootDir.listFiles();
      if ( subDirs == null ) {
         System.err.println( "Cannot list " + rootDir.getPath() );
         return;
      }
      for ( File subDir : subDirs ) {
         final File noteFile = new File( subDir, subDir.getName() );
         if ( !noteFile.exists() ) {
            System.out.println( "No note for " + subDir.getName() );
            continue;
         }
         final File outputDir = new File( IAA_IN_DIR, subDir.getName() );
         final File outputFile = new File( outputDir, subDir.getName() );
         try {
            Files.copy( noteFile.toPath(), outputFile.toPath() );
         } catch ( IOException ioE ) {
            System.err.println( "Could not copy " + noteFile.getPath() + " : " + ioE );
         }
      }
   }


}
Added: ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/SimpleStoreWriter.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/SimpleStoreWriter.java?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/SimpleStoreWriter.java (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/timeline/util/SimpleStoreWriter.java Thu Feb 19 18:06:13 2015 @@ -0,0 +1,226 @@ +package org.chboston.cnlp.timeline.util; + +import org.chboston.cnlp.nlp.annotation.annotation.store.AnnotationStore; +import org.chboston.cnlp.nlp.annotation.annotation.store.AnnotationStoreFactory; +import 
org.chboston.cnlp.nlp.annotation.attribute.DefinedAttributeType; +import org.chboston.cnlp.nlp.annotation.coreference.CoreferenceChain; +import org.chboston.cnlp.nlp.annotation.entity.Entity; +import org.chboston.cnlp.nlp.annotation.relation.Relation; +import org.chboston.cnlp.timeline.timeline.Timeline; +import org.chboston.cnlp.timeline.timeline.TimelineFactory; +import org.chboston.cnlp.timeline.timespan.plus.PointedTimeSpan; + +import java.io.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.logging.FileHandler; +import java.util.logging.LogRecord; +import java.util.logging.Logger; +import java.util.logging.SimpleFormatter; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 1/12/2015 + */ +public class SimpleStoreWriter { + + static private final Logger LOGGER = Logger.getLogger( "SimpleStoreWriter" ); + + +// static private final File ROOT_IN_DIR +// = new File( "C:\\Spiffy\\prj_darth_phenome\\data\\internal\\xmi\\27Notes_Jan9_2015" ); +// static private final File ROOT_OUT_DIR +// = new File( "C:\\Spiffy\\prj_darth_phenome\\data\\internal\\xmi\\27Notes_Jan9_2015\\simple2" ); + static private final boolean XMI = false; +static private final File ROOT_IN_DIR +// = new File( "C:\\Spiffy\\prj_thyme\\data\\internal\\x_eval\\annotations\\layer\\aggregate\\ctakes\\colon\\test" ); + = new File( "C:\\Spiffy\\prj_thyme\\data\\internal\\annotations\\release_gold\\ColonCancer" ); + static private final File ROOT_OUT_DIR +// = new File( "C:\\Spiffy\\prj_thyme\\output\\temp\\x_eval\\annotations\\layer\\aggregate\\ctakes\\colon\\negTest" ); + = new File( "C:\\Spiffy\\prj_thyme\\output\\temp\\x_eval\\annotations\\layer\\aggregate\\gold\\colon\\negTest" ); + + +
   /**
    * Writes one simple text file per annotation store found under rootInDir, then a "Summary.txt"
    * with the per-store count lines plus a totals line, and a "Summary.err.txt" listing the
    * collected unparsed-timex messages.  All files are written in rootOutDir.
    *
    * @param rootInDir  directory searched recursively for input files
    * @param rootOutDir directory that receives every output file; created if missing
    */
   static private void writeXmiStores( final File rootInDir, final File rootOutDir ) {
      rootOutDir.mkdirs();
      final StringBuilder sb = XMI ? runXmiDir( rootInDir, rootOutDir ) : runAnaforaDir( rootInDir, rootOutDir );
      final String name = "Summary";
      // The summary belongs next to the per-store files, so use the rootOutDir parameter
      // and not the ROOT_OUT_DIR constant.
      final File summaryFile = new File( rootOutDir, name + ".txt" );
      try ( final Writer writer = new BufferedWriter( new FileWriter( summaryFile ) ) ) {
         writer.write( String.format( "%1$40s NE Evnt Time Chn Rltn TLnk Span\n", name ) );
         writer.write( sb.toString() );
         writer.write( "\n" );
         writer.write( String.format( "%1$40s %2$4d %3$4d %4$4d %5$4d %6$4d %7$4d %8$4d\n", "Total",
               _entities, _events, _timex3s, _chains, _relations, _tlinks, _spans ) );
      } catch ( IOException ioE ) {
         LOGGER.severe( "Could not write " + summaryFile.getPath() + " : " + ioE.getMessage() );
      }
      final File errorFile = new File( rootOutDir, name + ".err.txt" );
      try ( final Writer writer = new BufferedWriter( new FileWriter( errorFile ) ) ) {
         // iterate as Object so this compiles whatever element type the collection is declared with
         for ( Object unparsed : _unparsedTimexList ) {
            writer.write( unparsed + "\n" );
         }
      } catch ( IOException ioE ) {
         LOGGER.severe( "Could not write " + errorFile.getPath() + " : " + ioE.getMessage() );
      }
   }

   /**
    * Recursively visits xmiDir, writing a simple text file for every non-directory entry.
    *
    * @param xmiDir     directory to search
    * @param rootOutDir directory that receives the simple text files
    * @return the concatenated count lines returned by writeAnnotationStore; empty when xmiDir cannot be listed
    */
   static private StringBuilder runXmiDir( final File xmiDir, final File rootOutDir ) {
      final StringBuilder sb = new StringBuilder();
      final File[] xmiFiles = xmiDir.listFiles();
      if ( xmiFiles == null ) {
         return sb;
      }
      for ( File xmiFile : xmiFiles ) {
         if ( xmiFile.isDirectory() ) {
            sb.append( runXmiDir( xmiFile, rootOutDir ) );
            continue;
         }
         final AnnotationStore annotationStore
               = AnnotationStoreFactory.createAnnotationCollection( xmiFile.getPath() );
         final String countText = writeAnnotationStore( rootOutDir, xmiFile.getName(), annotationStore );
         sb.append( countText );
      }
      return sb;
   }
+ + static private StringBuilder runAnaforaDir( final File anaforaDir, final File rootOutDir ) { + final StringBuilder sb = new StringBuilder(); + final File[] anaforaFiles = anaforaDir.listFiles(); + if ( anaforaFiles == null ) { + return sb; + } + for ( File anaforaFile : anaforaFiles ) { + if ( anaforaFile.isDirectory() ) { + sb.append( runAnaforaDir( anaforaFile, rootOutDir ) ); + continue; + } + if ( !anaforaFile.getName().endsWith( ".txt" ) ) { + continue; + } + final AnnotationStore 
annotationStore + = AnnotationStoreFactory.createAnnotationCollection( anaforaFile.getPath() ); + final String countText = writeAnnotationStore( rootOutDir, anaforaFile.getName(), annotationStore ); + sb.append( countText ); + } + return sb; + } + + + + static private final Collection _unparsedTimexList = new ArrayList<>(); + static private int _entities; + static private int _events; + static private int _timex3s; + static private int _chains; + static private int _relations; + static private int _tlinks; + static private int _spans; + + + static private String writeAnnotationStore( final File rootOutDir, final String name, + final AnnotationStore annotationStore ) { + final File outputFile = new File( rootOutDir, name + ".simple.txt" ); + final File errorFile = new File( rootOutDir, name + ".error.txt" ); + final Logger timeSpanFactoryLogger = Logger.getLogger( "TimeSpanFactory" ); + final Logger tlinkCloserLogger = Logger.getLogger( "TLinkTypeArray3" ); + LOGGER.info( "Writing Simple " + outputFile.getPath() ); + try ( final Writer writer = new BufferedWriter( new FileWriter( outputFile ) ) ) { + final FileHandler errorHandler = new FileHandler( errorFile.getPath() ); + final SimpleFormatter errorFormatter = new SimpleFormatter() { + public synchronized String format( final LogRecord record ) { + _unparsedTimexList.add( formatMessage( record ) ); + return formatMessage( record ) + "\n"; + } + }; + errorHandler.setFormatter( errorFormatter ); + timeSpanFactoryLogger.addHandler( errorHandler ); + tlinkCloserLogger.addHandler( errorHandler ); + Collection entities = annotationStore.getNamedEntities(); + final int entityCount = entities.size(); + _entities += entityCount; + for ( Entity entity : entities ) { + final String lineText = String.format( "%1$20s | %2$30s | %3$3d,%4$3d", + entity.getClassType(), entity.getSpannedTextRepresentation(), + entity.getTextSpan().getStartIndex(), entity.getTextSpan().getEndIndex() ); + writer.write( lineText + "\n" ); + } + 
entities = annotationStore.getEvents(); + final int eventCount = entities.size(); + _events += eventCount; + for ( Entity entity : entities ) { + final String lineText = String.format( "%1$20s | %2$30s | %3$3d,%4$3d", + entity.getClassType(), entity.getSpannedTextRepresentation(), + entity.getTextSpan().getStartIndex(), entity.getTextSpan().getEndIndex() ); + writer.write( lineText + "\n" ); + } + entities = annotationStore.getTimes(); + final int timesCount = entities.size(); + _timex3s += timesCount; + for ( Entity entity : entities ) { + final String lineText = String.format( "%1$20s | %2$30s | %3$3d,%4$3d", + entity.getClassType(), entity.getSpannedTextRepresentation(), + entity.getTextSpan().getStartIndex(), entity.getTextSpan().getEndIndex() ); + writer.write( lineText + "\n" ); + } + final Collection chains = annotationStore.getCoreferenceChains(); + int chainCount = 0; + for ( CoreferenceChain chain : chains ) { + if ( chain.getChainLength() == 1 || !chain.getSpannedTextRepresentation().contains( " ... 
" ) ) { + continue; + } + final String lineText = String.format( "%1$20s | %2$30s | %3$3d,%4$3d", + chain.getClassType(), chain.getSpannedTextRepresentation(), + chain.getTextSpan().getStartIndex(), chain.getTextSpan().getEndIndex() ); + writer.write( lineText + "\n" ); + chainCount++; + } + _chains += chainCount; + Collection relations = annotationStore.getUmlsRelations(); + final int relationCount = relations.size(); + _relations += relationCount; + for ( Relation relation : relations ) { + final String lineText = String.format( "%1$20s | %2$30s | %3$3d,%4$3d", + relation.getClassType(), relation.getSpannedTextRepresentation(), + relation.getTextSpan().getStartIndex(), relation.getTextSpan().getEndIndex() ); + writer.write( lineText + "\n" ); + } + relations = annotationStore.getTimeRelations(); + final int tlinkCount = relations.size(); + _tlinks += tlinkCount; + for ( Relation relation : relations ) { + final String tlinkType = relation.getFirstEntity().getSpannedTextRepresentation() + + " " + relation.getAttributeValue( DefinedAttributeType.RELATION_TYPE ) + + " " + relation.getSecondEntity().getSpannedTextRepresentation(); + final String lineText = String.format( "%1$20s | %2$30s | %3$3d,%4$3d", + relation.getClassType(), tlinkType, + relation.getTextSpan().getStartIndex(), relation.getTextSpan().getEndIndex() ); + writer.write( lineText + "\n" ); + } + final Timeline timeline = TimelineFactory.createTimeline( name, annotationStore ); + _spans += timeline.getTimeSpans().size(); + for ( PointedTimeSpan timeSpan : timeline ) { + writer.write( timeSpan + "\n" ); + } + writer.write( "\n\n" ); + writer.write( annotationStore.getDocumentText() ); + writer.write( "\n\n" ); + writer.write( " NE Evnt Time Chn Rltn TLnk Span\n" ); + final String countText = + String.format( "%1$4d %2$4d %3$4d %4$4d %5$4d %6$4d %7$4d\n", + entityCount, eventCount, timesCount, chainCount, + relationCount, tlinkCount, timeline.getTimeSpans().size() ); + writer.write( countText + "\n" ); + 
errorHandler.flush(); + errorHandler.close(); + timeSpanFactoryLogger.removeHandler( errorHandler ); + tlinkCloserLogger.removeHandler( errorHandler ); + return String.format( "%1$40s ", name ) + countText; + } catch ( IOException ioE ) { + LOGGER.severe( ioE.getMessage() ); + } + return ""; + } + + public static void main( final String... args ) { + writeXmiStores( ROOT_IN_DIR, ROOT_OUT_DIR ); + } +} Added: ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/XmiEolFixer.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/XmiEolFixer.java?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/XmiEolFixer.java (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/XmiEolFixer.java Thu Feb 19 18:06:13 2015 @@ -0,0 +1,75 @@
package org.chboston.cnlp.xmi;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/**
 * Copies every ".xmi" file of a directory to "&lt;name&gt;.xmi.xml" in an output directory,
 * putting a line break between each '&gt;' that is directly followed by a '&lt;'.
 * All other characters are copied unchanged.
 *
 * @author SPF , chip-nlp
 * @version %I%
 * @since 10/13/2014
 */
final public class XmiEolFixer {

   static private final Logger LOGGER = Logger.getLogger( "XmiEolFixer" );

   private XmiEolFixer() {
   }

   static private final String INPUT_DIR_PATH
         = "C:\\Spiffy\\prj_darth_phenome\\data\\internal\\xmi\\27Notes_Jan9_2015";
   static private final String OUTPUT_DIR_PATH
         = "C:\\Spiffy\\prj_darth_phenome\\data\\internal\\xmi\\27Notes_Jan9_2015\\xml";


   /**
    * @param args optional; args[0] replaces the default input directory and
    *             args[1] replaces the default output directory
    */
   public static void main( String[] args ) {
      final File inputDir = new File( args.length > 0 ? args[ 0 ] : INPUT_DIR_PATH );
      final File outputDir = new File( args.length > 1 ? args[ 1 ] : OUTPUT_DIR_PATH );
      final File[] files = inputDir.listFiles();
      if ( files == null ) {
         LOGGER.warning( "No files in " + inputDir.getPath() );
         return;
      }
      if ( !outputDir.isDirectory() && !outputDir.mkdirs() ) {
         LOGGER.severe( "Cannot create output directory " + outputDir.getPath() );
         return;
      }
      for ( File file : files ) {
         if ( !file.getName().endsWith( ".xmi" ) ) {
            continue;
         }
         final File outputFile = new File( outputDir, file.getName() + ".xml" );
         // NOTE(review): UTF-8 is assumed for the xmi files instead of the platform default charset,
         // so that non-ASCII text survives the copy -- confirm against the files' xml declaration.
         try ( Reader reader = new BufferedReader(
               new InputStreamReader( new FileInputStream( file ), StandardCharsets.UTF_8 ) );
               Writer writer = new BufferedWriter(
                     new OutputStreamWriter( new FileOutputStream( outputFile ), StandardCharsets.UTF_8 ) ) ) {
            insertLineBreaks( reader, writer );
         } catch ( IOException ioE ) {
            LOGGER.severe( "Could not convert " + file.getPath() + " : " + ioE.getMessage() );
         }
      }
   }

   /**
    * Copies all characters from the reader to the writer, writing a '\n' between a '&gt;' and a
    * directly following '&lt;'.  The previous character is remembered across buffer refills, so a
    * "&gt;&lt;" pair that straddles two reads is handled like any other and no character is
    * written twice.
    *
    * @param reader source of the text; read until its end, not closed
    * @param writer destination of the text; not flushed or closed
    * @throws IOException if reading or writing fails
    */
   static void insertLineBreaks( final Reader reader, final Writer writer ) throws IOException {
      final char[] buffer = new char[ 1024 ];
      boolean previousWasClose = false;
      int length = reader.read( buffer );
      // read returns -1 at the end of the stream
      while ( length != -1 ) {
         for ( int i = 0; i < length; i++ ) {
            final char c = buffer[ i ];
            if ( previousWasClose && c == '<' ) {
               writer.write( '\n' );
            }
            writer.write( c );
            previousWasClose = c == '>';
         }
         length = reader.read( buffer );
      }
   }

}
Added: ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/UimaXmiParser.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/UimaXmiParser.java?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/UimaXmiParser.java (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/UimaXmiParser.java Thu Feb 19 18:06:13 2015 @@ -0,0 +1,695 @@ +package org.chboston.cnlp.xmi.parser; + +import org.chboston.cnlp.nlp.annotation.annotation.store.ImmutableAnnotationStore; +import org.chboston.cnlp.nlp.annotation.attribute.AttributeType; +import org.chboston.cnlp.nlp.annotation.attribute.AttributeTypeFactory; +import org.chboston.cnlp.nlp.annotation.attribute.DefaultAttribute; +import org.chboston.cnlp.nlp.annotation.attribute.DefinedAttributeType; +import 
org.chboston.cnlp.nlp.annotation.classtype.ClassType; +import org.chboston.cnlp.nlp.annotation.classtype.CustomClassType; +import org.chboston.cnlp.nlp.annotation.classtype.SemanticClassType; +import org.chboston.cnlp.nlp.annotation.classtype.TemporalClassType; +import org.chboston.cnlp.nlp.annotation.coreference.CoreferenceChain; +import org.chboston.cnlp.nlp.annotation.coreference.CoreferenceChainSpanComparator; +import org.chboston.cnlp.nlp.annotation.coreference.CoreferenceFactory; +import org.chboston.cnlp.nlp.annotation.entity.DefaultEntity; +import org.chboston.cnlp.nlp.annotation.entity.Entity; +import org.chboston.cnlp.nlp.annotation.parser.AbstractAnnotationXmlParser; +import org.chboston.cnlp.nlp.annotation.relation.DefaultRelation; +import org.chboston.cnlp.nlp.annotation.relation.Relation; +import org.chboston.cnlp.nlp.annotation.textspan.DefaultTextSpan; +import org.chboston.cnlp.nlp.annotation.textspan.DiscontiguousTextSpan; +import org.chboston.cnlp.nlp.annotation.textspan.TextSpan; +import org.jdom.Attribute; +import org.jdom.Document; +import org.jdom.Element; +import org.jdom.JDOMException; +import org.jdom.input.SAXBuilder; + +import java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.logging.Logger; +import java.util.regex.Pattern; + +/** + * @author SPF , chip-nlp + * @version %I% + * @since 10/13/2014 + */ +final public class UimaXmiParser extends AbstractAnnotationXmlParser { + + static private final Logger LOGGER = Logger.getLogger( "UimaXmiParser" ); + + // TODO Refactor to use XmiTag + static private final String ROOT_ELEMENT_NAME = "XMI"; + static private final String DOCUMENT_TEXT_NAME = "Sofa"; + static private final String EVENT_MENTION = "EventMention"; + static private final String TIME_MENTION = "TimeMention"; + static private final String DATE_ANNOTATION = "DateAnnotation"; + static private final String TLINK = "TemporalTextRelation"; + static private final String UMLS = "UmlsConcept"; + + static 
private final String EVENT = "Event"; + static private final String EVENT_PROPERTIES = "EventProperties"; + + static private final String EVENT_ID_KEY = "event"; + static private final String PROPERTIES_ID_KEY = "properties"; + + + static private final String SIGN_SYMPTOM = "SignSymptomMention"; + static private final String PROCEDURE = "ProcedureMention"; + static private final String DISEASE = "DiseaseDisorderMention"; + static private final String MEDICATION = "MedicationMention"; + static private final String ANATOMIC_SITE = "AnatomicalSiteMention"; + + static private final String CONCEPT_ARRAY = "ontologyConceptArr"; + static private final String UMLS_CONCEPT = "UmlsConcept"; + + static private final String ID = "id"; + static private final String DOCUMENT_TEXT = "sofaString"; + static private final String BEGIN = "begin"; + static private final String END = "end"; + static private final String DOC_TIME_REL = "docTimeRel"; + static private final String POLARITY = "polarity"; + static private final String LINK_TYPE = "category"; + static private final String LINK_ARG1 = "arg1"; + static private final String LINK_ARG2 = "arg2"; + static private final String RELATION_ARG = "RelationArgument"; + static private final String ARGUMENT = "argument"; + static private final String CUI = "cui"; + static private final String TUI = "tui"; + + + static private final String COREF_RELATION = "CoreferenceRelation"; + static private final String COREF_CHAIN = "CollectionTextRelation"; + static private final String COREF_MEMBERS = "members"; + + + static private final String INPUT_DIR_PATH = "C:\\Spiffy\\prj_darth_phenome\\data\\internal\\xmi\\Oct10_2014"; + + /** + * @param filePath path to file with annotation information + * @return true if this AnnotationsParser can properly handle the given file + */ + static public boolean canParse( final String filePath ) { + final SAXBuilder saxBuilder = new SAXBuilder(); + try { + final Document document = saxBuilder.build( filePath ); 
+ final Element rootElement = document.getRootElement(); + return rootElement != null && rootElement.getName().equals( ROOT_ELEMENT_NAME ); + } catch ( JDOMException jdomE ) { + LOGGER.severe( jdomE.getMessage() ); + return false; + } catch ( IOException ioE ) { + LOGGER.severe( ioE.getMessage() ); + return false; + } + } + + public boolean preParseFile( final String xmlFilePath ) { + return true; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean parseFile( final String xmlFilePath ) { + reset(); + if ( xmlFilePath == null || xmlFilePath.isEmpty() ) { + return false; + } + final File xmlFile = new File( xmlFilePath ); + if ( !xmlFile.canRead() ) { + return false; + } + final SAXBuilder saxBuilder = new SAXBuilder(); + try { + final Document document = saxBuilder.build( xmlFile ); + final Element rootElement = document.getRootElement(); + final String documentText = getDocumentText( rootElement ); + setDocumentText( documentText ); + final Map eventMap = createElementIdMap( rootElement, EVENT ); + final Map eventPropertiesMap = createElementIdMap( rootElement, EVENT_PROPERTIES ); + final Map entityMap = getEntityMap( rootElement, eventMap, eventPropertiesMap ); + final List entityList = XmiEntitySegregator.getNamedEntities( entityMap.values() ); + final List eventList = XmiEntitySegregator.getEvents( entityMap.values() ); + final List timexList = XmiEntitySegregator.getTimes( entityMap.values() ); + final Map relationArgMap = getRelationArgMap( rootElement ); + final List tlinkRelationList = getTLinks( rootElement, entityMap, relationArgMap ); + final List> coreferenceTextSpans = new ArrayList<>(); + coreferenceTextSpans.addAll( getCorefLinks( rootElement, entityMap, relationArgMap ) ); + coreferenceTextSpans.addAll( getCorefChains( rootElement, entityMap, relationArgMap ) ); + Collections.sort( coreferenceTextSpans, CoreferenceChainSpanComparator.getInstance() ); + List coreferenceChainList = Collections.emptyList(); + if ( !entityList.isEmpty() || 
!eventList.isEmpty() || !timexList.isEmpty() ) { + coreferenceChainList = CoreferenceFactory.createCoreferenceChains( coreferenceTextSpans, entityList, + eventList, timexList ); + } + int wordCount = -1; + if ( documentText != null && !documentText.isEmpty() ) { + wordCount = documentText.split( "\\s+" ).length; + } + final ImmutableAnnotationStore.AnnoteCollectBuilder builder + = new ImmutableAnnotationStore.AnnoteCollectBuilder(); + builder.entities( entityList ).events( eventList ).times( timexList ); +// builder.relations( umlsRelationList ) + builder.timeRelations( tlinkRelationList ); +// builder.coreferenceTextSpans( coreferenceChains ); + builder.coreferenceChains( coreferenceChainList ).wordCount( wordCount ); + if ( documentText != null && !documentText.isEmpty() ) { + builder.documentText( documentText ); + } + _annotationStore = builder.build(); + return true; + } catch ( JDOMException jdomE ) { + LOGGER.severe( jdomE.getMessage() ); + return false; + } catch ( IOException ioE ) { + LOGGER.severe( ioE.getMessage() ); + return false; + } + } + + + static private String getDocumentText( final Element rootElement ) { + final Element child = getChild( rootElement, DOCUMENT_TEXT_NAME ); + final String text = child.getAttributeValue( DOCUMENT_TEXT ); + return text.replace( " ", "\n" ); + } + + /** + * @param rootElement the root xml element in an annotation xml file + * @return map of entityIDs and Knowtator Entities + */ + private Map getEntityMap( final Element rootElement, + final Map eventMap, + final Map eventPropertiesMap ) { + final Map entityMap = new HashMap<>(); + entityMap.putAll( getEntities( rootElement, SIGN_SYMPTOM, null, null ) ); + entityMap.putAll( getEntities( rootElement, PROCEDURE, null, null ) ); + entityMap.putAll( getEntities( rootElement, DISEASE, null, null ) ); + entityMap.putAll( getEntities( rootElement, MEDICATION, null, null ) ); + entityMap.putAll( getEntities( rootElement, ANATOMIC_SITE, null, null ) ); + entityMap.putAll( 
getEntities( rootElement, EVENT_MENTION, eventMap, eventPropertiesMap ) ); + entityMap.putAll( getEntities( rootElement, TIME_MENTION, null, null ) ); + entityMap.putAll( getEntities( rootElement, DATE_ANNOTATION, null, null ) ); + return entityMap; + } + + + static private Map getRelationArgMap( final Element rootElement ) { + final Map relationArgMap = new HashMap<>(); + final List argElements = getChildren( rootElement, RELATION_ARG ); + for ( Element argElement : argElements ) { + final String argId = getElementId( argElement ); + final String entityId = argElement.getAttributeValue( ARGUMENT ); + relationArgMap.put( argId, entityId ); + } + return relationArgMap; + + + //TODO + + } + + + static private Map createElementIdMap( final Element rootElement, final String elementClassName ) { + final List classElements = getChildren( rootElement, elementClassName ); + final Map elementIdMap = new HashMap<>( classElements.size() ); + for ( Element element : classElements ) { + final String elementId = getElementId( element ); + if ( elementId != null ) { + elementIdMap.put( elementId, element ); + } + } + return elementIdMap; + } + + + private Map getEntities( final Element rootElement, final String classElementName, + final Map eventMap, + final Map eventPropertiesMap ) { + final List classElements = getChildren( rootElement, classElementName ); + final Map entityMap = new HashMap<>(); + final String documentText = getDocumentText(); + for ( Element entityElement : classElements ) { + final String entityId = getElementId( entityElement ); + final TextSpan textSpan = createEntityTextSpan( entityElement ); + if ( textSpan.equals( BAD_TEXT_SPAN ) ) { + continue; + } + final String spannedText = getSpannedText( textSpan, documentText ); + if ( spannedText.trim().isEmpty() ) { + continue; + } + final ClassType classType = getEntityClassType( classElementName ); + final Collection nlpAttributes + = createNlpAttributes( entityElement ); + if ( classElementName.equals( 
EVENT_MENTION ) ) { + final String eventId = entityElement.getAttributeValue( EVENT_ID_KEY ); + nlpAttributes.addAll( parseEventAttributes( rootElement, eventId, eventMap, eventPropertiesMap ) ); + } else if ( classElementName.equals( TIME_MENTION ) || classElementName.equals( DATE_ANNOTATION ) ) { + nlpAttributes.add( new DefaultAttribute( "XMI_TIMEX", "DATE" ) ); + } else { + final String conceptIdArray = entityElement.getAttributeValue( CONCEPT_ARRAY ); + if ( conceptIdArray != null ) { + final String[] conceptIds = conceptIdArray.split( "\\s+" ); + for ( String conceptId : conceptIds ) { + nlpAttributes.addAll( parseUmlsAttributes( rootElement, conceptId ) ); + } + } + } + nlpAttributes.add( new DefaultAttribute( DefinedAttributeType.UNIQUE_ID, entityId ) ); + final Entity entity = new DefaultEntity( textSpan, spannedText, classType, + nlpAttributes + .toArray( new org.chboston.cnlp.nlp.annotation.attribute.Attribute[ nlpAttributes.size() ] ) ); + entityMap.put( entityId, entity ); + } + return entityMap; + } + + + static private Collection createNlpAttributes( + final Element element ) { + final Collection nonIdAttributes = getNonIdXmlAttributes( element ); + final Collection attributeList + = new ArrayList<>( nonIdAttributes.size() ); + for ( Attribute xmlAttribute : nonIdAttributes ) { + attributeList.add( createNlpAttribute( xmlAttribute.getName(), xmlAttribute.getValue() ) ); + } + return attributeList; + } + + static private org.chboston.cnlp.nlp.annotation.attribute.Attribute createNlpAttribute( final String name, + final String value ) { + final AttributeType attributeType = AttributeTypeFactory.getAttributeForName( name ); + if ( attributeType == DefinedAttributeType.POLARITY && value.equals( "-1" ) ) { + return new DefaultAttribute( DefinedAttributeType.POLARITY, "NEG" ); + } + return new DefaultAttribute( attributeType.getName(), value ); + } + + + static private ClassType getEntityClassType( final String classTypeName ) { + // TODO - add more class 
types. See Knowtator xml parser for possible list + switch ( classTypeName ) { + case EVENT_MENTION: + return TemporalClassType.EVENT; + case TIME_MENTION: + return TemporalClassType.TIMEX; + case DATE_ANNOTATION: + return TemporalClassType.TIMEX; + case SIGN_SYMPTOM: + return SemanticClassType.SIGN_OR_SYMPTOM; + case DISEASE: + return SemanticClassType.DISEASE_DISORDER; + case ANATOMIC_SITE: + return SemanticClassType.ANATOMICAL_SITE; + case MEDICATION: + return SemanticClassType.MEDICATION; + case PROCEDURE: + return SemanticClassType.PROCEDURE; + case "generic_class": + return SemanticClassType.MISC; + } + return new CustomClassType( classTypeName ); + } + + + static private Collection parseEventAttributes( + final Element rootElement, final String eventId, + final Map eventMap, final Map eventPropertiesMap ) { + final Element eventElement = eventMap.get( eventId ); + if ( eventElement == null ) { + return Collections.emptyList(); + } + final String propertiesId = eventElement.getAttributeValue( PROPERTIES_ID_KEY ); + if ( propertiesId == null ) { + return Collections.emptyList(); + } + final Element propertiesElement = eventPropertiesMap.get( propertiesId ); + if ( propertiesElement == null ) { + return Collections.emptyList(); + } + return createNlpAttributes( propertiesElement ); + } + + static private Collection parseUmlsAttributes( + final Element rootElement, final String conceptId ) { + final Element conceptElement = getIdElement( rootElement, UMLS_CONCEPT, conceptId ); + if ( conceptElement == null ) { + return Collections.emptyList(); + } + return createNlpAttributes( conceptElement ); + } + + + static private Element getIdElement( final Element rootElement, final String elementTypeName, + final String elementId ) { + final List childElements = getChildren( rootElement, elementTypeName ); + for ( Element childElement : childElements ) { + if ( getElementId( childElement ).equals( elementId ) ) { + return childElement; + } + } + return null; + } + + 
static private String getElementId( final Element element ) { + final List xmiAttributes = element.getAttributes(); + if ( xmiAttributes != null ) { + for ( Attribute xmiAttribute : xmiAttributes ) { + final String attributeName = xmiAttribute.getName(); + final String attributeValue = xmiAttribute.getValue(); + if ( attributeName.equals( ID ) && !attributeValue.equals( "0" ) ) { + return attributeValue; + } + } + } + return ""; + } + + static private Collection getNonIdXmlAttributes( final Element element ) { + final List xmiAttributes = element.getAttributes(); + final Collection nonIdAttributes = new ArrayList<>( xmiAttributes.size() - 1 ); + for ( Attribute xmiAttribute : xmiAttributes ) { + final String attributeName = xmiAttribute.getName(); + if ( !attributeName.equals( ID ) ) { + nonIdAttributes.add( xmiAttribute ); + } + } + return nonIdAttributes; + } + + /** + * @param rootElement xml root element + * @param entityMap map of elementIDs and Entities + * @return list of Relations created with all the given information + */ + static private List getTLinks( final Element rootElement, + final Map entityMap, + final Map relationArgMap ) { + if ( entityMap.isEmpty() ) { + return Collections.emptyList(); + } + final List relationList = new ArrayList<>(); + final List relationElementList = getChildren( rootElement, TLINK ); + final List attributeList = new ArrayList<>(); + for ( Element relationElement : relationElementList ) { + attributeList.clear(); + final String relationId = getElementId( relationElement ); + // TODO make classtype tlink + String sourceEntityId = ""; + String targetEntityId = ""; + final Collection nonIdAttributes = getNonIdXmlAttributes( relationElement ); + for ( Attribute xmlAttribute : nonIdAttributes ) { + final String attributeName = xmlAttribute.getName(); + final String attributeValue = xmlAttribute.getValue(); + if ( attributeName.equalsIgnoreCase( LINK_ARG1 ) ) { + sourceEntityId = attributeValue; + } else if ( 
attributeName.equalsIgnoreCase( LINK_ARG2 ) ) { + targetEntityId = attributeValue; + } else if ( attributeName.equalsIgnoreCase( LINK_TYPE ) ) { + attributeList.add( createNlpAttribute( DefinedAttributeType.RELATION_TYPE.getName(), attributeValue ) ); + } else { + attributeList.add( createNlpAttribute( attributeName, attributeValue ) ); + } + } + if ( sourceEntityId.isEmpty() || targetEntityId.isEmpty() ) { + LOGGER.severe( "Relation " + relationId + + " has no Source " + sourceEntityId + + " and/or no Target " + targetEntityId ); + continue; + } + final String realSource = relationArgMap.get( sourceEntityId ); + final String realTarget = relationArgMap.get( targetEntityId ); + if ( realSource == null || realTarget == null ) { + LOGGER.severe( "Relation " + relationId + + " has no Source " + sourceEntityId + + " and/or no Target " + targetEntityId ); + continue; + } + + final Entity entity1 = entityMap.get( realSource ); + final Entity entity2 = entityMap.get( realTarget ); + if ( entity1 == null || entity2 == null ) { + LOGGER.severe( "Relation " + relationId + + " Source " + realSource + + " and/or Target " + realTarget + " does not exist" ); + continue; + } + attributeList.add( new DefaultAttribute( DefinedAttributeType.UNIQUE_ID, relationId ) ); + final Relation relation = new DefaultRelation( entity1, entity2, TemporalClassType.TLINK, + attributeList + .toArray( new org.chboston.cnlp.nlp.annotation.attribute.Attribute[ attributeList.size() ] ) ); + relationList.add( relation ); + } + return relationList; + } + + + /** + * @param rootElement xml root element + * @param entityMap map of elementIDs and Entities + * @param relationArgMap map of argument elementIDs and entity elementIDs + * @return list of TextSpan pairs for Coreference Relations created with all the given information + */ + static private List> getCorefLinks( final Element rootElement, + final Map entityMap, + final Map relationArgMap ) { + if ( entityMap.isEmpty() ) { + return 
Collections.emptyList(); + } + final List> corefList = new ArrayList<>(); + final List relationElementList = getChildren( rootElement, COREF_RELATION ); + for ( Element relationElement : relationElementList ) { + final String relationId = getElementId( relationElement ); + String sourceEntityId = ""; + String targetEntityId = ""; + final Collection nonIdAttributes = getNonIdXmlAttributes( relationElement ); + for ( Attribute xmlAttribute : nonIdAttributes ) { + final String attributeName = xmlAttribute.getName(); + final String attributeValue = xmlAttribute.getValue(); + if ( attributeName.equalsIgnoreCase( LINK_ARG1 ) ) { + sourceEntityId = attributeValue; + } else if ( attributeName.equalsIgnoreCase( LINK_ARG2 ) ) { + targetEntityId = attributeValue; + } + } + if ( sourceEntityId.isEmpty() || targetEntityId.isEmpty() ) { + LOGGER.severe( "Relation " + relationId + + " has no Source " + sourceEntityId + + " and/or no Target " + targetEntityId ); + continue; + } + final String realSource = relationArgMap.get( sourceEntityId ); + final String realTarget = relationArgMap.get( targetEntityId ); + if ( realSource == null || realTarget == null ) { + LOGGER.severe( "Relation " + relationId + + " has no Source " + sourceEntityId + + " and/or no Target " + targetEntityId ); + continue; + } + final Entity entity1 = entityMap.get( realSource ); + final Entity entity2 = entityMap.get( realTarget ); + if ( entity1 == null || entity2 == null ) { + LOGGER.severe( "Relation " + relationId + + " Source " + realSource + + " and/or Target " + realTarget + " does not exist" ); + continue; + } + final Collection textSpans = new ArrayList<>( 2 ); + textSpans.add( entity1.getTextSpan() ); + textSpans.add( entity2.getTextSpan() ); + corefList.add( textSpans ); + } + return Collections.unmodifiableList( corefList ); + } + + + /** + * @param rootElement xml root element + * @param entityMap map of elementIDs and Entities + * @param relationArgMap map of argument elementIDs and entity 
elementIDs + * @return list of TextSpan pairs for Coreference Relations created with all the given information + */ + static private List> getCorefChains( final Element rootElement, + final Map entityMap, + final Map relationArgMap ) { + if ( entityMap.isEmpty() ) { + return Collections.emptyList(); + } + final Pattern memberSplitter = Pattern.compile( "\\s+" ); + final List> corefList = new ArrayList<>(); + final List relationElementList = getChildren( rootElement, COREF_CHAIN ); + final Collection entityIds = new ArrayList<>(); + for ( Element relationElement : relationElementList ) { + entityIds.clear(); + final String relationId = getElementId( relationElement ); + final Collection nonIdAttributes = getNonIdXmlAttributes( relationElement ); + String[] argumentIDs = null; + for ( Attribute xmlAttribute : nonIdAttributes ) { + final String attributeName = xmlAttribute.getName(); + final String attributeValue = xmlAttribute.getValue(); + if ( attributeName.equalsIgnoreCase( COREF_MEMBERS ) ) { + argumentIDs = memberSplitter.split( attributeValue ); + break; + } + } + if ( argumentIDs == null || argumentIDs.length == 0 ) { + LOGGER.severe( "Relation " + relationId + " has no Members" ); + continue; + } + if ( argumentIDs.length == 1 ) { + LOGGER.severe( "Relation " + relationId + " has only one Member " + argumentIDs[ 0 ] ); + continue; + } + for ( String argumentId : argumentIDs ) { + final String realSource = relationArgMap.get( argumentId ); + if ( realSource == null ) { + LOGGER.severe( "Relation " + relationId + + " has no Source " + argumentId ); + continue; + } + entityIds.add( realSource ); + } + final Collection textSpans = new ArrayList<>( 2 ); + for ( String entityId : entityIds ) { + final Entity entity = entityMap.get( entityId ); + if ( entity == null ) { + LOGGER.severe( "Relation " + relationId + " Entity ID " + entityId + " does not exist" ); + continue; + } + textSpans.add( entity.getTextSpan() ); + } + if ( textSpans.size() > 1 ) { + 
corefList.add( textSpans ); + } + } + return Collections.unmodifiableList( corefList ); + } + + + /** + * {@inheritDoc} + */ + @Override + protected TextSpan createEntityTextSpan( final Element spanElement ) { + int begin = 0; + int end = 0; + try { + begin = spanElement.getAttribute( BEGIN ).getIntValue(); + end = spanElement.getAttribute( END ).getIntValue(); + } catch ( JDOMException jdomE ) { + LOGGER.severe( jdomE.getMessage() ); + return BAD_TEXT_SPAN; + } + return new DefaultTextSpan( begin, end ); + } + + /** + * Anafora XML does not provide actual text, but the document text may be known. + * If the document text is known then this simply returns a substring, otherwise a string of 'A' + * + * @param textSpan - + * @return The spanned text within provided document text, or a String filled with character 'A' + */ + static private String getSpannedText( final TextSpan textSpan, final String documentText ) { + if ( documentText == null || documentText.isEmpty() ) { + return fakeSomeText( textSpan ); + } + final int startIndex = textSpan.getStartIndex(); + final int endIndex = textSpan.getEndIndex(); + if ( startIndex >= 0 && endIndex < documentText.length() ) { + return documentText.substring( startIndex, endIndex ); + } + return fakeSomeText( textSpan ); + } + + /** + * Anafora XML does not provide actual text, so we need to fake it. + * This will knock some of the IAA capabilities, such as Alpha computations based upon word count, + * marked comparison by word count, etc. 
+ * + * @param textSpan - + * @return A String the length of the textSpan filled with the character 'A' + */ + static private String fakeSomeText( final TextSpan textSpan ) { + if ( textSpan instanceof DiscontiguousTextSpan ) { + final TextSpan jointTextSpan = new DefaultTextSpan( textSpan.getStartIndex(), textSpan.getEndIndex() ); + return fakeSomeText( jointTextSpan ); + } + final char[] chars = new char[ textSpan.getLength() ]; + Arrays.fill( chars, 'A' ); + return String.valueOf( chars ); + } + + + static private List getChildren( final Element rootElement, final String name ) { + final List rootChildren = rootElement.getChildren(); + final List children = new ArrayList<>( rootChildren.size() ); + for ( Object child : rootChildren ) { + if ( child instanceof Element && ((Element)child).getName().equals( name ) ) { + children.add( (Element)child ); + } + } + return children; + } + + static private Element getChild( final Element rootElement, final String name ) { + final List children = getChildren( rootElement, name ); + if ( children.isEmpty() ) { + return null; + } + return children.get( 0 ); + } + + + static private void testParse( final String filePath ) { + final SAXBuilder saxBuilder = new SAXBuilder(); + try { + final Document document = saxBuilder.build( filePath ); + final Element rootElement = document.getRootElement(); + final List rootChildren = rootElement.getChildren(); + for ( Object child : rootChildren ) { + if ( child instanceof Element ) { + final Element element = (Element)child; + System.out.println( element.getName() ); + final List stuff = element.getAttributes(); + for ( Object thing : stuff ) { + if ( thing instanceof Attribute ) { + final Attribute attribute = (Attribute)thing; + System.out.println( "\t" + attribute.getName() + " = " + attribute.getValue() ); + } + } + } else { + LOGGER.warning( " NOT ELEMENT " + child.toString() ); + } + } + getDocumentText( rootElement ); + } catch ( JDOMException | IOException multE ) { + 
LOGGER.severe( multE.getMessage() ); + } + } + + + public static void main( String[] args ) { + final File inputDir = new File( INPUT_DIR_PATH ); + final File[] files = inputDir.listFiles(); + if ( files == null ) { + LOGGER.warning( "No files in " + inputDir.getPath() ); + System.exit( 0 ); + } + for ( File file : files ) { +// if ( !file.getName().endsWith( ".old" ) ) { + if ( !file.getName().endsWith( "_report_4.txt.xmi.old" ) ) { + continue; + } + testParse( file.getPath() ); + break; + } + } + +} Added: ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/XmiEntitySegregator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/XmiEntitySegregator.java?rev=1660963&view=auto ============================================================================== --- ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/XmiEntitySegregator.java (added) +++ ctakes/sandbox/timelanes/org/chboston/cnlp/xmi/parser/XmiEntitySegregator.java Thu Feb 19 18:06:13 2015 @@ -0,0 +1,74 @@ +package org.chboston.cnlp.xmi.parser; + +import org.chboston.cnlp.nlp.annotation.annotation.AnnotationSpanComparator; +import org.chboston.cnlp.nlp.annotation.classtype.ClassType; +import org.chboston.cnlp.nlp.annotation.classtype.TemporalClassType; +import org.chboston.cnlp.nlp.annotation.entity.Entity; +import org.chboston.cnlp.nlp.annotation.textspan.TextSpan; + +import java.util.*; + +/** + * Author: SPF + * Affiliation: CHIP-NLP + * Date: 3/28/13 + */ +final public class XmiEntitySegregator { + + private XmiEntitySegregator() { + } + + /** + * @param entities collection of entities + * @return all named entities with the given collection of entities + */ + static public List getNamedEntities( final Iterable entities ) { + final List namedEntityList = new ArrayList<>(); + for ( Entity entity : entities ) { + final ClassType type = entity.getClassType(); + if ( type != TemporalClassType.EVENT && type != TemporalClassType.TIMEX ) { + namedEntityList.add( 
entity ); + } + } + Collections.sort( namedEntityList, AnnotationSpanComparator.getInstance() ); + return Collections.unmodifiableList( namedEntityList ); + } + + /** + * @param entities collection of entities + * @return all events with the given collection of entities + */ + static public List getEvents( final Iterable entities ) { + final List eventList = new ArrayList<>(); + for ( Entity entity : entities ) { + final ClassType type = entity.getClassType(); + if ( type == TemporalClassType.EVENT ) { + eventList.add( entity ); + } + } + Collections.sort( eventList, AnnotationSpanComparator.getInstance() ); + return Collections.unmodifiableList( eventList ); + } + + /** + * @param entities collection of entities + * @return all timex3 times with the given collection of entities + */ + static public List getTimes( final Iterable entities ) { + // XMI has Date, Time, and Timex3 annotations. Many will overlap. We only want one per textSpan. + final Map textSpanTimes = new HashMap<>(); + for ( Entity entity : entities ) { + final ClassType type = entity.getClassType(); + if ( type == TemporalClassType.TIMEX ) { + textSpanTimes.put( entity.getTextSpan(), entity ); + } + } + final List timexList = new ArrayList<>(); + for ( Entity entity : textSpanTimes.values() ) { + timexList.add( entity ); + } + Collections.sort( timexList, AnnotationSpanComparator.getInstance() ); + return Collections.unmodifiableList( timexList ); + } + +}