Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id C264310B6C for ; Fri, 3 Jan 2014 23:23:31 +0000 (UTC) Received: (qmail 91524 invoked by uid 500); 3 Jan 2014 23:23:31 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 91492 invoked by uid 500); 3 Jan 2014 23:23:31 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 91485 invoked by uid 99); 3 Jan 2014 23:23:31 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 03 Jan 2014 23:23:31 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 03 Jan 2014 23:23:22 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 4C4FF23889EC; Fri, 3 Jan 2014 23:23:00 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1555281 [2/3] - in /ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima: annotators/ dao/ lookup/ lookup/ae/ mapper/ model/ resource/ Date: Fri, 03 Jan 2014 23:22:59 -0000 To: commits@ctakes.apache.org From: vjapache@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140103232300.4C4FF23889EC@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceSpan.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceSpan.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceSpan.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceSpan.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,359 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.ctakes.ytex.uima.annotators; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * A span of text and its offsets within some larger text + */ +public class SentenceSpan { + + public static String LF = "\n"; + public static String CR = "\r"; + public static String CRLF = "\r\n"; + + private int start; // offset of text within larger text + private int end; // offset of end of text within larger text + private String text; + private static final Pattern dotPattern = Pattern.compile("\\."); + private static final Pattern nonWhiteSpacePattern = Pattern.compile("\\S"); + + public SentenceSpan(int s, int e, String t){ + start = s; + end = e; + text = t; + } + + /** + * Set offset of start of this span within the larger text + */ + public void setStart(int in){ + start = in; + } + + /** + * + * Set offset of end of this span within the larger text + */ + public void setEnd(int in){ + end = in; + } + + public void setText(String in){ + text = in; + } + + public int getStart() {return start;} + public int getEnd() {return end;} + public String getText() {return text;} + + +// /** +// * If the span contains splitChar, +// * create a List of the (sub)spans separated by splitChar, and trimmed. +// * Otherwise return a List containing just this SentenceSpan, trimmed. +// * @param splitChar (written to be general, but probably newline) +// * @return List +// */ +// public List splitSpan(char splitChar) { +// ArrayList subspans = new ArrayList(); +// int nlPosition; +// +//// nlPosition = text.indexOf(splitChar); //JZ +//// if (nlPosition < 0) { +//// subspans.add(this); //JZ: should trim as specified in the JavaDoc +//// return subspans; +//// } +// +// int subspanStart = 0; // +// int relativeSpanEnd = end-start; +// int subspanEnd = -1; +// int trimmedSubspanEnd = -1; +// +// try { +// while (subspanStart < relativeSpanEnd) { +// String subString = text.substring(subspanStart, relativeSpanEnd); +// nlPosition = subString.indexOf(splitChar); +// if (nlPosition < 0) { +// subspanEnd = relativeSpanEnd; +// } +// else { +// subspanEnd = nlPosition + subspanStart; +// } +// String coveredText = text.substring(subspanStart, subspanEnd); +// coveredText = coveredText.trim(); +// // old len = (ssend-ssstart) +// // new len = ct.len +// // new e = ssstart+newlen +// trimmedSubspanEnd = subspanStart + coveredText.length(); +// subspans.add(new SentenceSpan(subspanStart+start, trimmedSubspanEnd+start, coveredText)); +// subspanStart = subspanEnd+1; // skip past newline +// } +// } +// catch (java.lang.StringIndexOutOfBoundsException iobe) { +// System.err.println("splitChar as int = " + (int)splitChar); +// this.toString(); +// System.err.println("subspanStart = " + subspanStart); +// System.err.println("relativeSpanEnd = " + relativeSpanEnd); +// System.err.println("subspanEnd = " + subspanEnd); +// System.err.println("trimmedSubspanEnd = " + trimmedSubspanEnd); +// System.err.println("splitChar as int = " + (int)splitChar); +// iobe.printStackTrace(); +// throw iobe; +// } +// return subspans; +// } + + /** + * Trim any leading or trailing whitespace. + * If there are any end-of-line characters in what's left, split into multiple smaller sentences, + * and trim each. + * If is entirely whitespace, return an empty list + * @param separatorPattern CR LF or CRLF + */ + public List splitAtLineBreaksAndTrim(String separatorPattern) { + + ArrayList subspans = new ArrayList(); + + // Validate input parameter + if (!separatorPattern.equals(LF) && !separatorPattern.equals(CR) && !separatorPattern.equals(CRLF)) { + + int len = separatorPattern.length(); + System.err.println("Invalid line break: " + len + " characters long."); + + System.err.print(" line break character values: "); + for (int i=0; i0) { + positionOfNonWhiteSpace = s.indexOf(t.charAt(0)); + } else { + positionOfNonWhiteSpace = 0; + } + // Might have trimmed off some at the beginning of the sentences other than the 1st (#0) + position += positionOfNonWhiteSpace; // sf Bugs artifact 3083903: For _each_ sentence, advance past any spaces at beginning of line + subspans.add(new SentenceSpan(position, position+t.length(), t)); + position += (s.length()-positionOfNonWhiteSpace + separatorPattern.length()); + } + + return subspans; + + } + + + public String toString() { + String s = "(" + start + ", " + end + ") " + text; + return s; + } + + + /** + * vng added + * + * @return + */ + public List splitAtPeriodAndTrim(Pattern acronymPattern, + Pattern periodPattern, Pattern splitPattern) { + ArrayList subspans = new ArrayList(); + if (acronymPattern == null && periodPattern == null) { + // don't split at periods + subspans.add(this); + } else { + // Check first if contains only whitespace, in which case return an + // empty list + String coveredText = text.substring(0, end - start); + String trimmedText = coveredText.trim(); + int trimmedLen = trimmedText.length(); + if (trimmedLen == 0) { + return subspans; + } + + // If there is any leading or trailing whitespace, determine + // position of + // the trimmed section + int trimmedStart = start; + // int trimmedEnd = end; + int positionOfNonWhiteSpace = 0; + if (trimmedLen != coveredText.length()) { + // Use indexOf to skip past the white space. + // Consider looking through looking characters using + // Character.isWhiteSpace(ch) + positionOfNonWhiteSpace = coveredText.indexOf(trimmedText); + trimmedStart = start + positionOfNonWhiteSpace; + // trimmedEnd = trimmedStart + trimmedLen; + } + + // Split into multiple sentences if contains end-of-line characters + // or return just one sentence if no end-of-line characters are + // within + // the trimmed string + Matcher dotMatcher = dotPattern.matcher(trimmedText); + int position = trimmedStart; + int currentStartPos = 0; + while (dotMatcher.find()) { + // found a period within the span + // see if an acronym precedes it + boolean ok = true; + if (acronymPattern != null && dotMatcher.start() > 0) { + String precedingText = trimmedText.substring( + currentStartPos, dotMatcher.start()); + ok = !acronymPattern.matcher(precedingText).find(); + } + // acronym not preceding period + // make sure the subsequent text matches the specified + // pattern + if (ok && periodPattern != null + && dotMatcher.end() < trimmedText.length()) { + String followingText = trimmedText.substring(dotMatcher + .end()); + ok = periodPattern.matcher(followingText).find(); + } + if (ok) { + // ok to split on this period + String t = trimmedText.substring(currentStartPos, + dotMatcher.end()); + subspans.add(new SentenceSpan(position + currentStartPos, + position + currentStartPos + t.length(), t)); + currentStartPos += t.length(); + if (currentStartPos < trimmedText.length()) { + // skip ahead to next non-whitespace character + Matcher nwsMatcher = nonWhiteSpacePattern + .matcher(trimmedText.substring(currentStartPos)); + if (nwsMatcher.find()) { + currentStartPos += nwsMatcher.start(); + } + } + } + } + + // String spans[] = periodPattern.split(trimmedText); + // int position = trimmedStart; + // Matcher matcher = periodPattern.matcher(trimmedText); + // int currentStartPos = 0; + // while (matcher.find()) { + // // matcher.start() + 1 because we want to include the "." + // String t = trimmedText.substring(currentStartPos, + // matcher.start() + 1); + // subspans.add(new SentenceSpan(position + currentStartPos, + // position + currentStartPos + t.length() + 1, t)); + // // matcher.end() - 1 because we want to include the 1st letter + // // of + // // the sentence + // currentStartPos += (matcher.end() - currentStartPos - 1); + // } + if (currentStartPos < trimmedText.length()) { + String t = trimmedText.substring(currentStartPos); + subspans.add(new SentenceSpan(position + currentStartPos, + position + currentStartPos + t.length(), t)); + } + } + return splitSubspans(subspans, splitPattern); + } + + /** + * vng added + * + * @return + */ + public List splitSubspans(List subspans, + Pattern splitPattern) { + List splitSubspans = new ArrayList(); + if (splitPattern == null) { + splitSubspans.addAll(subspans); + } else { + // Split into multiple sentences if contains end-of-line characters + // or return just one sentence if no end-of-line characters are + // within + // the trimmed string + for (SentenceSpan span : subspans) { + String trimmedText = span.getText(); + boolean bSplit = false; + Matcher matcher = splitPattern.matcher(trimmedText); + int position = span.getStart(); + int currentStartPos = 0; + while (matcher.find()) { + bSplit = true; + if (matcher.start() > currentStartPos) { + String t = trimmedText.substring(currentStartPos, + matcher.start()); + splitSubspans.add(new SentenceSpan(position + + currentStartPos, position + currentStartPos + + t.length(), t)); + currentStartPos += t.length(); + } + } + if (bSplit) { + if (currentStartPos < trimmedText.length()) { + String t = trimmedText.substring(currentStartPos); + splitSubspans.add(new SentenceSpan(position + + currentStartPos, position + currentStartPos + + t.length(), t)); + } + } else + splitSubspans.add(span); + } + } + return splitSubspans; + } + + +} Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDao.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDao.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDao.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDao.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,22 @@ +package org.apache.ctakes.ytex.uima.dao; + + +import org.apache.ctakes.ytex.uima.model.Document; + +/** + * DAO interface for accessing Document objects. + * @author vijay + * + */ +public interface DocumentDao { + + /** + * Retrieve document by primary key + * @param documentID + * @return + */ + public abstract Document getDocument(int documentID); + +// public abstract Integer saveDocument(JCas jcas, String analysisBatch); + +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDaoImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDaoImpl.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDaoImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/DocumentDaoImpl.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,32 @@ +package org.apache.ctakes.ytex.uima.dao; + + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.ctakes.ytex.uima.model.Document; +import org.hibernate.SessionFactory; + +public class DocumentDaoImpl implements DocumentDao { + private SessionFactory sessionFactory; + private static final Log log = LogFactory.getLog(DocumentDaoImpl.class); + + public SessionFactory getSessionFactory() { + return sessionFactory; + } + + public void setSessionFactory(SessionFactory sessionFactory) { + this.sessionFactory = sessionFactory; + } + + + /* + * (non-Javadoc) + * + * @see gov.va.vacs.esld.dao.DocumentDao#getDocument(int) + */ + public Document getDocument(int documentID) { + return (Document) this.sessionFactory.getCurrentSession().get( + Document.class, documentID); + } + +} Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDao.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDao.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDao.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDao.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,17 @@ +package org.apache.ctakes.ytex.uima.dao; + + +import java.util.List; + +import org.apache.ctakes.ytex.uima.model.NamedEntityRegex; + +/** + * Dao to access NamedEntity Regular Expressions used by the NamedEntityRegexAnnotator + * @author vijay + * + */ +public interface NamedEntityRegexDao { + + public abstract List getNamedEntityRegexs(); + +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDaoImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDaoImpl.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDaoImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/NamedEntityRegexDaoImpl.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,25 @@ +package org.apache.ctakes.ytex.uima.dao; + +import java.util.List; + +import org.apache.ctakes.ytex.uima.model.NamedEntityRegex; +import org.hibernate.SessionFactory; + + +public class NamedEntityRegexDaoImpl implements NamedEntityRegexDao { + public SessionFactory getSessionFactory() { + return sessionFactory; + } + public void setSessionFactory(SessionFactory sessionFactory) { + this.sessionFactory = sessionFactory; + } + private SessionFactory sessionFactory; + /* (non-Javadoc) + * @see gov.va.vacs.esld.dao.NamedEntityRegexDao#getNamedEntityRegexs() + */ + @SuppressWarnings("unchecked") + public List getNamedEntityRegexs() { + return (List)sessionFactory.getCurrentSession().createQuery("from NamedEntityRegex").list(); + } + +} Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDao.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDao.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDao.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDao.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,21 @@ +package org.apache.ctakes.ytex.uima.dao; + + +import java.util.List; + +import org.apache.ctakes.ytex.uima.model.SegmentRegex; + +/** + * Dao to access Segment Boundary Regular Expressions. + * Used by SegmentRegexAnnotator. + * @author vijay + * + */ +public interface SegmentRegexDao { + + /* (non-Javadoc) + * @see gov.va.vacs.esld.dao.NamedEntityRegexDao#getNamedEntityRegexs() + */ + public abstract List getSegmentRegexs(); + +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDaoImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDaoImpl.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDaoImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/dao/SegmentRegexDaoImpl.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,24 @@ +package org.apache.ctakes.ytex.uima.dao; + + +import java.util.List; + +import org.apache.ctakes.ytex.uima.model.SegmentRegex; +import org.hibernate.SessionFactory; + +public class SegmentRegexDaoImpl implements SegmentRegexDao { + public SessionFactory getSessionFactory() { + return sessionFactory; + } + public void setSessionFactory(SessionFactory sessionFactory) { + this.sessionFactory = sessionFactory; + } + private SessionFactory sessionFactory; + /* (non-Javadoc) + * @see gov.va.vacs.esld.dao.SegmentRegex#getSegmentRegexs() + */ + @SuppressWarnings("unchecked") + public List getSegmentRegexs() { + return (List)sessionFactory.getCurrentSession().createQuery("from SegmentRegex").list(); + } +} Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/FirstTokenPermLookupInitializerImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/FirstTokenPermLookupInitializerImpl.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/FirstTokenPermLookupInitializerImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/FirstTokenPermLookupInitializerImpl.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,315 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.ctakes.ytex.uima.lookup.ae; + +import org.apache.ctakes.core.util.JCasUtil; +import org.apache.ctakes.dictionary.lookup.DictionaryEngine; +import org.apache.ctakes.dictionary.lookup.ae.LookupAnnotationToJCasAdapter; +import org.apache.ctakes.dictionary.lookup.ae.LookupInitializer; +import org.apache.ctakes.dictionary.lookup.algorithms.FirstTokenPermutationImpl; +import org.apache.ctakes.dictionary.lookup.algorithms.LookupAlgorithm; +import org.apache.ctakes.dictionary.lookup.phrasebuilder.PhraseBuilder; +import org.apache.ctakes.dictionary.lookup.phrasebuilder.VariantPhraseBuilderImpl; +import org.apache.ctakes.dictionary.lookup.vo.LookupAnnotation; +import org.apache.ctakes.dictionary.lookup.vo.LookupToken; +import org.apache.ctakes.typesystem.type.syntax.*; +import org.apache.log4j.Logger; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.JFSIndexRepository; +import org.apache.uima.jcas.tcas.Annotation; + +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.util.*; + +/** + * @author Mayo Clinic + */ +public class FirstTokenPermLookupInitializerImpl implements LookupInitializer { + + static private final String TRUE_STRING = Boolean.toString( true ); + static private final String FALSE_STRING = Boolean.toString( false ); + + // LOG4J logger based on class name + final private Logger iv_logger = Logger.getLogger( getClass().getName() ); + + // properties for firstWordPermutation algorithm + static private final String TEXT_MFS_PRP_KEY = "textMetaFields"; + static private final String MAX_P_LEVEL_PRP_KEY = "maxPermutationLevel"; + static private final String WINDOW_ANNOT_PRP_KEY = "windowAnnotations"; + static private final String EXC_TAGS_PRP_KEY = "exclusionTags"; // optional + + static private final String CANONICAL_VARIANT_ATTR = "canonicalATTR"; + + final private Properties iv_props; + + // array of JCas window annotation type values + final private int[] iv_annotTypeArr; + + // set of exclusion POS tags (lower cased) + final private Set iv_exclusionTagSet; + + /* + * vng - to support lookup using stemmed words + */ + protected Constructor lookupTokenAdapterCtor = null; + /* + * vng - config key for lookupTokenAdapter class name + */ + private final String LOOKUP_TOKEN_ADAPTER = "lookupTokenAdapter"; + /** + * vng use the constructor identified during initialization to create the + * lookup token + * + * @param bta + * @return + * @throws AnnotatorInitializationException + */ + private LookupToken annoToLookupToken(Annotation bta) + throws AnnotatorInitializationException { + try { + return (LookupToken) lookupTokenAdapterCtor.newInstance(bta); + } catch (InvocationTargetException e) { + throw new AnnotatorInitializationException(e); + } catch (IllegalArgumentException e) { + throw new AnnotatorInitializationException(e); + } catch (InstantiationException e) { + throw new AnnotatorInitializationException(e); + } catch (IllegalAccessException e) { + throw new AnnotatorInitializationException(e); + } + } + + public FirstTokenPermLookupInitializerImpl( final UimaContext uimaContext, + final Properties props ) throws ClassNotFoundException, + IllegalAccessException, + NoSuchFieldException { + // TODO property validation could be done here + iv_props = props; + + // optional context window annotations + final String windowAnnots = iv_props.getProperty( WINDOW_ANNOT_PRP_KEY ); + if ( windowAnnots != null ) { + String[] windowAnnotArr = windowAnnots.split( "\\|" ); + iv_annotTypeArr = new int[windowAnnotArr.length]; + for ( int i = 0; i < windowAnnotArr.length; i++ ) { + iv_annotTypeArr[i] = JCasUtil.getType( windowAnnotArr[i] ); + } + } else { + iv_annotTypeArr = null; + } + + // optional exclusion POS tags + final String tagStr = iv_props.getProperty( EXC_TAGS_PRP_KEY ); + if ( tagStr != null ) { + iv_exclusionTagSet = new HashSet(); + final String[] tagArr = tagStr.split( "," ); + for ( String tag : tagArr ) { + iv_exclusionTagSet.add( tag.toLowerCase() ); + } + iv_logger.info( "Exclusion tagset loaded: " + iv_exclusionTagSet ); + } else { + iv_exclusionTagSet = null; + } + // vng change - get the lookupTokenAdapter class name from the + // configuration properties + // this is to support stemming + String lookupTokenAdapterClazz = this.iv_props.getProperty( + LOOKUP_TOKEN_ADAPTER, + LookupAnnotationToJCasAdapter.class.getName()); + try { + this.lookupTokenAdapterCtor = Class + .forName(lookupTokenAdapterClazz).getConstructor( + Annotation.class); + } catch (NoSuchMethodException nsme) { + throw new ClassNotFoundException(lookupTokenAdapterClazz, nsme); + } + + } + + /** + * {@inheritDoc} + */ + @Override + public LookupAlgorithm getLookupAlgorithm( final DictionaryEngine dictEngine ) + throws AnnotatorInitializationException { + final String textMetaFields = iv_props.getProperty( TEXT_MFS_PRP_KEY ); + String[] textMetaFieldNameArr; + if ( textMetaFields == null ) { + textMetaFieldNameArr = new String[0]; + } else { + textMetaFieldNameArr = textMetaFields.split( "\\|" ); + } + // variant support + final String[] variantArr = {CANONICAL_VARIANT_ATTR}; + final PhraseBuilder pb = new VariantPhraseBuilderImpl( variantArr, true ); + final int maxPermutationLevel = Integer.parseInt( iv_props.getProperty( MAX_P_LEVEL_PRP_KEY ) ); + return new FirstTokenPermutationImpl( dictEngine, pb, textMetaFieldNameArr, maxPermutationLevel ); + } + + private boolean isTagExcluded( final String tag ) { + return iv_exclusionTagSet != null && tag != null && iv_exclusionTagSet.contains( tag.toLowerCase() ); + } + + /** + * {@inheritDoc} + */ + @Override + public Iterator getLookupTokenIterator( final JCas jcas ) throws AnnotatorInitializationException { + final List ltList = new ArrayList(); + + final JFSIndexRepository indexes = jcas.getJFSIndexRepository(); + final AnnotationIndex annotationIndex = indexes.getAnnotationIndex( BaseToken.type ); + for ( Annotation annotation : annotationIndex ) { + if ( !(annotation instanceof BaseToken) ) { + iv_logger.warn( getClass().getName() + " getLookupTokenIterator(..) Annotation is not a BaseToken" ); + continue; + } + final boolean isNonLookup = annotation instanceof NewlineToken + || annotation instanceof PunctuationToken + || annotation instanceof ContractionToken + || annotation instanceof SymbolToken; + if ( isNonLookup ) { + continue; + } + final BaseToken bta = (BaseToken) annotation; + final LookupToken lt = new LookupAnnotationToJCasAdapter( bta ); + // POS exclusion logic for first word lookup + if ( isTagExcluded( bta.getPartOfSpeech() ) ) { + lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, FALSE_STRING ); + } else { + lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, TRUE_STRING ); + } + if ( bta instanceof WordToken ) { + final WordToken wta = (WordToken) bta; + final String canonicalForm = wta.getCanonicalForm(); + if ( canonicalForm != null ) { + lt.addStringAttribute( CANONICAL_VARIANT_ATTR, canonicalForm ); + } + } + ltList.add( lt ); + } + return ltList.iterator(); + } + + /** + * {@inheritDoc} + */ + @Override + public Iterator getLookupWindowIterator( final JCas jcas ) throws AnnotatorInitializationException { + try { + final JFSIndexRepository indexes = jcas.getJFSIndexRepository(); + final String objClassName = iv_props.getProperty( WINDOW_ANNOT_PRP_KEY ); + int windowType; + try { + windowType = JCasUtil.getType( objClassName ); + } catch ( IllegalArgumentException iaE ) { + // thrown by JCasUtil.getType() + throw new AnnotatorInitializationException( iaE ); + } + return indexes.getAnnotationIndex( windowType ).iterator(); + } catch ( Exception e ) { + // TODO specify exceptions, get rid of the catch for "Exception" + throw new AnnotatorInitializationException( e ); + } + } + + /** + * {@inheritDoc} + */ + @Override + public Map> getContextMap( final JCas jcas, + final int windowBegin, final int windowEnd ) + throws AnnotatorInitializationException { + if ( iv_annotTypeArr == null ) { + return Collections.emptyMap(); + } + final List list = new ArrayList(); + // algorithm depends on a window for permutations + final JFSIndexRepository indexes = jcas.getJFSIndexRepository(); + for ( int annotationType : iv_annotTypeArr ) { + final Iterator itr = indexes.getAnnotationIndex( annotationType ).iterator(); + list.addAll( constrainToWindow( windowBegin, windowEnd, itr ) ); + } + final Map> m = new HashMap>( 1 ); + m.put( FirstTokenPermutationImpl.CTX_KEY_WINDOW_ANNOTATIONS, list ); + return m; + } + + /** + * Gets a list of LookupAnnotation objects within the specified window. + * + * @param annotItr - + * @return list of lookup annotations + */ + private List constrainToWindow( final int begin, final int end, + final Iterator annotItr ) + throws AnnotatorInitializationException + { + final List list = new ArrayList(); + while ( annotItr.hasNext() ) { + final Annotation annot = annotItr.next(); + // only consider if it's within the window + if ( (annot.getBegin() >= begin) && (annot.getEnd() <= end) ) { + // vng list.add( new LookupAnnotationToJCasAdapter( annot ) ); + list.add(annoToLookupToken(annot)); + } + } + return list; + } + + /** + * {@inheritDoc} + */ + @Override + public List getSortedLookupTokens( final JCas jcas, + final Annotation covering ) throws AnnotatorInitializationException { + final List ltList = new ArrayList(); + final List inputList = org.uimafit.util.JCasUtil.selectCovered( jcas, BaseToken.class, covering ); + for ( BaseToken bta : inputList ) { + final boolean isNonLookup = bta instanceof NewlineToken + || bta instanceof PunctuationToken + || bta instanceof ContractionToken + || bta instanceof SymbolToken; + if ( isNonLookup ) { + continue; + } + final LookupToken lt = new LookupAnnotationToJCasAdapter( bta ); + // POS exclusion logic for first word lookup + if ( isTagExcluded( bta.getPartOfSpeech() ) ) { + lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, FALSE_STRING ); + } else { + lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, TRUE_STRING ); + } + if ( bta instanceof WordToken ) { + final WordToken wta = (WordToken) bta; + final String canonicalForm = wta.getCanonicalForm(); + if ( canonicalForm != null ) { + lt.addStringAttribute( CANONICAL_VARIANT_ATTR, canonicalForm ); + } + } + ltList.add( lt ); + } + return ltList; + } + +} Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/StemmedLookupAnnotationToJCasAdapter.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/StemmedLookupAnnotationToJCasAdapter.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/StemmedLookupAnnotationToJCasAdapter.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/lookup/ae/StemmedLookupAnnotationToJCasAdapter.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,62 @@ +package org.apache.ctakes.ytex.uima.lookup.ae; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.ctakes.dictionary.lookup.vo.LookupAnnotation; +import org.apache.ctakes.dictionary.lookup.vo.LookupToken; +import org.apache.ctakes.typesystem.type.syntax.WordToken; +import org.apache.ctakes.ytex.tools.SetupAuiFirstWord; +import org.apache.uima.jcas.tcas.Annotation; + +/** + * allow dictionary lookup with stemmed words + * + * @author vijay + * + */ +public class StemmedLookupAnnotationToJCasAdapter implements LookupAnnotation, + LookupToken { + private Map iv_attrMap = new HashMap(); + + private Annotation iv_jcasAnnotObj; + + public StemmedLookupAnnotationToJCasAdapter(Annotation jcasAnnotObj) { + iv_jcasAnnotObj = jcasAnnotObj; + } + + public void addStringAttribute(String attrKey, String attrVal) { + iv_attrMap.put(attrKey, attrVal); + } + + public int getEndOffset() { + return iv_jcasAnnotObj.getEnd(); + } + + public int getLength() { + return getStartOffset() - getEndOffset(); + } + + public int getStartOffset() { + return iv_jcasAnnotObj.getBegin(); + } + + public String getStringAttribute(String attrKey) { + return (String) iv_attrMap.get(attrKey); + } + + /** + * if this is a word, return the stemmed word, if available - i.e. canonicalForm not null and not empty. + * else return the covered text. + * @see SetupAuiFirstWord + */ + public String getText() { + if (iv_jcasAnnotObj instanceof WordToken) { + WordToken wt = (WordToken) iv_jcasAnnotObj; + if (wt.getCanonicalForm() != null && wt.getCanonicalForm().length() > 0) + return wt.getCanonicalForm(); + } + return iv_jcasAnnotObj.getCoveredText(); + } + +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/AnnoMappingInfo.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/AnnoMappingInfo.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/AnnoMappingInfo.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/AnnoMappingInfo.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,138 @@ +package org.apache.ctakes.ytex.uima.mapper; + +import java.util.HashSet; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +public class AnnoMappingInfo { + String annoClassName; + Set columnMappingInfos; + ColumnMappingInfo coveredTextColumn; + SortedMap mapField = new TreeMap(); + + String sql; + + String tableName; + int uimaTypeId; + String uimaTypeIdColumnName; + + public AnnoMappingInfo() { + } + + /** + * copy values from other annoMappingInfo + * + * @param o + */ + public AnnoMappingInfo deepCopy() { + AnnoMappingInfo n = new AnnoMappingInfo(); + n.annoClassName = this.annoClassName; + n.tableName = this.tableName; + n.sql = this.sql; + n.coveredTextColumn = this.coveredTextColumn != null ? this.coveredTextColumn + .deepCopy() : null; + Set ciCopy = new HashSet(); + for (ColumnMappingInfo e : this.columnMappingInfos) { + ciCopy.add(e.deepCopy()); + } + n.setColumnMappingInfos(ciCopy); + return n; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + AnnoMappingInfo other = (AnnoMappingInfo) obj; + if (annoClassName == null) { + if (other.annoClassName != null) + return false; + } else if (!annoClassName.equals(other.annoClassName)) + return false; + return true; + } + + public String getAnnoClassName() { + return annoClassName; + } + + public Set getColumnMappingInfos() { + return columnMappingInfos; + } + + public ColumnMappingInfo getCoveredTextColumn() { + return coveredTextColumn; + } + + public SortedMap getMapField() { + return mapField; + } + + public String getSql() { + return sql; + } + + public String getTableName() { + return tableName; + } + + public int getUimaTypeId() { + return uimaTypeId; + } + + public String getUimaTypeIdColumnName() { + return uimaTypeIdColumnName; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + ((annoClassName == null) ? 0 : annoClassName.hashCode()); + return result; + } + + public void setAnnoClassName(String annoClassName) { + this.annoClassName = annoClassName; + } + + public void setColumnMappingInfos(Set columnMappingInfos) { + this.columnMappingInfos = columnMappingInfos; + for (ColumnMappingInfo ci : columnMappingInfos) { + this.mapField.put(ci.getColumnName(), ci); + } + } + + public void setCoveredTextColumn(ColumnMappingInfo coveredTextColumn) { + this.coveredTextColumn = coveredTextColumn; + } + + public void setSql(String sql) { + this.sql = sql; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public void setUimaTypeId(int uimaTypeId) { + this.uimaTypeId = uimaTypeId; + } + + public void setUimaTypeIdColumnName(String uimaTypeIdColumnName) { + this.uimaTypeIdColumnName = uimaTypeIdColumnName; + } + + @Override + public String toString() { + return "AnnoMappingInfo [mapField=" + mapField + ", tableName=" + + tableName + "]"; + } + +} Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/ColumnMappingInfo.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/ColumnMappingInfo.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/ColumnMappingInfo.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/ColumnMappingInfo.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,128 @@ +package org.apache.ctakes.ytex.uima.mapper; + +import org.apache.commons.beanutils.Converter; + +public class ColumnMappingInfo { + private String annoFieldName; + private String columnName; + private Converter converter; + private String jxpath; + private int size; + private int sqlType; + + private Class targetType; + + private String targetTypeName; + + public ColumnMappingInfo() { + } + + public ColumnMappingInfo deepCopy() { + ColumnMappingInfo n = new ColumnMappingInfo(); + n.annoFieldName = this.annoFieldName; + n.converter = this.converter; + n.columnName = this.columnName; + n.targetType = this.targetType; + n.targetTypeName = this.targetTypeName; + n.sqlType = this.sqlType; + n.jxpath = this.jxpath; + return n; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + ColumnMappingInfo other = (ColumnMappingInfo) obj; + if (columnName == null) { + if (other.columnName != null) + return false; + } else if (!columnName.equals(other.columnName)) + return false; + return true; + } + + public String getAnnoFieldName() { + return annoFieldName; + } + + public String getColumnName() { + return columnName; + } + + public Converter getConverter() { + return converter; + } + + public String getJxpath() { + return jxpath; + } + + public int getSize() { + return size; + } + + public int getSqlType() { + return sqlType; + } + + public Class getTargetType() { + return targetType; + } + + public String getTargetTypeName() { + return targetTypeName; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + ((columnName == null) ? 0 : columnName.hashCode()); + return result; + } + + public void setAnnoFieldName(String annoFieldName) { + this.annoFieldName = annoFieldName; + } + + public void setColumnName(String tableFieldName) { + this.columnName = tableFieldName; + } + + public void setConverter(Converter converter) { + this.converter = converter; + } + + public void setJxpath(String jxpath) { + this.jxpath = jxpath; + } + + public void setSize(int size) { + this.size = size; + } + + public void setSqlType(int sqlType) { + this.sqlType = sqlType; + } + + public void setTargetTypeName(String targetTypeName) { + this.targetTypeName = targetTypeName; + try { + this.targetType = Class.forName(targetTypeName); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + @Override + public String toString() { + return "ColumnMappingInfo [columnName=" + columnName + "]"; + } + +} Added: ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/DocumentMapperService.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/DocumentMapperService.java?rev=1555281&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/DocumentMapperService.java (added) +++ ctakes/branches/ytex/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/mapper/DocumentMapperService.java Fri Jan 3 23:22:58 2014 @@ -0,0 +1,21 @@ +package org.apache.ctakes.ytex.uima.mapper; + +import java.util.Set; + +import org.apache.uima.jcas.JCas; + +public interface DocumentMapperService { + + /** + * Save Document and all mapped annotations. + * + * @param jcas + * @param analysisBatch + * optional + * @return document id + */ + public abstract Integer saveDocument(JCas jcas, String analysisBatch, + boolean bStoreDocText, boolean bStoreCAS, + boolean bInsertAnnotationContainmentLinks, Set typesToIgnore); + +} \ No newline at end of file