Return-Path: X-Original-To: apmail-incubator-ctakes-commits-archive@minotaur.apache.org Delivered-To: apmail-incubator-ctakes-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id B57BEDD2F for ; Wed, 31 Oct 2012 05:28:57 +0000 (UTC) Received: (qmail 52337 invoked by uid 500); 31 Oct 2012 05:28:57 -0000 Delivered-To: apmail-incubator-ctakes-commits-archive@incubator.apache.org Received: (qmail 52278 invoked by uid 500); 31 Oct 2012 05:28:56 -0000 Mailing-List: contact ctakes-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ctakes-dev@incubator.apache.org Delivered-To: mailing list ctakes-commits@incubator.apache.org Received: (qmail 52232 invoked by uid 99); 31 Oct 2012 05:28:55 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 31 Oct 2012 05:28:55 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 31 Oct 2012 05:28:50 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id E508E2388C73; Wed, 31 Oct 2012 05:27:18 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1403989 [21/28] - in /incubator/ctakes/branches/SHARPn-cTAKES: Constituency Parser/src/org/chboston/cnlp/ctakes/parser/ Constituency Parser/src/org/chboston/cnlp/ctakes/parser/uima/ae/ Constituency Parser/src/org/chboston/cnlp/ctakes/parse... Date: Wed, 31 Oct 2012 05:26:55 -0000 To: ctakes-commits@incubator.apache.org From: james-masanz@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20121031052718.E508E2388C73@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Modified: incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/FirstTokenPermutationImpl.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/dictionary%20lookup/src/edu/mayo/bmi/lookup/algorithms/FirstTokenPermutationImpl.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/FirstTokenPermutationImpl.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/FirstTokenPermutationImpl.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,690 +14,690 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.lookup.algorithms; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.log4j.Logger; - -import edu.mayo.bmi.dictionary.DictionaryEngine; -import edu.mayo.bmi.dictionary.MetaDataHit; -import edu.mayo.bmi.lookup.phrasebuilder.PhraseBuilder; -import edu.mayo.bmi.lookup.vo.LookupAnnotation; -import edu.mayo.bmi.lookup.vo.LookupHit; -import edu.mayo.bmi.lookup.vo.LookupToken; - -/** - * OVERVIEW: Each LookupToken is fed into a "first token" Dictionary. A - * hit indicates an anchor and the window around this anchor is based on - * context. This hit also contains all the presentations from the Dictionary - * where the "first token" is contained. - * - * The window is determined by finding the largest overlapping context window - * annotation. Permutations of LookupTokens found within this window are used to - * match against the presentations found earlier. If context window annotations - * are not provided, a fixed window is used based on the specified max - * permutation level. - * - * OPTIONAL CONTEXT: context window annotations - * - * @author Mayo Clinic - */ -public class FirstTokenPermutationImpl implements LookupAlgorithm -{ - // LOG4J logger based on class name - private Logger iv_logger = Logger.getLogger(getClass().getName()); - - /** - * Key value for context map. Value is expected to be a List of - * LookupAnnotation objects in sorted order. - */ - public static final String CTX_KEY_WINDOW_ANNOTATIONS = "WINDOW_ANNOTATIONS"; - - /** - * Key value for LookupToken attribute. Value is expected to be either TRUE - * or FALSE. This indicates whether to use this token for a "first token" - * lookup or not. This is optional. - */ - public static final String LT_KEY_USE_FOR_LOOKUP = "USE_FOR_LOOKUP"; - - private DictionaryEngine iv_firstTokenDictEngine; - private PhraseBuilder iv_phrBuilder; - - private int iv_maxPermutationLevel; - // key = level Integer, value = Permutation list - private Map iv_permCacheMap = new HashMap(); - - private String[] iv_textMetaFieldNames; - - /** - * Constructor - * - * @param firstTokenDictEngine - * Dictionary that is indexed against first tokens. - * @param phraseBuilder - * Builds phrases to match against Dictionary. - * @param textMetaFieldNames - * MetaFieldNames used to extract presentations. - * @param maxPermutationLevel - * Max permutation Level allowed. - */ - public FirstTokenPermutationImpl(DictionaryEngine firstTokenDictEngine, - PhraseBuilder phraseBuilder, String textMetaFieldNames[], - int maxPermutationLevel) - { - iv_firstTokenDictEngine = firstTokenDictEngine; - iv_phrBuilder = phraseBuilder; - iv_textMetaFieldNames = textMetaFieldNames; - - iv_maxPermutationLevel = maxPermutationLevel; - for (int i = 0; i <= maxPermutationLevel; i++) - { - Integer level = new Integer(i); - List permList = PermutationUtil.getPermutationList(i); - iv_permCacheMap.put(level, permList); - } - } - - /** - * Implementation of algorithm. - */ - public Collection lookup(List ltList, Map ctxMap) throws Exception - { - // setup optional window context data - boolean useWindowAnnots = false; - List wAnnotList = getWindowAnnotations(ctxMap); - if (wAnnotList.size() > 0) - { - useWindowAnnots = true; - } - Map wStartOffsetMap = getStartOffsetMap(wAnnotList, true); - Map wEndOffsetMap = getEndOffsetMap(wAnnotList, true); - - Map ltListIndexMap = getListIndexMap(ltList); - Map ltStartOffsetMap = getStartOffsetMap(ltList, true); - Map ltEndOffsetMap = getEndOffsetMap(ltList, true); - - List lhList = new ArrayList(); - for (int ltIdx = 0; ltIdx < ltList.size(); ltIdx++) - { - LookupToken lt = (LookupToken) ltList.get(ltIdx); - - Boolean useForLookup = Boolean.valueOf(lt.getStringAttribute(LT_KEY_USE_FOR_LOOKUP)); - - if ((useForLookup == null) || (useForLookup.booleanValue())) - { - Collection mdhCol = getFirstTokenHits(lt); - - if ((mdhCol != null) && (mdhCol.size() > 0)) - { - int wEndOffset = -1; - if (useWindowAnnots) - { - // get the largest overlapping window annotation - LookupAnnotation wAnnot = getLargestWindowAnnotation( - ltIdx, - lt, - ltStartOffsetMap, - ltEndOffsetMap, - ltListIndexMap, - wStartOffsetMap, - wEndOffsetMap); - if (wAnnot != null) - { - wEndOffset = wAnnot.getEndOffset(); - } - } - if (wEndOffset == -1) - { - iv_logger.debug("Window size set to max perm level."); - wEndOffset = getFixedWindowEndOffset(ltIdx, lt, ltList); - } - - List endLookupTokenList = getLookupTokenList( - wEndOffset, - ltEndOffsetMap, - false); - LookupToken endLookupToken = (LookupToken) endLookupTokenList.get(endLookupTokenList.size() - 1); - - int startTokenIdx = ltIdx; - int endTokenIdx = ((Integer) ltListIndexMap.get(endLookupToken)).intValue(); - - // list of LookupToken objects bound by the window - List wLookupTokenList = ltList.subList( - startTokenIdx, - endTokenIdx + 1); - - // use permutation algorithm to find any hits inside the window - Collection lhCol = getLookupHits( - mdhCol, - wLookupTokenList, - new Integer(ltIdx - startTokenIdx)); - - lhList.addAll(lhCol); - } - } - } - - return lhList; - } - - private Collection getLookupHits( - Collection mdhCol, - List wLookupTokenList, - Integer firstTokenIndex) throws Exception - { - if ((wLookupTokenList.size() - 1) > iv_maxPermutationLevel) - { - iv_logger.debug("Beyond permutation cache size."); - return new ArrayList(); - } - - // build a list of index values (excludes index of first token) - List idxList = new ArrayList(); - for (int i = 0; i < wLookupTokenList.size(); i++) - { - if (i != firstTokenIndex.intValue()) - { - idxList.add(new Integer(i)); - } - } - - Collection permCol = (Collection) iv_permCacheMap.get(new Integer( - idxList.size())); - - List lhList = new ArrayList(); - - Map mdhMap = new HashMap(); - Iterator mdhItr = mdhCol.iterator(); - while (mdhItr.hasNext()) - { - MetaDataHit mdh = (MetaDataHit) mdhItr.next(); - for (int i = 0; i < iv_textMetaFieldNames.length; i++) - { - String text = mdh.getMetaFieldValue(iv_textMetaFieldNames[i]); - if (text != null) - { - text = text.toLowerCase(); - Set mdhSet = (Set) mdhMap.get(text); - if (mdhSet == null) - { - mdhSet = new HashSet(); - } - mdhSet.add(mdh); - mdhMap.put(text, mdhSet); - } - else - { - if (iv_logger.isDebugEnabled()) - { - iv_logger.debug("MetaField " - + iv_textMetaFieldNames[i] - + " contains no data."); - } - } - } - } - - LookupToken firstWordLookupToken = (LookupToken) wLookupTokenList.get(firstTokenIndex.intValue()); - - Iterator permItr = permCol.iterator(); - while (permItr.hasNext()) - { - // convert permutation idx back into LookupTokens - List tempList = new ArrayList(); - List permutation = (List) permItr.next(); - Iterator idxItr = permutation.iterator(); - while (idxItr.hasNext()) - { - int idx = ((Integer) idxItr.next()).intValue(); - if (idx <= firstTokenIndex.intValue()) - { - idx--; - } - LookupToken lt = (LookupToken) wLookupTokenList.get(idx); - tempList.add(lt); - } - - List singleTokenList = new ArrayList(); - singleTokenList.add(firstWordLookupToken); - String[] fwPerms = iv_phrBuilder.getPhrases(singleTokenList); - - String[] phrArr = iv_phrBuilder.getPhrases(tempList); - for (int i = 0; i < phrArr.length; i++) - { - for (int fwPermIdx = 0; fwPermIdx < fwPerms.length; fwPermIdx++) - { - StringBuffer phraseSB = new StringBuffer(); - phraseSB.append(fwPerms[fwPermIdx]); - phraseSB.append(' '); - phraseSB.append(phrArr[i]); - String phrase = phraseSB.toString().trim().toLowerCase(); - Set mdhSet = (Set) mdhMap.get(phrase); - if (mdhSet != null) - { - Iterator mdhIterator = mdhSet.iterator(); - while (mdhIterator.hasNext()) - { - MetaDataHit mdh = (MetaDataHit) mdhIterator.next(); - // figure out start and end offsets - Collections.sort(permutation); - - int startOffset; - if (permutation.size() > 0) - { - int firstIdx = ((Integer) permutation.get(0)).intValue(); - if (firstIdx <= firstTokenIndex.intValue()) - { - firstIdx--; - } - LookupToken lt = (LookupToken) wLookupTokenList.get(firstIdx); - if (lt.getStartOffset() < firstWordLookupToken.getStartOffset()) - { - startOffset = lt.getStartOffset(); - } - else - { - startOffset = firstWordLookupToken.getStartOffset(); - } - } - else - { - startOffset = firstWordLookupToken.getStartOffset(); - } - - int endOffset; - if (permutation.size() > 0) - { - int lastIdx = ((Integer) permutation.get(permutation.size() - 1)).intValue(); - if (lastIdx <= firstTokenIndex.intValue()) - { - lastIdx--; - } - LookupToken lt = (LookupToken) wLookupTokenList.get(lastIdx); - if (lt.getEndOffset() > firstWordLookupToken.getEndOffset()) - { - endOffset = lt.getEndOffset(); - } - else - { - endOffset = firstWordLookupToken.getEndOffset(); - } - } - else - { - endOffset = firstWordLookupToken.getEndOffset(); - } - - LookupHit lh = new LookupHit( - mdh, - startOffset, - endOffset); - - lhList.add(lh); - } - } - } - } - } - return lhList; - } - - /** - * Extracts the list of LookupAnnotation objects representing noun phrases - * from the context map. - * - * @param contextMap - * @return - */ - private List getWindowAnnotations(Map contextMap) - { - List list = (List) contextMap.get(CTX_KEY_WINDOW_ANNOTATIONS); - if ((list == null) || (list.size() == 0)) - { - iv_logger.debug("No context window annotations."); - return new ArrayList(); - } - return list; - } - - /** - * Determines the number of ListTokens are contained within the specified - * start and end offsets; - * - * @param ltStartOffsetMap - * @param ltEndOffsetMap - * @param ltListIndexMap - * @param startOffset - * @param endOffset - * @return - */ - private int getNumberOfListTokens( - Map ltStartOffsetMap, - Map ltEndOffsetMap, - Map ltListIndexMap, - int startOffset, - int endOffset) - { - List startLookupTokenList = getLookupTokenList( - startOffset, - ltStartOffsetMap, - true); - List endLookupTokenList = getLookupTokenList( - endOffset, - ltEndOffsetMap, - false); - - if ((startLookupTokenList == null) || (endLookupTokenList == null)) - { - iv_logger.debug("Invalid window:" + startOffset + "," + endOffset); - return -1; - } - LookupToken startLookupToken = (LookupToken) startLookupTokenList.get(0); - Integer startIdx = (Integer) ltListIndexMap.get(startLookupToken); - - LookupToken endLookupToken = (LookupToken) endLookupTokenList.get(endLookupTokenList.size() - 1); - Integer endIdx = (Integer) ltListIndexMap.get(endLookupToken); - - return endIdx.intValue() - startIdx.intValue() + 1; - } - - /** - * Attempts to get a list of LookupToken objects at the specified offset. If - * there are none, this method attempts to try nearby offsets based on the - * traversal direction. - * - * @param offset - * @param ltOffsetMap - * @param traverseRight - * @return - */ - private List getLookupTokenList( - int offset, - Map ltOffsetMap, - boolean traverseRight) - { - // first attempt the original offset, which will be the case most of the - // time - List lookupTokenList = (List) ltOffsetMap.get(new Integer(offset)); - if (lookupTokenList != null) - { - return lookupTokenList; - } - else - { - // otherwise traverse some nearby offsets and attempt to find a - // token - - // TODO hardcoded max offset window is 10 char - final int offsetWindow = 10; - - // build list of offsets to try - List offsetList = new ArrayList(); - if (traverseRight) - { - int max = offset + offsetWindow; - for (int i = offset; i <= max; i++) - { - offsetList.add(new Integer(i)); - } - } - else - { - int min = offset - offsetWindow; - for (int i = offset; i >= min; i--) - { - offsetList.add(new Integer(i)); - } - } - - Iterator offsetItr = offsetList.iterator(); - while (offsetItr.hasNext()) - { - Integer tempOffset = (Integer) offsetItr.next(); - lookupTokenList = (List) ltOffsetMap.get(tempOffset); - if (lookupTokenList != null) - { - return lookupTokenList; - } - } - } - // no tokens in window - return null; - } - - /** - * Determines the largest overlapping window annotation for the specified - * LookupToken. - * - * @param lt - * @param wStartOffsetMap - * @param wEndOffsetMap - * @return - */ - private LookupAnnotation getLargestWindowAnnotation( - int tokenIdx, - LookupToken lt, - Map ltStartOffsetMap, - Map ltEndOffsetMap, - Map ltListIndexMap, - Map wStartOffsetMap, - Map wEndOffsetMap) - { - Set startCandidateSet = new HashSet(); - Set endCandidateSet = new HashSet(); - - Iterator startItr = wStartOffsetMap.keySet().iterator(); - while (startItr.hasNext()) - { - Integer startOffset = (Integer) startItr.next(); - if (startOffset.intValue() <= lt.getStartOffset()) - { - List wAnnotList = (List) wStartOffsetMap.get(startOffset); - startCandidateSet.addAll(wAnnotList); - } - } - - Iterator endItr = wEndOffsetMap.keySet().iterator(); - while (endItr.hasNext()) - { - Integer endOffset = (Integer) endItr.next(); - if (endOffset.intValue() >= lt.getEndOffset()) - { - List wAnnotList = (List) wEndOffsetMap.get(endOffset); - endCandidateSet.addAll(wAnnotList); - } - } - - // union to get window annotations that are overlapping with LookupToken - startCandidateSet.retainAll(endCandidateSet); - - // find largest overlapping window annotation - LookupAnnotation largestWindowAnnot = null; - Iterator laItr = startCandidateSet.iterator(); - while (laItr.hasNext()) - { - LookupAnnotation tempLookupAnnot = (LookupAnnotation) laItr.next(); - if ((largestWindowAnnot == null) - || (tempLookupAnnot.getLength() > largestWindowAnnot.getLength())) - { - // now see if we can handle the size of this window (permutation - // wise) - int ltCount = getNumberOfListTokens( - ltStartOffsetMap, - ltEndOffsetMap, - ltListIndexMap, - tempLookupAnnot.getStartOffset(), - tempLookupAnnot.getEndOffset()); - - if ((ltCount <= iv_maxPermutationLevel) && (ltCount > 0)) - { - largestWindowAnnot = tempLookupAnnot; - } - else - { - if (iv_logger.isDebugEnabled()) - { - iv_logger.debug("Window size of " - + ltCount - + " exceeds the max permutation level of " - + iv_maxPermutationLevel - + "."); - } - } - } - } - - return largestWindowAnnot; - } - - private int getFixedWindowEndOffset( - int tokenIdx, - LookupToken lt, - List ltList) - { - int fixedEndOffset = 0; - - for (int i = tokenIdx; (i < tokenIdx + iv_maxPermutationLevel) - && (i < ltList.size()); i++) - { - LookupToken tempLookupToken = (LookupToken) ltList.get(i); - if (tempLookupToken != null) - { - fixedEndOffset = tempLookupToken.getEndOffset(); - } - } - return fixedEndOffset; - } - - /** - * Creates a map that binds an object from a list to its index position. - * - * @param list - * @return - */ - private Map getListIndexMap(List list) - { - Map m = new HashMap(); - - for (int i = 0; i < list.size(); i++) - { - Integer index = new Integer(i); - m.put(list.get(i), index); - } - - return m; - } - - /** - * Creates a map that uses the start offset to index the LookupAnnotation - * objects. If multiple LookupAnnotations can exist at the same start - * offset, then hasMultiples=true and the values with be a List of - * LookupAnnotation objects at that offset. - * - * @param lookupAnnotList - * @param hasMultiples - * @return - */ - private Map getStartOffsetMap(List lookupAnnotList, boolean hasMultiples) - { - Map m = new HashMap(); - - Iterator laItr = lookupAnnotList.iterator(); - while (laItr.hasNext()) - { - LookupAnnotation la = (LookupAnnotation) laItr.next(); - Integer key = new Integer(la.getStartOffset()); - if (hasMultiples) - { - List list = (List) m.get(key); - if (list == null) - { - list = new ArrayList(); - } - list.add(la); - m.put(key, list); - } - else - { - m.put(key, la); - } - } - - return m; - } - - /** - * Creates a map that uses the end offset to index the LookupAnnotation - * objects. If multiple LookupAnnotations can exist at the end start offset, - * then hasMultiples=true and the values with be a List of LookupAnnotation - * objects at that offset. - * - * @param lookupAnnotList - * @param hasMultiples - * @return - */ - private Map getEndOffsetMap(List lookupAnnotList, boolean hasMultiples) - { - Map m = new HashMap(); - - Iterator laItr = lookupAnnotList.iterator(); - while (laItr.hasNext()) - { - LookupAnnotation la = (LookupAnnotation) laItr.next(); - Integer key = new Integer(la.getEndOffset()); - if (hasMultiples) - { - List list = (List) m.get(key); - if (list == null) - { - list = new ArrayList(); - } - list.add(la); - m.put(key, list); - } - else - { - m.put(key, la); - } - } - - return m; - } - - /** - * Gets the hits for the specified LookupToken. This uses the first token Dictionary. - * - * @param firstLookupToken - * @return - * @throws Exception - */ - private Collection getFirstTokenHits(LookupToken firstLookupToken) - throws Exception - { - List singleLtList = new ArrayList(); - singleLtList.add(firstLookupToken); - - String[] phrases = iv_phrBuilder.getPhrases(singleLtList); - - Collection mdhCol = new ArrayList(); - for (int i = 0; i < phrases.length; i++) - { - Collection curMdhCol = iv_firstTokenDictEngine.metaLookup(phrases[i]); - - if (curMdhCol.size() > 0) - { - mdhCol.addAll(curMdhCol); - } - } - return mdhCol; - } -} \ No newline at end of file +package edu.mayo.bmi.lookup.algorithms; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.log4j.Logger; + +import edu.mayo.bmi.dictionary.DictionaryEngine; +import edu.mayo.bmi.dictionary.MetaDataHit; +import edu.mayo.bmi.lookup.phrasebuilder.PhraseBuilder; +import edu.mayo.bmi.lookup.vo.LookupAnnotation; +import edu.mayo.bmi.lookup.vo.LookupHit; +import edu.mayo.bmi.lookup.vo.LookupToken; + +/** + * OVERVIEW: Each LookupToken is fed into a "first token" Dictionary. A + * hit indicates an anchor and the window around this anchor is based on + * context. This hit also contains all the presentations from the Dictionary + * where the "first token" is contained. + * + * The window is determined by finding the largest overlapping context window + * annotation. Permutations of LookupTokens found within this window are used to + * match against the presentations found earlier. If context window annotations + * are not provided, a fixed window is used based on the specified max + * permutation level. + * + * OPTIONAL CONTEXT: context window annotations + * + * @author Mayo Clinic + */ +public class FirstTokenPermutationImpl implements LookupAlgorithm +{ + // LOG4J logger based on class name + private Logger iv_logger = Logger.getLogger(getClass().getName()); + + /** + * Key value for context map. Value is expected to be a List of + * LookupAnnotation objects in sorted order. + */ + public static final String CTX_KEY_WINDOW_ANNOTATIONS = "WINDOW_ANNOTATIONS"; + + /** + * Key value for LookupToken attribute. Value is expected to be either TRUE + * or FALSE. This indicates whether to use this token for a "first token" + * lookup or not. This is optional. + */ + public static final String LT_KEY_USE_FOR_LOOKUP = "USE_FOR_LOOKUP"; + + private DictionaryEngine iv_firstTokenDictEngine; + private PhraseBuilder iv_phrBuilder; + + private int iv_maxPermutationLevel; + // key = level Integer, value = Permutation list + private Map iv_permCacheMap = new HashMap(); + + private String[] iv_textMetaFieldNames; + + /** + * Constructor + * + * @param firstTokenDictEngine + * Dictionary that is indexed against first tokens. + * @param phraseBuilder + * Builds phrases to match against Dictionary. + * @param textMetaFieldNames + * MetaFieldNames used to extract presentations. + * @param maxPermutationLevel + * Max permutation Level allowed. + */ + public FirstTokenPermutationImpl(DictionaryEngine firstTokenDictEngine, + PhraseBuilder phraseBuilder, String textMetaFieldNames[], + int maxPermutationLevel) + { + iv_firstTokenDictEngine = firstTokenDictEngine; + iv_phrBuilder = phraseBuilder; + iv_textMetaFieldNames = textMetaFieldNames; + + iv_maxPermutationLevel = maxPermutationLevel; + for (int i = 0; i <= maxPermutationLevel; i++) + { + Integer level = new Integer(i); + List permList = PermutationUtil.getPermutationList(i); + iv_permCacheMap.put(level, permList); + } + } + + /** + * Implementation of algorithm. + */ + public Collection lookup(List ltList, Map ctxMap) throws Exception + { + // setup optional window context data + boolean useWindowAnnots = false; + List wAnnotList = getWindowAnnotations(ctxMap); + if (wAnnotList.size() > 0) + { + useWindowAnnots = true; + } + Map wStartOffsetMap = getStartOffsetMap(wAnnotList, true); + Map wEndOffsetMap = getEndOffsetMap(wAnnotList, true); + + Map ltListIndexMap = getListIndexMap(ltList); + Map ltStartOffsetMap = getStartOffsetMap(ltList, true); + Map ltEndOffsetMap = getEndOffsetMap(ltList, true); + + List lhList = new ArrayList(); + for (int ltIdx = 0; ltIdx < ltList.size(); ltIdx++) + { + LookupToken lt = (LookupToken) ltList.get(ltIdx); + + Boolean useForLookup = Boolean.valueOf(lt.getStringAttribute(LT_KEY_USE_FOR_LOOKUP)); + + if ((useForLookup == null) || (useForLookup.booleanValue())) + { + Collection mdhCol = getFirstTokenHits(lt); + + if ((mdhCol != null) && (mdhCol.size() > 0)) + { + int wEndOffset = -1; + if (useWindowAnnots) + { + // get the largest overlapping window annotation + LookupAnnotation wAnnot = getLargestWindowAnnotation( + ltIdx, + lt, + ltStartOffsetMap, + ltEndOffsetMap, + ltListIndexMap, + wStartOffsetMap, + wEndOffsetMap); + if (wAnnot != null) + { + wEndOffset = wAnnot.getEndOffset(); + } + } + if (wEndOffset == -1) + { + iv_logger.debug("Window size set to max perm level."); + wEndOffset = getFixedWindowEndOffset(ltIdx, lt, ltList); + } + + List endLookupTokenList = getLookupTokenList( + wEndOffset, + ltEndOffsetMap, + false); + LookupToken endLookupToken = (LookupToken) endLookupTokenList.get(endLookupTokenList.size() - 1); + + int startTokenIdx = ltIdx; + int endTokenIdx = ((Integer) ltListIndexMap.get(endLookupToken)).intValue(); + + // list of LookupToken objects bound by the window + List wLookupTokenList = ltList.subList( + startTokenIdx, + endTokenIdx + 1); + + // use permutation algorithm to find any hits inside the window + Collection lhCol = getLookupHits( + mdhCol, + wLookupTokenList, + new Integer(ltIdx - startTokenIdx)); + + lhList.addAll(lhCol); + } + } + } + + return lhList; + } + + private Collection getLookupHits( + Collection mdhCol, + List wLookupTokenList, + Integer firstTokenIndex) throws Exception + { + if ((wLookupTokenList.size() - 1) > iv_maxPermutationLevel) + { + iv_logger.debug("Beyond permutation cache size."); + return new ArrayList(); + } + + // build a list of index values (excludes index of first token) + List idxList = new ArrayList(); + for (int i = 0; i < wLookupTokenList.size(); i++) + { + if (i != firstTokenIndex.intValue()) + { + idxList.add(new Integer(i)); + } + } + + Collection permCol = (Collection) iv_permCacheMap.get(new Integer( + idxList.size())); + + List lhList = new ArrayList(); + + Map mdhMap = new HashMap(); + Iterator mdhItr = mdhCol.iterator(); + while (mdhItr.hasNext()) + { + MetaDataHit mdh = (MetaDataHit) mdhItr.next(); + for (int i = 0; i < iv_textMetaFieldNames.length; i++) + { + String text = mdh.getMetaFieldValue(iv_textMetaFieldNames[i]); + if (text != null) + { + text = text.toLowerCase(); + Set mdhSet = (Set) mdhMap.get(text); + if (mdhSet == null) + { + mdhSet = new HashSet(); + } + mdhSet.add(mdh); + mdhMap.put(text, mdhSet); + } + else + { + if (iv_logger.isDebugEnabled()) + { + iv_logger.debug("MetaField " + + iv_textMetaFieldNames[i] + + " contains no data."); + } + } + } + } + + LookupToken firstWordLookupToken = (LookupToken) wLookupTokenList.get(firstTokenIndex.intValue()); + + Iterator permItr = permCol.iterator(); + while (permItr.hasNext()) + { + // convert permutation idx back into LookupTokens + List tempList = new ArrayList(); + List permutation = (List) permItr.next(); + Iterator idxItr = permutation.iterator(); + while (idxItr.hasNext()) + { + int idx = ((Integer) idxItr.next()).intValue(); + if (idx <= firstTokenIndex.intValue()) + { + idx--; + } + LookupToken lt = (LookupToken) wLookupTokenList.get(idx); + tempList.add(lt); + } + + List singleTokenList = new ArrayList(); + singleTokenList.add(firstWordLookupToken); + String[] fwPerms = iv_phrBuilder.getPhrases(singleTokenList); + + String[] phrArr = iv_phrBuilder.getPhrases(tempList); + for (int i = 0; i < phrArr.length; i++) + { + for (int fwPermIdx = 0; fwPermIdx < fwPerms.length; fwPermIdx++) + { + StringBuffer phraseSB = new StringBuffer(); + phraseSB.append(fwPerms[fwPermIdx]); + phraseSB.append(' '); + phraseSB.append(phrArr[i]); + String phrase = phraseSB.toString().trim().toLowerCase(); + Set mdhSet = (Set) mdhMap.get(phrase); + if (mdhSet != null) + { + Iterator mdhIterator = mdhSet.iterator(); + while (mdhIterator.hasNext()) + { + MetaDataHit mdh = (MetaDataHit) mdhIterator.next(); + // figure out start and end offsets + Collections.sort(permutation); + + int startOffset; + if (permutation.size() > 0) + { + int firstIdx = ((Integer) permutation.get(0)).intValue(); + if (firstIdx <= firstTokenIndex.intValue()) + { + firstIdx--; + } + LookupToken lt = (LookupToken) wLookupTokenList.get(firstIdx); + if (lt.getStartOffset() < firstWordLookupToken.getStartOffset()) + { + startOffset = lt.getStartOffset(); + } + else + { + startOffset = firstWordLookupToken.getStartOffset(); + } + } + else + { + startOffset = firstWordLookupToken.getStartOffset(); + } + + int endOffset; + if (permutation.size() > 0) + { + int lastIdx = ((Integer) permutation.get(permutation.size() - 1)).intValue(); + if (lastIdx <= firstTokenIndex.intValue()) + { + lastIdx--; + } + LookupToken lt = (LookupToken) wLookupTokenList.get(lastIdx); + if (lt.getEndOffset() > firstWordLookupToken.getEndOffset()) + { + endOffset = lt.getEndOffset(); + } + else + { + endOffset = firstWordLookupToken.getEndOffset(); + } + } + else + { + endOffset = firstWordLookupToken.getEndOffset(); + } + + LookupHit lh = new LookupHit( + mdh, + startOffset, + endOffset); + + lhList.add(lh); + } + } + } + } + } + return lhList; + } + + /** + * Extracts the list of LookupAnnotation objects representing noun phrases + * from the context map. + * + * @param contextMap + * @return + */ + private List getWindowAnnotations(Map contextMap) + { + List list = (List) contextMap.get(CTX_KEY_WINDOW_ANNOTATIONS); + if ((list == null) || (list.size() == 0)) + { + iv_logger.debug("No context window annotations."); + return new ArrayList(); + } + return list; + } + + /** + * Determines the number of ListTokens are contained within the specified + * start and end offsets; + * + * @param ltStartOffsetMap + * @param ltEndOffsetMap + * @param ltListIndexMap + * @param startOffset + * @param endOffset + * @return + */ + private int getNumberOfListTokens( + Map ltStartOffsetMap, + Map ltEndOffsetMap, + Map ltListIndexMap, + int startOffset, + int endOffset) + { + List startLookupTokenList = getLookupTokenList( + startOffset, + ltStartOffsetMap, + true); + List endLookupTokenList = getLookupTokenList( + endOffset, + ltEndOffsetMap, + false); + + if ((startLookupTokenList == null) || (endLookupTokenList == null)) + { + iv_logger.debug("Invalid window:" + startOffset + "," + endOffset); + return -1; + } + LookupToken startLookupToken = (LookupToken) startLookupTokenList.get(0); + Integer startIdx = (Integer) ltListIndexMap.get(startLookupToken); + + LookupToken endLookupToken = (LookupToken) endLookupTokenList.get(endLookupTokenList.size() - 1); + Integer endIdx = (Integer) ltListIndexMap.get(endLookupToken); + + return endIdx.intValue() - startIdx.intValue() + 1; + } + + /** + * Attempts to get a list of LookupToken objects at the specified offset. If + * there are none, this method attempts to try nearby offsets based on the + * traversal direction. + * + * @param offset + * @param ltOffsetMap + * @param traverseRight + * @return + */ + private List getLookupTokenList( + int offset, + Map ltOffsetMap, + boolean traverseRight) + { + // first attempt the original offset, which will be the case most of the + // time + List lookupTokenList = (List) ltOffsetMap.get(new Integer(offset)); + if (lookupTokenList != null) + { + return lookupTokenList; + } + else + { + // otherwise traverse some nearby offsets and attempt to find a + // token + + // TODO hardcoded max offset window is 10 char + final int offsetWindow = 10; + + // build list of offsets to try + List offsetList = new ArrayList(); + if (traverseRight) + { + int max = offset + offsetWindow; + for (int i = offset; i <= max; i++) + { + offsetList.add(new Integer(i)); + } + } + else + { + int min = offset - offsetWindow; + for (int i = offset; i >= min; i--) + { + offsetList.add(new Integer(i)); + } + } + + Iterator offsetItr = offsetList.iterator(); + while (offsetItr.hasNext()) + { + Integer tempOffset = (Integer) offsetItr.next(); + lookupTokenList = (List) ltOffsetMap.get(tempOffset); + if (lookupTokenList != null) + { + return lookupTokenList; + } + } + } + // no tokens in window + return null; + } + + /** + * Determines the largest overlapping window annotation for the specified + * LookupToken. + * + * @param lt + * @param wStartOffsetMap + * @param wEndOffsetMap + * @return + */ + private LookupAnnotation getLargestWindowAnnotation( + int tokenIdx, + LookupToken lt, + Map ltStartOffsetMap, + Map ltEndOffsetMap, + Map ltListIndexMap, + Map wStartOffsetMap, + Map wEndOffsetMap) + { + Set startCandidateSet = new HashSet(); + Set endCandidateSet = new HashSet(); + + Iterator startItr = wStartOffsetMap.keySet().iterator(); + while (startItr.hasNext()) + { + Integer startOffset = (Integer) startItr.next(); + if (startOffset.intValue() <= lt.getStartOffset()) + { + List wAnnotList = (List) wStartOffsetMap.get(startOffset); + startCandidateSet.addAll(wAnnotList); + } + } + + Iterator endItr = wEndOffsetMap.keySet().iterator(); + while (endItr.hasNext()) + { + Integer endOffset = (Integer) endItr.next(); + if (endOffset.intValue() >= lt.getEndOffset()) + { + List wAnnotList = (List) wEndOffsetMap.get(endOffset); + endCandidateSet.addAll(wAnnotList); + } + } + + // union to get window annotations that are overlapping with LookupToken + startCandidateSet.retainAll(endCandidateSet); + + // find largest overlapping window annotation + LookupAnnotation largestWindowAnnot = null; + Iterator laItr = startCandidateSet.iterator(); + while (laItr.hasNext()) + { + LookupAnnotation tempLookupAnnot = (LookupAnnotation) laItr.next(); + if ((largestWindowAnnot == null) + || (tempLookupAnnot.getLength() > largestWindowAnnot.getLength())) + { + // now see if we can handle the size of this window (permutation + // wise) + int ltCount = getNumberOfListTokens( + ltStartOffsetMap, + ltEndOffsetMap, + ltListIndexMap, + tempLookupAnnot.getStartOffset(), + tempLookupAnnot.getEndOffset()); + + if ((ltCount <= iv_maxPermutationLevel) && (ltCount > 0)) + { + largestWindowAnnot = tempLookupAnnot; + } + else + { + if (iv_logger.isDebugEnabled()) + { + iv_logger.debug("Window size of " + + ltCount + + " exceeds the max permutation level of " + + iv_maxPermutationLevel + + "."); + } + } + } + } + + return largestWindowAnnot; + } + + private int getFixedWindowEndOffset( + int tokenIdx, + LookupToken lt, + List ltList) + { + int fixedEndOffset = 0; + + for (int i = tokenIdx; (i < tokenIdx + iv_maxPermutationLevel) + && (i < ltList.size()); i++) + { + LookupToken tempLookupToken = (LookupToken) ltList.get(i); + if (tempLookupToken != null) + { + fixedEndOffset = tempLookupToken.getEndOffset(); + } + } + return fixedEndOffset; + } + + /** + * Creates a map that binds an object from a list to its index position. + * + * @param list + * @return + */ + private Map getListIndexMap(List list) + { + Map m = new HashMap(); + + for (int i = 0; i < list.size(); i++) + { + Integer index = new Integer(i); + m.put(list.get(i), index); + } + + return m; + } + + /** + * Creates a map that uses the start offset to index the LookupAnnotation + * objects. If multiple LookupAnnotations can exist at the same start + * offset, then hasMultiples=true and the values with be a List of + * LookupAnnotation objects at that offset. + * + * @param lookupAnnotList + * @param hasMultiples + * @return + */ + private Map getStartOffsetMap(List lookupAnnotList, boolean hasMultiples) + { + Map m = new HashMap(); + + Iterator laItr = lookupAnnotList.iterator(); + while (laItr.hasNext()) + { + LookupAnnotation la = (LookupAnnotation) laItr.next(); + Integer key = new Integer(la.getStartOffset()); + if (hasMultiples) + { + List list = (List) m.get(key); + if (list == null) + { + list = new ArrayList(); + } + list.add(la); + m.put(key, list); + } + else + { + m.put(key, la); + } + } + + return m; + } + + /** + * Creates a map that uses the end offset to index the LookupAnnotation + * objects. If multiple LookupAnnotations can exist at the end start offset, + * then hasMultiples=true and the values with be a List of LookupAnnotation + * objects at that offset. + * + * @param lookupAnnotList + * @param hasMultiples + * @return + */ + private Map getEndOffsetMap(List lookupAnnotList, boolean hasMultiples) + { + Map m = new HashMap(); + + Iterator laItr = lookupAnnotList.iterator(); + while (laItr.hasNext()) + { + LookupAnnotation la = (LookupAnnotation) laItr.next(); + Integer key = new Integer(la.getEndOffset()); + if (hasMultiples) + { + List list = (List) m.get(key); + if (list == null) + { + list = new ArrayList(); + } + list.add(la); + m.put(key, list); + } + else + { + m.put(key, la); + } + } + + return m; + } + + /** + * Gets the hits for the specified LookupToken. This uses the first token Dictionary. + * + * @param firstLookupToken + * @return + * @throws Exception + */ + private Collection getFirstTokenHits(LookupToken firstLookupToken) + throws Exception + { + List singleLtList = new ArrayList(); + singleLtList.add(firstLookupToken); + + String[] phrases = iv_phrBuilder.getPhrases(singleLtList); + + Collection mdhCol = new ArrayList(); + for (int i = 0; i < phrases.length; i++) + { + Collection curMdhCol = iv_firstTokenDictEngine.metaLookup(phrases[i]); + + if (curMdhCol.size() > 0) + { + mdhCol.addAll(curMdhCol); + } + } + return mdhCol; + } +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/LookupAlgorithm.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/dictionary%20lookup/src/edu/mayo/bmi/lookup/algorithms/LookupAlgorithm.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/LookupAlgorithm.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/LookupAlgorithm.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,31 +14,31 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.lookup.algorithms; - -import java.util.Collection; -import java.util.List; -import java.util.Map; - -/** - * Generic interface to specific lookup algorithm implementations. - * - * @author Mayo Clinic - */ -public interface LookupAlgorithm -{ - /** - * Lookup the given text specified via LookupToken objects. Any hits will be - * returned as a collection of LookupHit objects. - * - * @param lookupTokenList - * List of LookupTokens, must be sorted. - * @param contextMap - * Map where key=Impl specific String object and value=List of - * LookupAnnotation objects - * @return Collection of LookupHits. - * @throws Exception - */ - public Collection lookup(List lookupTokenList, Map contextMap) - throws Exception; -} +package edu.mayo.bmi.lookup.algorithms; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +/** + * Generic interface to specific lookup algorithm implementations. + * + * @author Mayo Clinic + */ +public interface LookupAlgorithm +{ + /** + * Lookup the given text specified via LookupToken objects. Any hits will be + * returned as a collection of LookupHit objects. + * + * @param lookupTokenList + * List of LookupTokens, must be sorted. + * @param contextMap + * Map where key=Impl specific String object and value=List of + * LookupAnnotation objects + * @return Collection of LookupHits. + * @throws Exception + */ + public Collection lookup(List lookupTokenList, Map contextMap) + throws Exception; +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/PermutationUtil.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/dictionary%20lookup/src/edu/mayo/bmi/lookup/algorithms/PermutationUtil.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/PermutationUtil.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/algorithms/PermutationUtil.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,220 +14,220 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.lookup.algorithms; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - -/** - * @author Mayo Clinic - */ -public class PermutationUtil -{ - /** - * Gets all permutations for the given level and all sub-levels. - * - * @param maxLevel - */ - public static List getPermutationList(int maxLevel) - { - List permList = new ArrayList(); - for (int levelIdx = maxLevel; levelIdx >= 0; levelIdx--) - { - // contains ALL index values - List baseNumList = new ArrayList(); - for (int j = 1; j <= levelIdx; j++) - { - baseNumList.add(new Integer(j)); - } - - Collection numListCol = new ArrayList(); - if (levelIdx != maxLevel) - { - numListCol.addAll(getNumLists(maxLevel, baseNumList)); - } - else - { - numListCol.add(baseNumList); - } - - Iterator numListItr = numListCol.iterator(); - while (numListItr.hasNext()) - { - List numList = (List) numListItr.next(); - Collection pCol = PermutationUtil - .getLinearPermutations(numList); - Iterator pItr = pCol.iterator(); - while (pItr.hasNext()) - { - List permutation = (List) pItr.next(); - permList.add(permutation); - } - } - - if (levelIdx == 0) - { - permList.add(new ArrayList()); - } - } - - return permList; - } - - private static Collection getNumLists(int maxLevel, List baseNumList) - { - Collection numListCol = new ArrayList(); - buildPermutations(maxLevel, baseNumList, numListCol, new ArrayList(), 0); - filterNonIncreasingLists(numListCol); - return numListCol; - } - - /** - * Filters the number lists such that only lists with increasing numbers are - * kept. - * - * @param numListCol - */ - private static void filterNonIncreasingLists(Collection numListCol) - { - Set removalSet = new HashSet(); - - Iterator numListItr = numListCol.iterator(); - while (numListItr.hasNext()) - { - List numList = (List) numListItr.next(); - Integer largestNum = null; - Iterator numItr = numList.iterator(); - while (numItr.hasNext()) - { - Integer num = (Integer) numItr.next(); - if (largestNum == null) - { - largestNum = num; - } - else - { - int comparison = largestNum.compareTo(num); - if (comparison == 1) - { - removalSet.add(numList); - } - else - { - largestNum = num; - } - } - } - } - numListCol.removeAll(removalSet); - } - - /** - * Recursively builds permutations of numbers specified by the base num - * list. This includes permutations of these numbers with few items than the - * original list. - * - * @param maxLevel - * @param baseNumList - * @param numListCol - * @param residualList - * @param residualCount - */ - private static void buildPermutations( - int maxLevel, - List baseNumList, - Collection numListCol, - List residualList, - int residualCount) - { - if (residualCount > baseNumList.size()) - { - return; - } - else if (residualCount == baseNumList.size()) - { - numListCol.add(new ArrayList(residualList)); - return; - } - else - { - int num = ((Integer) baseNumList.get(residualCount)).intValue(); - residualCount++; - for (int i = num; i <= maxLevel; i++) - { - List tempList = new ArrayList(residualList); - if (!tempList.contains(new Integer(i))) - { - tempList.add(new Integer(i)); - buildPermutations( - maxLevel, - baseNumList, - numListCol, - tempList, - residualCount); - } - } - } - } - - /** - * Gets a collection of lists, each list represents a single permutation. - * This permutation is composed of Integer objects in defined order. - * - * @param level - * @return - */ - public static Collection getLinearPermutations(List numList) - { - Collection permutations = new ArrayList(); - getLinearPermutations(permutations, new ArrayList(), numList); - return permutations; - } - - /** - * Recurisvely builds permutations from the number list. The size of the - * permutations remains constant. - * - * @param permutations - * @param plusList - * @param numList - */ - private static void getLinearPermutations( - Collection permutations, - List plusList, - List numList) - { - Iterator numItr = numList.iterator(); - while (numItr.hasNext()) - { - Integer num = (Integer) numItr.next(); - - List subList = new ArrayList(); - subList.addAll(numList); - subList.remove(num); - - plusList.add(num); - - if (subList.size() > 0) - { - getLinearPermutations(permutations, plusList, subList); - } - else - { - List permutation = new ArrayList(); - for (int i = 0; i < plusList.size(); i++) - { - Integer n = (Integer) plusList.get(i); - permutation.add(n); - } - permutations.add(permutation); - } - - plusList.remove(num); - } - } -} \ No newline at end of file +package edu.mayo.bmi.lookup.algorithms; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +/** + * @author Mayo Clinic + */ +public class PermutationUtil +{ + /** + * Gets all permutations for the given level and all sub-levels. + * + * @param maxLevel + */ + public static List getPermutationList(int maxLevel) + { + List permList = new ArrayList(); + for (int levelIdx = maxLevel; levelIdx >= 0; levelIdx--) + { + // contains ALL index values + List baseNumList = new ArrayList(); + for (int j = 1; j <= levelIdx; j++) + { + baseNumList.add(new Integer(j)); + } + + Collection numListCol = new ArrayList(); + if (levelIdx != maxLevel) + { + numListCol.addAll(getNumLists(maxLevel, baseNumList)); + } + else + { + numListCol.add(baseNumList); + } + + Iterator numListItr = numListCol.iterator(); + while (numListItr.hasNext()) + { + List numList = (List) numListItr.next(); + Collection pCol = PermutationUtil + .getLinearPermutations(numList); + Iterator pItr = pCol.iterator(); + while (pItr.hasNext()) + { + List permutation = (List) pItr.next(); + permList.add(permutation); + } + } + + if (levelIdx == 0) + { + permList.add(new ArrayList()); + } + } + + return permList; + } + + private static Collection getNumLists(int maxLevel, List baseNumList) + { + Collection numListCol = new ArrayList(); + buildPermutations(maxLevel, baseNumList, numListCol, new ArrayList(), 0); + filterNonIncreasingLists(numListCol); + return numListCol; + } + + /** + * Filters the number lists such that only lists with increasing numbers are + * kept. + * + * @param numListCol + */ + private static void filterNonIncreasingLists(Collection numListCol) + { + Set removalSet = new HashSet(); + + Iterator numListItr = numListCol.iterator(); + while (numListItr.hasNext()) + { + List numList = (List) numListItr.next(); + Integer largestNum = null; + Iterator numItr = numList.iterator(); + while (numItr.hasNext()) + { + Integer num = (Integer) numItr.next(); + if (largestNum == null) + { + largestNum = num; + } + else + { + int comparison = largestNum.compareTo(num); + if (comparison == 1) + { + removalSet.add(numList); + } + else + { + largestNum = num; + } + } + } + } + numListCol.removeAll(removalSet); + } + + /** + * Recursively builds permutations of numbers specified by the base num + * list. This includes permutations of these numbers with few items than the + * original list. + * + * @param maxLevel + * @param baseNumList + * @param numListCol + * @param residualList + * @param residualCount + */ + private static void buildPermutations( + int maxLevel, + List baseNumList, + Collection numListCol, + List residualList, + int residualCount) + { + if (residualCount > baseNumList.size()) + { + return; + } + else if (residualCount == baseNumList.size()) + { + numListCol.add(new ArrayList(residualList)); + return; + } + else + { + int num = ((Integer) baseNumList.get(residualCount)).intValue(); + residualCount++; + for (int i = num; i <= maxLevel; i++) + { + List tempList = new ArrayList(residualList); + if (!tempList.contains(new Integer(i))) + { + tempList.add(new Integer(i)); + buildPermutations( + maxLevel, + baseNumList, + numListCol, + tempList, + residualCount); + } + } + } + } + + /** + * Gets a collection of lists, each list represents a single permutation. + * This permutation is composed of Integer objects in defined order. + * + * @param level + * @return + */ + public static Collection getLinearPermutations(List numList) + { + Collection permutations = new ArrayList(); + getLinearPermutations(permutations, new ArrayList(), numList); + return permutations; + } + + /** + * Recurisvely builds permutations from the number list. The size of the + * permutations remains constant. + * + * @param permutations + * @param plusList + * @param numList + */ + private static void getLinearPermutations( + Collection permutations, + List plusList, + List numList) + { + Iterator numItr = numList.iterator(); + while (numItr.hasNext()) + { + Integer num = (Integer) numItr.next(); + + List subList = new ArrayList(); + subList.addAll(numList); + subList.remove(num); + + plusList.add(num); + + if (subList.size() > 0) + { + getLinearPermutations(permutations, plusList, subList); + } + else + { + List permutation = new ArrayList(); + for (int i = 0; i < plusList.size(); i++) + { + Integer n = (Integer) plusList.get(i); + permutation.add(n); + } + permutations.add(permutation); + } + + plusList.remove(num); + } + } +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/phrasebuilder/PhraseBuilder.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/dictionary%20lookup/src/edu/mayo/bmi/lookup/phrasebuilder/PhraseBuilder.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/phrasebuilder/PhraseBuilder.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/phrasebuilder/PhraseBuilder.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,22 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.lookup.phrasebuilder; - -import java.util.List; - -/** - * Interface for building phrases from LookupToken objects. - * - * @author Mayo Clinic - */ -public interface PhraseBuilder -{ - /** - * Given a list of LookupTokens, one or more phrases are built. - * - * @param lookupTokenList List of LookupTokens. - * @return One or more String phrases. - */ - public String[] getPhrases(List lookupTokenList); -} +package edu.mayo.bmi.lookup.phrasebuilder; + +import java.util.List; + +/** + * Interface for building phrases from LookupToken objects. + * + * @author Mayo Clinic + */ +public interface PhraseBuilder +{ + /** + * Given a list of LookupTokens, one or more phrases are built. + * + * @param lookupTokenList List of LookupTokens. + * @return One or more String phrases. + */ + public String[] getPhrases(List lookupTokenList); +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/phrasebuilder/VariantPhraseBuilderImpl.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/dictionary%20lookup/src/edu/mayo/bmi/lookup/phrasebuilder/VariantPhraseBuilderImpl.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/phrasebuilder/VariantPhraseBuilderImpl.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/phrasebuilder/VariantPhraseBuilderImpl.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,146 +14,146 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.lookup.phrasebuilder; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - -import edu.mayo.bmi.lookup.vo.LookupToken; - -/** - * Builds phrases based on various variants of a LookupToken. For instance, a - * single LookupToken may have a spell corrected variant, abbreviation expansion - * variant, etc... - * - * @author Mayo Clinic - */ -public class VariantPhraseBuilderImpl implements PhraseBuilder -{ - private List iv_textExtractorList; - - /** - * Constructor - * - * @param variantAttrNames - * Key names of the variant attributes attached to the - * LookupToken objects. - * @param useOriginalText - * flag that determines whether to use the original text or not. - */ - public VariantPhraseBuilderImpl(String[] variantAttrNames, - boolean useOriginalText) - { - iv_textExtractorList = new ArrayList(); - - if (useOriginalText) - { - // use original text as a variant - iv_textExtractorList.add(new OriginalTextImpl()); - } - - // add variants - for (int i = 0; i < variantAttrNames.length; i++) - { - iv_textExtractorList.add(new AttributeTextImpl(variantAttrNames[i])); - } - } - - public String[] getPhrases(List lookupTokenList) - { - Set phraseSet = new HashSet(); - Iterator teItr = iv_textExtractorList.iterator(); - while (teItr.hasNext()) - { - TextExtractor te = (TextExtractor) teItr.next(); - - StringBuffer sb = new StringBuffer(); - LookupToken previousLt = null; - Iterator ltItr = lookupTokenList.iterator(); - while (ltItr.hasNext()) - { - LookupToken lt = (LookupToken) ltItr.next(); - String variant = te.getText(lt); - - if (variant == null) - { - variant = lt.getText(); - } - - if (previousLt != null) - { - // check delta between previous token and current token - // this delta represents whitespace between tokens - if (previousLt.getEndOffset() != lt.getStartOffset()) - { - // insert whitespace - sb.append(' '); - } - } - - sb.append(variant); - - previousLt = lt; - } - String phrase = sb.toString().trim(); - phraseSet.add(phrase); - } - - String[] phraseArr = new String[phraseSet.size()]; - phraseSet.toArray(phraseArr); - - return phraseArr; - } - - /** - * Common interface to extract text from a LookupToken. - * - * @author Mayo Clinic - */ - private interface TextExtractor - { - public String getText(LookupToken lt); - } - - /** - * Implementation that extracts text from the original text of a - * LookupToken. - * - * @author Mayo Clinic - */ - class OriginalTextImpl implements TextExtractor - { - public String getText(LookupToken lt) - { - return lt.getText(); - } - } - - /** - * Implementation that extracts text from an attribute of a LookupToken. - * - * @author Mayo Clinic - */ - class AttributeTextImpl implements TextExtractor - { - private String iv_varAttrName; - - /** - * Constructor - * - * @param varAttrName - */ - public AttributeTextImpl(String varAttrName) - { - iv_varAttrName = varAttrName; - } - - public String getText(LookupToken lt) - { - return lt.getStringAttribute(iv_varAttrName); - } - } -} \ No newline at end of file +package edu.mayo.bmi.lookup.phrasebuilder; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import edu.mayo.bmi.lookup.vo.LookupToken; + +/** + * Builds phrases based on various variants of a LookupToken. For instance, a + * single LookupToken may have a spell corrected variant, abbreviation expansion + * variant, etc... + * + * @author Mayo Clinic + */ +public class VariantPhraseBuilderImpl implements PhraseBuilder +{ + private List iv_textExtractorList; + + /** + * Constructor + * + * @param variantAttrNames + * Key names of the variant attributes attached to the + * LookupToken objects. + * @param useOriginalText + * flag that determines whether to use the original text or not. + */ + public VariantPhraseBuilderImpl(String[] variantAttrNames, + boolean useOriginalText) + { + iv_textExtractorList = new ArrayList(); + + if (useOriginalText) + { + // use original text as a variant + iv_textExtractorList.add(new OriginalTextImpl()); + } + + // add variants + for (int i = 0; i < variantAttrNames.length; i++) + { + iv_textExtractorList.add(new AttributeTextImpl(variantAttrNames[i])); + } + } + + public String[] getPhrases(List lookupTokenList) + { + Set phraseSet = new HashSet(); + Iterator teItr = iv_textExtractorList.iterator(); + while (teItr.hasNext()) + { + TextExtractor te = (TextExtractor) teItr.next(); + + StringBuffer sb = new StringBuffer(); + LookupToken previousLt = null; + Iterator ltItr = lookupTokenList.iterator(); + while (ltItr.hasNext()) + { + LookupToken lt = (LookupToken) ltItr.next(); + String variant = te.getText(lt); + + if (variant == null) + { + variant = lt.getText(); + } + + if (previousLt != null) + { + // check delta between previous token and current token + // this delta represents whitespace between tokens + if (previousLt.getEndOffset() != lt.getStartOffset()) + { + // insert whitespace + sb.append(' '); + } + } + + sb.append(variant); + + previousLt = lt; + } + String phrase = sb.toString().trim(); + phraseSet.add(phrase); + } + + String[] phraseArr = new String[phraseSet.size()]; + phraseSet.toArray(phraseArr); + + return phraseArr; + } + + /** + * Common interface to extract text from a LookupToken. + * + * @author Mayo Clinic + */ + private interface TextExtractor + { + public String getText(LookupToken lt); + } + + /** + * Implementation that extracts text from the original text of a + * LookupToken. + * + * @author Mayo Clinic + */ + class OriginalTextImpl implements TextExtractor + { + public String getText(LookupToken lt) + { + return lt.getText(); + } + } + + /** + * Implementation that extracts text from an attribute of a LookupToken. + * + * @author Mayo Clinic + */ + class AttributeTextImpl implements TextExtractor + { + private String iv_varAttrName; + + /** + * Constructor + * + * @param varAttrName + */ + public AttributeTextImpl(String varAttrName) + { + iv_varAttrName = varAttrName; + } + + public String getText(LookupToken lt) + { + return lt.getStringAttribute(iv_varAttrName); + } + } +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/vo/LookupAnnotation.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/dictionary%20lookup/src/edu/mayo/bmi/lookup/vo/LookupAnnotation.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/vo/LookupAnnotation.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/dictionary lookup/src/edu/mayo/bmi/lookup/vo/LookupAnnotation.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,56 +14,56 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.lookup.vo; - -/** - * Value object that models a text annotation. - * - * @author Mayo Clinic - */ -public interface LookupAnnotation -{ - /** - * Gets the start offset. - * - * @return - */ - public int getStartOffset(); - - /** - * Gets the end offset. - * - * @return - */ - public int getEndOffset(); - - /** - * Gets the length of this annotation based on offsets. - * - * @return - */ - public int getLength(); - - /** - * Gets the text. - * - * @return - */ - public String getText(); - - /** - * Adds an attribute that may be used for filtering. - * - * @param attrKey - * @param attrVal - */ - public void addStringAttribute(String attrKey, String attrVal); - - /** - * Gets an attribute. - * - * @param attrKey - * @return - */ - public String getStringAttribute(String attrKey); -} \ No newline at end of file +package edu.mayo.bmi.lookup.vo; + +/** + * Value object that models a text annotation. + * + * @author Mayo Clinic + */ +public interface LookupAnnotation +{ + /** + * Gets the start offset. + * + * @return + */ + public int getStartOffset(); + + /** + * Gets the end offset. + * + * @return + */ + public int getEndOffset(); + + /** + * Gets the length of this annotation based on offsets. + * + * @return + */ + public int getLength(); + + /** + * Gets the text. + * + * @return + */ + public String getText(); + + /** + * Adds an attribute that may be used for filtering. + * + * @param attrKey + * @param attrVal + */ + public void addStringAttribute(String attrKey, String attrVal); + + /** + * Gets an attribute. + * + * @param attrKey + * @return + */ + public String getStringAttribute(String attrKey); +}