Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 0DB52104E7 for ; Fri, 6 Dec 2013 16:14:58 +0000 (UTC) Received: (qmail 5409 invoked by uid 500); 6 Dec 2013 16:14:57 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 4955 invoked by uid 500); 6 Dec 2013 16:14:55 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 4710 invoked by uid 99); 6 Dec 2013 16:14:54 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 06 Dec 2013 16:14:54 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 06 Dec 2013 16:14:47 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 7973E238883D; Fri, 6 Dec 2013 16:14:25 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1548577 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion: medfacts/cleartk/ medfacts/cleartk/extractors/ train/ Date: Fri, 06 Dec 2013 16:14:25 -0000 To: commits@ctakes.apache.org From: swu@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20131206161425.7973E238883D@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: swu Date: Fri Dec 6 16:14:24 2013 New Revision: 1548577 URL: http://svn.apache.org/r1548577 Log: ctakes-assertion with frustratingly easy domain adaptation and associated tests. assertion-evaluation will be updated separately Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityFedaCleartkAnalysisEngine.java (with props) ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/ExtractorListFeatureFunctionConverter.java (with props) ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/FedaFeatureFunction.java (with props) ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTests.java (with props) ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTrain.java (with props) Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java?rev=1548577&r1=1548576&r2=1548577&view=diff ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java (original) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java Fri Dec 6 16:14:24 2013 @@ -18,14 +18,20 @@ */ package org.apache.ctakes.assertion.medfacts.cleartk; +import java.io.File; import java.net.URI; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Random; +import org.apache.commons.io.FilenameUtils; import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection; +import org.apache.ctakes.assertion.medfacts.cleartk.extractors.FedaFeatureFunction; import org.apache.ctakes.assertion.zoner.types.Zone; import org.apache.ctakes.typesystem.type.constants.CONST; import org.apache.ctakes.typesystem.type.structured.DocumentID; @@ -50,12 +56,21 @@ import org.cleartk.classifier.feature.ex import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor; import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor; import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor; +import org.cleartk.classifier.feature.function.FeatureFunctionExtractor; import org.uimafit.descriptor.ConfigurationParameter; import org.uimafit.factory.AnalysisEngineFactory; import org.uimafit.factory.ConfigurationParameterFactory; import org.uimafit.util.JCasUtil; //import org.chboston.cnlp.ctakes.relationextractor.ae.ModifierExtractorAnnotator; + + +import scala.actors.threadpool.Arrays; + +/** + * @author swu + * + */ public abstract class AssertionCleartkAnalysisEngine extends CleartkAnnotator { @@ -65,6 +80,10 @@ public abstract class AssertionCleartkAn public static int relationId; // counter for error logging + // additional parameter for domain adaptation + public static final String FILE_TO_DOMAIN_MAP = "mapTrainFileToDomain"; + + @ConfigurationParameter( name = PARAM_GOLD_VIEW_NAME, mandatory = false, @@ -108,6 +127,13 @@ public abstract class AssertionCleartkAn protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures"; + @ConfigurationParameter( + name = FILE_TO_DOMAIN_MAP, + mandatory = false, + description = "a map of filenames to their respective domains (i.e., directories that contain them)") + protected String fileDomainMap; + protected Map fileToDomain = new HashMap(); + protected String lastLabel; @@ -137,7 +163,10 @@ public abstract class AssertionCleartkAn protected List tokenCleartkExtractors; protected List entityFeatureExtractors; protected CleartkExtractor cuePhraseInWindowExtractor; - + + protected List featureFunctionExtractors; + protected FedaFeatureFunction ffDomainAdaptor; + protected FeatureSelection featureSelection; public abstract void setClassLabel(IdentifiedAnnotation entityMention, Instance instance) throws AnalysisEngineProcessException; @@ -151,6 +180,24 @@ public abstract class AssertionCleartkAn public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); + // Re-process the "directory" string for domains that were used in the data + if (null != fileDomainMap) { + String[] dirs = fileDomainMap.split("[;:]"); + for (String dir : dirs) { + + // TODO: normalize dir to real domainId + String domainId = normalizeToDomain(dir); + + File dataDir = new File(dir); + if (dataDir.listFiles()!=null) { + for (File f : dataDir.listFiles()) { + fileToDomain.put( FilenameUtils.removeExtension(f.getName()), domainId ); + } + // System.out.println(trainFiles.toString()); + } + } + } + if (this.isTraining() && this.goldViewName == null) { throw new IllegalArgumentException(PARAM_GOLD_VIEW_NAME + " must be defined during training"); } @@ -229,16 +276,29 @@ public abstract class AssertionCleartkAn // new CleartkExtractor.Bag(new CleartkExtractor.Preceding(10)), // new CleartkExtractor.Bag(new CleartkExtractor.Following(10)) ); - + + if (!fileToDomain.isEmpty()) { + // set up FeatureFunction for all the laggard, non-Extractor features + ffDomainAdaptor = new FedaFeatureFunction( new ArrayList(new HashSet(fileToDomain.values())) ); + } } @Override public void process(JCas jCas) throws AnalysisEngineProcessException { DocumentID documentId = JCasUtil.selectSingle(jCas, DocumentID.class); + String domainId = ""; + + if (documentId != null) { logger.debug("processing next doc: " + documentId.getDocumentID()); + + // set the domain to be FeatureFunction'ed into all extractors + if (!fileToDomain.isEmpty()) { + domainId = fileToDomain.get(documentId.getDocumentID()); + ffDomainAdaptor.setDomain(domainId); // if domain is not found, no warning -- just considers general domain + } } else { logger.warn("processing next doc (doc id is null)"); @@ -323,10 +383,14 @@ public abstract class AssertionCleartkAn instance.addAll(extractor.extract(identifiedAnnotationView, entityMention)); } */ - for (CleartkExtractor extractor : this.tokenCleartkExtractors) { - //instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityMention, sentence)); - instance.addAll(extractor.extract(identifiedAnnotationView, entityOrEventMention)); - } + + // only use extract this version if not doing domain adaptation + if (ffDomainAdaptor==null) { + for (CleartkExtractor extractor : this.tokenCleartkExtractors) { + //instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityMention, sentence)); + instance.addAll(extractor.extract(identifiedAnnotationView, entityOrEventMention)); + } + } // List cuePhraseFeatures = null; // cuePhraseInWindowExtractor.extract(jCas, entityOrEventMention); @@ -351,6 +415,14 @@ public abstract class AssertionCleartkAn // instance.add(new Feature("ClosestCue_Phrase", closestCue.getCuePhrase())); instance.add(new Feature("ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily())); instance.add(new Feature("ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory())); + + // add hack-ey domain adaptation to these hacked-in features + if (!fileToDomain.isEmpty() && ffDomainAdaptor!=null) { + instance.addAll(ffDomainAdaptor.apply(new Feature("ClosestCue_Word", closestCue.getCoveredText()))); + instance.addAll(ffDomainAdaptor.apply(new Feature("ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily()))); + instance.addAll(ffDomainAdaptor.apply(new Feature("ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory()))); + } + } } // if (cuePhraseFeatures != null && !cuePhraseFeatures.isEmpty()) @@ -365,18 +437,25 @@ public abstract class AssertionCleartkAn // 7/9/13 srh modified per tmiller so it's binary but not numeric feature //instance.add(new Feature("ENTITY_TYPE_" + entityOrEventMention.getTypeID())); instance.add(new Feature("ENTITY_TYPE_ANAT_SITE")); - } /* This hurts recall more than it helps precision + // add hack-ey domain adaptation to these hacked-in features + if (!fileToDomain.isEmpty() && ffDomainAdaptor!=null) { + instance.addAll(ffDomainAdaptor.apply(new Feature("ENTITY_TYPE_ANAT_SITE"))); + } + } + /* This hurts recall more than it helps precision else if (eemTypeId == CONST.NE_TYPE_ID_DRUG) { // 7/10 adding drug instance.add(new Feature("ENTITY_TYPE_DRUG")); } */ - for (SimpleFeatureExtractor extractor : this.entityFeatureExtractors) { - instance.addAll(extractor.extract(jCas, entityOrEventMention)); + // only extract these features if not doing domain adaptation + if (ffDomainAdaptor==null) { + for (SimpleFeatureExtractor extractor : this.entityFeatureExtractors) { + instance.addAll(extractor.extract(jCas, entityOrEventMention)); + } } - List zoneFeatures = extractZoneFeatures(coveringZoneMap, entityOrEventMention); if (zoneFeatures != null && !zoneFeatures.isEmpty()) { @@ -388,11 +467,19 @@ public abstract class AssertionCleartkAn for(Feature feat : feats){ if(feat.getName() != null && (feat.getName().startsWith("TreeFrag") || feat.getName().startsWith("WORD") || feat.getName().startsWith("NEG"))) continue; + if(feat.getName() != null && (feat.getName().contains("_TreeFrag") || feat.getName().contains("_WORD") || feat.getName().contains("_NEG"))) continue; if(feat.getValue() instanceof String){ feat.setValue(((String)feat.getValue()).toLowerCase()); } } + if (!fileToDomain.isEmpty() && ffDomainAdaptor!=null) { + for (FeatureFunctionExtractor extractor : this.featureFunctionExtractors) { + // TODO: extend to the case where the extractors take a different argument besides entityOrEventMention + instance.addAll(extractor.extract(jCas, entityOrEventMention)); + } + } + // grab the output label setClassLabel(entityOrEventMention, instance); @@ -445,7 +532,31 @@ public abstract class AssertionCleartkAn return desc; } +public Map getTrainFileToDomain() { + return fileToDomain; +} +public void setTrainFileToDomain(Map trainFileToDomain) { + this.fileToDomain = trainFileToDomain; +} + +/** Looks in the domain string (path) for meaningful corpus names + * @param dir + * @return + */ +public static String normalizeToDomain(String dir) { + // TODO: real normalization + String[] p = dir.split("/"); + List parts = Arrays.asList(p); + Collections.reverse(parts); + for (String part : parts) { + if ( part.toLowerCase().startsWith("test") || part.toLowerCase().startsWith("train") || part.toLowerCase().startsWith("dev") ) { + continue; + } + return part; + } + return dir; +} /* public static AnalysisEngineDescription getClassifierDescription(String modelFileName) Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityFedaCleartkAnalysisEngine.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityFedaCleartkAnalysisEngine.java?rev=1548577&view=auto ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityFedaCleartkAnalysisEngine.java (added) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityFedaCleartkAnalysisEngine.java Fri Dec 6 16:14:24 2013 @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.ctakes.assertion.medfacts.cleartk; + +import java.io.File; +import java.net.URI; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection; +import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection; +import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor; +import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveRightFragmentExtractor; +import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor; +import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ExtractorListFeatureFunctionConverter; +import org.apache.ctakes.assertion.medfacts.cleartk.extractors.FedaFeatureFunction; +import org.apache.ctakes.assertion.medfacts.cleartk.extractors.NegationDependencyFeatureExtractor; +import org.apache.ctakes.typesystem.type.constants.CONST; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.resource.ResourceInitializationException; +import org.cleartk.classifier.Instance; +import org.cleartk.classifier.feature.extractor.CleartkExtractor; +import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor; +import org.cleartk.classifier.feature.function.FeatureFunctionExtractor; +import org.uimafit.descriptor.ConfigurationParameter; + + +public class PolarityFedaCleartkAnalysisEngine extends PolarityCleartkAnalysisEngine { + + public static final String NEGATED = "NEGATED"; + public static final String NOT_NEGATED = "NOT_NEGATED"; + + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + probabilityOfKeepingADefaultExample = 1.0; //0.1; + + if(this.entityFeatureExtractors == null){ + this.entityFeatureExtractors = new ArrayList(); + } + this.entityFeatureExtractors.add(new NegationDependencyFeatureExtractor()); + this.entityFeatureExtractors.add(new ContextWordWindowExtractor("org/apache/ctakes/assertion/models/polarity.txt")); + this.entityFeatureExtractors.add(new AboveLeftFragmentExtractor("AL_Polarity","org/apache/ctakes/assertion/models/sharpPolarityFrags.txt")); + // this.entityFeatureExtractors.add(new AboveRightFragmentExtractor("AR_Polarity","org/apache/ctakes/assertion/models/sharpArPolarityFrags.txt")); + + initializeDomainAdaptation(); + + initializeFeatureSelection(); + + } + + @Override + public void setClassLabel(IdentifiedAnnotation entityOrEventMention, Instance instance) throws AnalysisEngineProcessException { + if (this.isTraining()) + { + String polarity = (entityOrEventMention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT) ? NEGATED : NOT_NEGATED; // "negated" : "present"; + this.lastLabel = polarity; + // downsampling. initialize probabilityOfKeepingADefaultExample to 1.0 for no downsampling + if (NEGATED.equals(polarity)) + { + logger.debug("TRAINING: " + polarity); + } + if (NOT_NEGATED.equals(polarity) + && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample) { + return; + } + instance.setOutcome(polarity); +// this.dataWriter.write(instance); + } else + { + String label = this.classifier.classify(instance.getFeatures()); + this.lastLabel = label; + int polarity = CONST.NE_POLARITY_NEGATION_ABSENT; + if (NOT_NEGATED.equals(label)) + { + polarity = CONST.NE_POLARITY_NEGATION_ABSENT; + } else if (NEGATED.equals(label)) + { + polarity = CONST.NE_POLARITY_NEGATION_PRESENT; + logger.debug(String.format("DECODING/EVAL: %s//%s [%d-%d] (%s)", label, polarity, entityOrEventMention.getBegin(), entityOrEventMention.getEnd(), entityOrEventMention.getClass().getName())); + } + entityOrEventMention.setPolarity(polarity); + } + } + public static FeatureSelection createFeatureSelection(double threshold) { + return new Chi2FeatureSelection(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false); + // return new MutualInformationFeatureSelection(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME); + } + + public static URI createFeatureSelectionURI(File outputDirectoryName) { + return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI(); + } + + private void initializeDomainAdaptation() { + // Do domain adaptation + featureFunctionExtractors = new ArrayList(); + // FedaFeatureFunction ff = new FedaFeatureFunction(new ArrayList(trainFileToDomain.values())); + featureFunctionExtractors.addAll(ExtractorListFeatureFunctionConverter.convert(contextFeatureExtractors, ffDomainAdaptor)); + featureFunctionExtractors.addAll(ExtractorListFeatureFunctionConverter.convert(tokenContextFeatureExtractors, ffDomainAdaptor)); + featureFunctionExtractors.addAll(ExtractorListFeatureFunctionConverter.convert(tokenCleartkExtractors, ffDomainAdaptor)); + featureFunctionExtractors.addAll(ExtractorListFeatureFunctionConverter.convert(entityFeatureExtractors, ffDomainAdaptor)); + featureFunctionExtractors.add(new FeatureFunctionExtractor(cuePhraseInWindowExtractor, ffDomainAdaptor)); + } + @Override + protected void initializeFeatureSelection() throws ResourceInitializationException { + if (featureSelectionThreshold == 0) { + this.featureSelection = null; + } else { + this.featureSelection = this.createFeatureSelection(this.featureSelectionThreshold); + +// if ( (new File(this.featureSelectionURI)).exists() ) { +// try { +// this.featureSelection.load(this.featureSelectionURI); +// } catch (IOException e) { +// throw new ResourceInitializationException(e); +// } +// } + } + } + +} Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityFedaCleartkAnalysisEngine.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/ExtractorListFeatureFunctionConverter.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/ExtractorListFeatureFunctionConverter.java?rev=1548577&view=auto ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/ExtractorListFeatureFunctionConverter.java (added) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/ExtractorListFeatureFunctionConverter.java Fri Dec 6 16:14:24 2013 @@ -0,0 +1,26 @@ +package org.apache.ctakes.assertion.medfacts.cleartk.extractors; + +import java.util.ArrayList; +import java.util.List; + +import org.cleartk.classifier.feature.extractor.CleartkExtractor; +import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor; +import org.cleartk.classifier.feature.function.FeatureFunction; +import org.cleartk.classifier.feature.function.FeatureFunctionExtractor; + +public class ExtractorListFeatureFunctionConverter { + public static List convert( List extractors, FeatureFunction ff ) { + + List featureFunctionExtractors = new ArrayList(); + if (null!=extractors) { + for (SimpleFeatureExtractor extractor : extractors) { + featureFunctionExtractors.add( + new FeatureFunctionExtractor(extractor,ff) + ); + } + } + + return featureFunctionExtractors; + } + +} Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/ExtractorListFeatureFunctionConverter.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/FedaFeatureFunction.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/FedaFeatureFunction.java?rev=1548577&view=auto ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/FedaFeatureFunction.java (added) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/FedaFeatureFunction.java Fri Dec 6 16:14:24 2013 @@ -0,0 +1,48 @@ +package org.apache.ctakes.assertion.medfacts.cleartk.extractors; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.cleartk.classifier.Feature; +import org.cleartk.classifier.feature.function.FeatureFunction; + +public class FedaFeatureFunction implements FeatureFunction { + + public static final String DOMAIN_ADAPTATION_ALGORITHM = "FEDA"; + List domainIds; + String currentDomain; + + public FedaFeatureFunction ( List domains ) { + domainIds = domains; + } + + /** + * @return replicate the feature for the current domain, the original is a "general" domain + */ + @Override + public List apply(Feature feature) { + Object featureValue = feature.getValue(); + + List fedaFeatures = new ArrayList(); + fedaFeatures.add(feature); + if (null==currentDomain) { return fedaFeatures; } + +// for (String domain : domainIds) { +// String featureName = Feature.createName(domain, DOMAIN_ADAPTATION_ALGORITHM, feature.getName()); + String featureName = Feature.createName(currentDomain, DOMAIN_ADAPTATION_ALGORITHM, feature.getName()); + + fedaFeatures.add( + new Feature( + featureName, + featureValue.toString() ) + ); +// } + return fedaFeatures; + } + + public void setDomain(String domain) { + currentDomain = domain; + } + +} Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/extractors/FedaFeatureFunction.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTests.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTests.java?rev=1548577&view=auto ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTests.java (added) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTests.java Fri Dec 6 16:14:24 2013 @@ -0,0 +1,128 @@ +package org.apache.ctakes.assertion.train; + +import java.io.File; +import java.util.ArrayList; +import java.util.Date; + +import org.apache.ctakes.assertion.eval.AssertionEvaluation; +import org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine; +import org.apache.ctakes.assertion.util.AssertionConst; + +import scala.actors.threadpool.Arrays; + + +/** + * For each assertion attribute (polarity, conditional, etc), run against the test directories + * for that attribute, using models that are under the models-dir. + * Note that this uses constants within {@link AssertionConst} for the directory names. + */ +public class PolarityDomainAdaptationTests { + + final static String RUN_ID = "feda_"; + + protected final static String SHARP_TEST = AssertionConst.DATA_DIR + "preprocessed_data/sharp/test"; + protected final static String I2B2_TEST = AssertionConst.DATA_DIR + "preprocessed_data/i2b2/test"; + protected final static String MIPACQ_TEST = AssertionConst.DATA_DIR + "preprocessed_data/mipacq/test"; + protected final static String NEGEX_TEST = AssertionConst.DATA_DIR + "preprocessed_data/negex"; + + public static void main(String[] args) throws Exception { + + AssertionEvaluation.useEvaluationLogFile = true; + AssertionEvaluation.evaluationLogFilePath = "eval/"+RUN_ID+new Date().toString().replaceAll(" ","_") + ".txt"; + + ArrayList testGrid = new ArrayList(); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_FEDA, NEGEX_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.I2B2_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.I2B2_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.I2B2_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.I2B2_FEDA, NEGEX_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.MIPACQ_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.MIPACQ_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.MIPACQ_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.MIPACQ_FEDA, NEGEX_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.NEGEX_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.NEGEX_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.NEGEX_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.NEGEX_FEDA, NEGEX_TEST)); // not valid + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_FEDA, MIPACQ_TEST)); // not meaningful + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_FEDA, NEGEX_TEST)); // not meaningful + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_MIPACQ_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_MIPACQ_FEDA, I2B2_TEST)); // not meaningful + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_MIPACQ_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_MIPACQ_FEDA, NEGEX_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_NEGEX_FEDA, SHARP_TEST)); // not meaningful + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_NEGEX_FEDA, I2B2_TEST)); //not meaningful + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_NEGEX_FEDA, MIPACQ_TEST)); // not meaningful + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_NEGEX_FEDA, NEGEX_TEST)); // not valid + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.I2B2_MIPACQ_NEGEX_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_MIPACQ_NEGEX_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_NEGEX_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_FEDA, NEGEX_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_NEGEX_FEDA, SHARP_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_NEGEX_FEDA, I2B2_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_NEGEX_FEDA, MIPACQ_TEST)); + testGrid.add(new TestPair(PolarityDomainAdaptationTrain.SHARP_I2B2_MIPACQ_NEGEX_FEDA, NEGEX_TEST)); //not valid + + + String attribute = "polarity"; + + for (TestPair oneTest : testGrid) { + ArrayList params = new ArrayList(); + + File instancef = new File("eval/instances_"+ + oneTest.model.substring(oneTest.model.lastIndexOf("/")+1)+"_"+ + AssertionCleartkAnalysisEngine.normalizeToDomain(oneTest.data)); + + params.add("--test-dir"); params.add(oneTest.data); + params.add("--models-dir"); params.add(oneTest.model); + String trainDomains = PolarityDomainAdaptationTrain.trainGrid.inverse().get(oneTest.model); + if (null == trainDomains) { continue; } + params.add("--train-dir"); params.add(trainDomains); // must list the train-dir in order to establish which domains + // params.add("--ytex-negation"); + // params.add("--evaluation-output-dir"); params.add(AssertionConst.evalOutputDir); + params.add("--test-only"); + params.add("--feda"); + params.add("--print-instances"); + // hack-y way to name this + params.add(instancef.getAbsolutePath()); + + // Build up an "ignore" string + for (String ignoreAttribute : AssertionConst.allAnnotationTypes) { + if (!ignoreAttribute.equals(attribute)) { + + if (ignoreAttribute.equals("historyOf")) { + ignoreAttribute = ignoreAttribute.substring(0, ignoreAttribute.length()-2); + } + + params.add("--ignore-" + ignoreAttribute); + } + } + String[] paramList = params.toArray(new String[]{}); + + System.out.println(Arrays.asList(paramList).toString()); + + // Run the actual assertion test on just one attribute + AssertionEvaluation.main( paramList ); + } + } + + + static class TestPair { + String model; + String data; + TestPair (String a, String b) { + model=a; + data=b; + } + } + + +} Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTests.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTrain.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTrain.java?rev=1548577&view=auto ============================================================================== --- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTrain.java (added) +++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTrain.java Fri Dec 6 16:14:24 2013 @@ -0,0 +1,88 @@ +package org.apache.ctakes.assertion.train; + +import java.util.ArrayList; +import java.util.Map.Entry; + +import org.apache.ctakes.assertion.eval.AssertionEvaluation; +import org.apache.ctakes.assertion.util.AssertionConst; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.google.common.collect.ImmutableBiMap; +/** + * For each assertion attribute (polarity, conditional, etc), train a model using the data + * in the training directories for that attribute, and store the model under the models-dir + * Note that this uses constants within {@link AssertionConst} for the directory names. + */ +public class PolarityDomainAdaptationTrain { + protected final static String SHARP_TRAIN = AssertionConst.DATA_DIR + "preprocessed_data/sharp/train"; + protected final static String I2B2_TRAIN = AssertionConst.DATA_DIR + "preprocessed_data/i2b2/train"; + protected final static String MIPACQ_TRAIN = AssertionConst.DATA_DIR + "preprocessed_data/mipacq/train"; + protected final static String NEGEX_TRAIN = AssertionConst.DATA_DIR + "preprocessed_data/negex"; // actually test + + public final static String SHARP_FEDA = "../ctakes-assertion-res/resources/model/sharptrain-feda"; + protected final static String I2B2_FEDA = "../ctakes-assertion-res/resources/model/i2b2train-feda"; + protected final static String MIPACQ_FEDA = "../ctakes-assertion-res/resources/model/mipacqtrain-feda"; + protected final static String NEGEX_FEDA = "../ctakes-assertion-res/resources/model/negextest-feda"; + protected final static String SHARP_I2B2_FEDA = "../ctakes-assertion-res/resources/model/sharptrain+i2b2train-feda"; + protected final static String SHARP_MIPACQ_FEDA = "../ctakes-assertion-res/resources/model/sharptrain+mipacqtrain-feda"; + protected final static String SHARP_NEGEX_FEDA = "../ctakes-assertion-res/resources/model/sharptrain+negextest-feda"; + protected final static String I2B2_MIPACQ_NEGEX_FEDA = "../ctakes-assertion-res/resources/model/i2b2train+mipacqtrain+negextest-feda"; + protected final static String SHARP_I2B2_MIPACQ_FEDA = "../ctakes-assertion-res/resources/model/sharptrain+i2b2train+mipacqtrain-feda"; + protected final static String SHARP_MIPACQ_NEGEX_FEDA = "../ctakes-assertion-res/resources/model/sharptrain+mipacqtrain+negextest-feda"; + protected final static String SHARP_I2B2_NEGEX_FEDA = "../ctakes-assertion-res/resources/model/sharptrain+i2b2train+negextest-feda"; + protected final static String SHARP_I2B2_MIPACQ_NEGEX_FEDA = "../ctakes-assertion-res/resources/model/sharpi2b2mipacqnegex-feda"; + + public static BiMap trainGrid = HashBiMap.create(); + static { + trainGrid.put(SHARP_TRAIN, SHARP_FEDA); + trainGrid.put(I2B2_TRAIN, I2B2_FEDA); + trainGrid.put(MIPACQ_TRAIN, MIPACQ_FEDA); + trainGrid.put(NEGEX_TRAIN, NEGEX_FEDA); + trainGrid.put(SHARP_TRAIN+":"+I2B2_TRAIN, SHARP_I2B2_FEDA); + trainGrid.put(SHARP_TRAIN+":"+MIPACQ_TRAIN, SHARP_MIPACQ_FEDA); + trainGrid.put(SHARP_TRAIN+":"+NEGEX_TRAIN, SHARP_NEGEX_FEDA); + trainGrid.put(I2B2_TRAIN+":"+MIPACQ_TRAIN+":"+NEGEX_TRAIN, I2B2_MIPACQ_NEGEX_FEDA); + trainGrid.put(SHARP_TRAIN+":"+I2B2_TRAIN+":"+MIPACQ_TRAIN, SHARP_I2B2_MIPACQ_FEDA); + trainGrid.put(SHARP_TRAIN+":"+MIPACQ_TRAIN+":"+NEGEX_TRAIN, SHARP_MIPACQ_NEGEX_FEDA); + trainGrid.put(SHARP_TRAIN+":"+I2B2_TRAIN+":"+NEGEX_TRAIN, SHARP_I2B2_NEGEX_FEDA); + trainGrid.put(SHARP_TRAIN+":"+I2B2_TRAIN+":"+MIPACQ_TRAIN+":"+NEGEX_TRAIN, + SHARP_I2B2_MIPACQ_NEGEX_FEDA); + } + + public static void main(String[] args) throws Exception { + + String attribute = "polarity"; + + + for (Entry oneTrain : trainGrid.entrySet()) { + + ArrayList params = new ArrayList(); + + params.add("--train-dir"); params.add(oneTrain.getKey()); + params.add("--models-dir"); params.add(oneTrain.getValue()); + params.add("--train-only"); + params.add("--feature-selection"); params.add(Float.toString(0.000000000001f)); + params.add("--feda"); + + // Build up an "ignore" string + for (String ignoreAttribute : AssertionConst.allAnnotationTypes) { + if (!ignoreAttribute.equals(attribute)) { + + if (ignoreAttribute.equals("historyOf")) { + ignoreAttribute = ignoreAttribute.substring(0, ignoreAttribute.length()-2); + } + + params.add("--ignore-" + ignoreAttribute); + } + } + String[] paramList = params.toArray(new String[]{}); + + // Run the actual assertion training on just one attribute + AssertionEvaluation.main( paramList ); + } + + + + } +} Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/PolarityDomainAdaptationTrain.java ------------------------------------------------------------------------------ svn:mime-type = text/plain