ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vjapa...@apache.org
Subject svn commit: r1551254 [18/26] - in /ctakes/branches/ytex: ctakes-ytex-res/ ctakes-ytex-res/.settings/ ctakes-ytex-res/src/ ctakes-ytex-res/src/main/ ctakes-ytex-res/src/main/resources/ ctakes-ytex-res/src/main/resources/org/ ctakes-ytex-res/src/main/res...
Date Mon, 16 Dec 2013 16:30:40 GMT
Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGenerator.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGenerator.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGenerator.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGenerator.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,49 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.util.SortedMap;
+
+public interface FoldGenerator {
+
+	/**
+	 * Generate cross validation folds, store in database.
+	 * 
+	 * @param corpusName
+	 *            name of the corpus the folds are stored under
+	 * @param splitName
+	 *            name of the split the folds are stored under; may be null
+	 * @param query
+	 *            query to get instance id - label - class triples
+	 * @param nFolds
+	 *            number of folds to generate
+	 * @param nMinPerClass
+	 *            minimum number of instances of each class per fold
+	 * @param nSeed
+	 *            random number seed; if null the current time in millis is
+	 *            used
+	 * @param nRuns
+	 *            number of runs
+	 */
+	public abstract void generateRuns(String corpusName, String splitName,
+			String query, int nFolds, int nMinPerClass, Integer nSeed, int nRuns);
+
+	/**
+	 * Generate cross validation folds, don't store in database.
+	 * 
+	 * @param labelToInstanceMap
+	 *            an instance class map without folds, nested as label - run -
+	 *            fold - train flag - instance id - class; see
+	 *            {@link InstanceData#labelToInstanceMap}
+	 * @param nFolds
+	 *            number of folds
+	 * @param nMinPerClass
+	 *            minimum instances per class
+	 * @param nSeed
+	 *            random seed; if null defaults to System.currentTimeMillis()
+	 * @param nRuns
+	 *            number of runs
+	 * @return map with the same structure as labelToInstanceMap, but with
+	 *         the instances of each label split into runs and folds
+	 */
+	public SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> generateRuns(
+			SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> labelToInstanceMap,
+			int nFolds,
+			int nMinPerClass,
+			Integer nSeed,
+			int nRuns);
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGeneratorImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGeneratorImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGeneratorImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/FoldGeneratorImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,416 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Random;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
+import org.apache.ctakes.ytex.kernel.model.CrossValidationFold;
+import org.apache.ctakes.ytex.kernel.model.CrossValidationFoldInstance;
+
+
+/**
+ * utility generates cv fold splits, stores in db. Takes as a command line
+ * parameter -prop [property file]. Also reads properties from org.apache.ctakes.ytex.properties.
+ * Required properties:
+ * <ul>
+ * <li>org.apache.ctakes.ytex.corpusName
+ * <li>instanceClassQuery
+ * </ul>
+ * 
+ * Optional properties:
+ * <ul>
+ * <li>minPerClass default 1 minimum number of instances per class/fold. if not
+ * enough instances of a specific class, the instance will be repeated across
+ * folds. E.g. if you have only one example of one class, and 2 folds, that one
+ * example will be duplicated in both folds.
+ * <li>rand random number seed, defaults to current time millis
+ * <li>org.apache.ctakes.ytex.splitName default null cv_fold.split_name
+ * <li>folds default 2
+ * <li>runs default 5
+ * </ul>
+ * 
+ * @author vijay
+ */
+public class FoldGeneratorImpl implements FoldGenerator {
+	private static final Log log = LogFactory.getLog(FoldGeneratorImpl.class);
+
+	/**
+	 * Stratified split: for every class, shuffle its instance ids and spread
+	 * them over nFolds folds, then merge the per-class folds so that fold i
+	 * holds the i-th portion of every class.
+	 * <p>
+	 * The lists held by mapClassToInstanceId are shuffled in place.
+	 * 
+	 * @param mapClassToInstanceId
+	 *            class name to the list of instance ids with that class
+	 * @param nFolds
+	 *            number of folds to create; 0 leads to a division by zero as
+	 *            soon as the map holds one class
+	 * @param nMinPerClass
+	 *            minimum number of instances of each class per fold
+	 * @param r
+	 *            random number generator used for shuffling and sampling
+	 * @return list with nFolds sets of instance ids corresponding to the folds
+	 */
+	private static List<Set<Long>> createFolds(
+			Map<String, List<Long>> mapClassToInstanceId, int nFolds,
+			int nMinPerClass, Random r) {
+		List<Set<Long>> folds = new ArrayList<Set<Long>>(nFolds);
+		// class name -> the nFolds instance id sets created for that class
+		Map<String, List<Set<Long>>> mapLabelFolds = new HashMap<String, List<Set<Long>>>();
+		for (Map.Entry<String, List<Long>> classToInstanceId : mapClassToInstanceId
+				.entrySet()) {
+			List<Long> instanceIds = classToInstanceId.getValue();
+			// randomize the order; this modifies the caller's list
+			Collections.shuffle(instanceIds, r);
+			List<Set<Long>> classFolds = new ArrayList<Set<Long>>(nFolds);
+			// integer division: the remainder ends up in the last fold
+			int blockSize = instanceIds.size() / nFolds;
+			for (int i = 0; i < nFolds; i++) {
+				Set<Long> foldInstanceIds = new HashSet<Long>(blockSize);
+				if (instanceIds.size() <= nMinPerClass) {
+					// we don't have minPerClass for the given class
+					// just add all of them to each fold
+					foldInstanceIds.addAll(instanceIds);
+				} else if (blockSize < nMinPerClass) {
+					// too few of the given class - just randomly select
+					// nMinPerClass
+					// probability with which a visited instance is accepted
+					double fraction = (double) nMinPerClass
+							/ (double) instanceIds.size();
+					// iterate through the list, start somewhere in the middle
+					int instanceIdIndex = (int) (r.nextDouble() * instanceIds
+							.size());
+					// the set ignores repeats, so this loops until
+					// nMinPerClass distinct ids were picked
+					while (foldInstanceIds.size() < nMinPerClass) {
+						// go back to beginning of list if we hit the end
+						if (instanceIdIndex >= instanceIds.size()) {
+							instanceIdIndex = 0;
+						}
+						// randomly select this line
+						if (r.nextDouble() <= fraction) {
+							long instanceId = instanceIds.get(instanceIdIndex);
+							foldInstanceIds.add(instanceId);
+						}
+						// go to next line
+						instanceIdIndex++;
+					}
+				} else {
+					// enough instances: fold i gets the i-th contiguous block
+					// of the shuffled list, the last fold runs to the end
+					int nStart = i * blockSize;
+					int nEnd = (i == nFolds - 1) ? instanceIds.size() : nStart
+							+ blockSize;
+					for (int instanceIdIndex = nStart; instanceIdIndex < nEnd; instanceIdIndex++) {
+						foldInstanceIds.add(instanceIds.get(instanceIdIndex));
+					}
+				}
+				classFolds.add(foldInstanceIds);
+			}
+			mapLabelFolds.put(classToInstanceId.getKey(), classFolds);
+		}
+		// fold i of the result is the union of fold i of every class
+		for (int i = 0; i < nFolds; i++) {
+			Set<Long> foldInstanceIds = new HashSet<Long>();
+			for (List<Set<Long>> labelFold : mapLabelFolds.values()) {
+				foldInstanceIds.addAll(labelFold.get(i));
+			}
+			folds.add(foldInstanceIds);
+		}
+		return folds;
+	}
+
+	/**
+	 * Command line entry point. Loads the property file given with -prop,
+	 * reads corpus name, split name, query, folds, runs, minPerClass and rand
+	 * from it and hands them to the FoldGenerator bean of the kernel
+	 * application context. Prints the usage when called without arguments or
+	 * when the arguments cannot be parsed; exits with status 1 when a
+	 * required property is missing.
+	 * 
+	 * @param args
+	 *            command line arguments
+	 * @throws ParseException
+	 *             declared for the command line parser; parse failures are
+	 *             caught and answered with the usage text
+	 * @throws IOException
+	 *             if the properties cannot be loaded
+	 */
+	@SuppressWarnings("static-access")
+	public static void main(String args[]) throws ParseException, IOException {
+		Options options = new Options();
+		options.addOption(OptionBuilder
+				.withArgName("prop")
+				.hasArg()
+				.withDescription(
+						"property file with query to retrieve instance id - label - class triples")
+				.create("prop"));
+		// nothing to do without arguments - show the usage
+		if (args.length == 0) {
+			printHelp(options);
+			return;
+		}
+		CommandLine line;
+		try {
+			CommandLineParser parser = new GnuParser();
+			line = parser.parse(options, args);
+		} catch (ParseException pe) {
+			printHelp(options);
+			return;
+		}
+		Properties props = FileUtil.loadProperties(
+				line.getOptionValue("prop"), true);
+		String corpusName = props
+				.getProperty("org.apache.ctakes.ytex.corpusName");
+		String splitName = props
+				.getProperty("org.apache.ctakes.ytex.splitName");
+		String query = props.getProperty("instanceClassQuery");
+		// numeric settings are parsed before the required properties are
+		// checked, so a malformed number fails first
+		int folds = Integer.parseInt(props.getProperty("folds", "2"));
+		int runs = Integer.parseInt(props.getProperty("runs", "5"));
+		int minPerClass = Integer.parseInt(props.getProperty("minPerClass",
+				"1"));
+		Integer rand = null;
+		if (props.containsKey("rand")) {
+			rand = Integer.parseInt(props.getProperty("rand"));
+		}
+		// report every missing required property, not just the first
+		boolean argsOk = true;
+		if (corpusName == null) {
+			log.error("missing parameter: org.apache.ctakes.ytex.corpusName");
+			argsOk = false;
+		}
+		if (query == null) {
+			log.error("missing parameter: instanceClassQuery");
+			argsOk = false;
+		}
+		if (argsOk) {
+			FoldGenerator generator = KernelContextHolder
+					.getApplicationContext().getBean(FoldGenerator.class);
+			generator.generateRuns(corpusName, splitName, query, folds,
+					minPerClass, rand, runs);
+		} else {
+			printHelp(options);
+			System.exit(1);
+		}
+	}
+
+	/**
+	 * Print the usage text together with the supported options.
+	 * 
+	 * @param options
+	 *            the command line options to describe
+	 */
+	private static void printHelp(Options options) {
+		String usage = "java org.apache.ctakes.ytex.kernel.FoldGeneratorImpl splits training data into mxn training/test sets for mxn-fold cross validation";
+		new HelpFormatter().printHelp(usage, options);
+	}
+
+	/**
+	 * DAO used to delete existing folds and to save newly generated ones; set
+	 * via {@link #setClassifierEvaluationDao(ClassifierEvaluationDao)}.
+	 */
+	ClassifierEvaluationDao classifierEvaluationDao;
+
+	/**
+	 * helper used to load the instances for a query; set via
+	 * {@link #setKernelUtil(KernelUtil)}.
+	 */
+	KernelUtil kernelUtil;
+
+	/**
+	 * Generate and store the folds of a single run for every label found in
+	 * the instance data.
+	 * 
+	 * @param labels
+	 *            not used by this method
+	 * @param instances
+	 *            instance data; only its label to instance map is read
+	 * @param corpusName
+	 *            corpus name stored with each fold
+	 * @param splitName
+	 *            split name stored with each fold
+	 * @param run
+	 *            number of the run the folds belong to
+	 * @param query
+	 *            not used by this method
+	 * @param nFolds
+	 *            number of folds to create
+	 * @param nMinPerClass
+	 *            minimum number of instances of each class per fold
+	 * @param r
+	 *            random number generator used to create the folds
+	 */
+	public void generateFolds(Set<String> labels, InstanceData instances,
+			String corpusName, String splitName, int run, String query,
+			int nFolds, int nMinPerClass, Random r) {
+		for (String label : instances.getLabelToInstanceMap().keySet()) {
+			// the input is not expected to be split yet: take the first run,
+			// the first fold and the first train/test entry to reach the
+			// instance id - class map
+			SortedMap<Long, String> instanceIdToClass = instances
+					.getLabelToInstanceMap().get(label)
+					.values().iterator().next()
+					.values().iterator().next()
+					.values().iterator().next();
+			List<Set<Long>> labelFolds = createFolds(nFolds, nMinPerClass, r,
+					instanceIdToClass);
+			// persist the folds of this label
+			insertFolds(labelFolds, corpusName, splitName, label, run);
+		}
+	}
+
+	/**
+	 * invert the map of instance id to class, then call the stratified
+	 * createFolds
+	 * 
+	 * @param nFolds
+	 *            number of folds to create
+	 * @param nMinPerClass
+	 *            minimum number of instances of each class per fold
+	 * @param r
+	 *            random number generator
+	 * @param mapInstanceIdToClass
+	 *            instance id to class name
+	 * @return list with nFolds sets of instance ids
+	 */
+	private List<Set<Long>> createFolds(int nFolds, int nMinPerClass, Random r,
+			SortedMap<Long, String> mapInstanceIdToClass) {
+		// class name -> instance ids, classes kept in sorted order
+		Map<String, List<Long>> idsByClass = new TreeMap<String, List<Long>>();
+		for (Map.Entry<Long, String> entry : mapInstanceIdToClass.entrySet()) {
+			String clazz = entry.getValue();
+			long id = entry.getKey();
+			List<Long> ids = idsByClass.get(clazz);
+			if (ids != null) {
+				ids.add(id);
+				continue;
+			}
+			// first instance of this class
+			ids = new ArrayList<Long>();
+			idsByClass.put(clazz, ids);
+			ids.add(id);
+		}
+		// stratified split into folds
+		return createFolds(idsByClass, nFolds, nMinPerClass, r);
+	}
+
+	/*
+	 * (non-Javadoc)
+	 * 
+	 * Loads the instances for the query, deletes the folds already stored for
+	 * corpusName/splitName and then generates the folds run by run.
+	 * 
+	 * @see org.apache.ctakes.ytex.kernel.FoldGenerator#generateRuns(java.lang.String,
+	 * java.lang.String, java.lang.String, int, int, java.lang.Integer, int)
+	 */
+	@Override
+	public void generateRuns(String corpusName, String splitName, String query,
+			int nFolds, int nMinPerClass, Integer nSeed, int nRuns) {
+		// no seed given: fall back to the current time
+		long seed = (nSeed == null) ? System.currentTimeMillis() : nSeed
+				.longValue();
+		Random r = new Random(seed);
+		SortedSet<String> labels = new TreeSet<String>();
+		// load first, so existing folds are only deleted once the instances
+		// are available
+		InstanceData instances = kernelUtil.loadInstances(query);
+		ClassifierEvaluationDao dao = this.getClassifierEvaluationDao();
+		dao.deleteCrossValidationFoldByName(corpusName, splitName);
+		int run = 1;
+		while (run <= nRuns) {
+			generateFolds(labels, instances, corpusName, splitName, run, query,
+					nFolds, nMinPerClass, r);
+			run++;
+		}
+	}
+
+	/**
+	 * @return the DAO used to delete and save cross validation folds
+	 */
+	public ClassifierEvaluationDao getClassifierEvaluationDao() {
+		return classifierEvaluationDao;
+	}
+
+	/**
+	 * @return the helper used to load instances
+	 */
+	public KernelUtil getKernelUtil() {
+		return kernelUtil;
+	}
+
+	/**
+	 * insert the folds into the database. One CrossValidationFold is saved per
+	 * fold number; it holds every instance of every fold, flagged as training
+	 * instance unless it comes from the fold with that number.
+	 * 
+	 * @param folds
+	 *            instance ids per fold
+	 * @param corpusName
+	 *            corpus name stored with each fold
+	 * @param splitName
+	 *            split name stored with each fold
+	 * @param label
+	 *            label stored with each fold
+	 * @param run
+	 *            run number stored with each fold
+	 */
+	private void insertFolds(List<Set<Long>> folds, String corpusName,
+			String splitName, String label, int run) {
+		final int foldCount = folds.size();
+		for (int testIdx = 0; testIdx < foldCount; testIdx++) {
+			Set<CrossValidationFoldInstance> members = new HashSet<CrossValidationFoldInstance>();
+			for (int idx = 0; idx < foldCount; idx++) {
+				// everything outside the current fold is training data
+				boolean train = idx != testIdx;
+				for (long instanceId : folds.get(idx)) {
+					members.add(new CrossValidationFoldInstance(instanceId,
+							train));
+				}
+			}
+			// fold numbers are stored 1-based
+			classifierEvaluationDao.saveFold(new CrossValidationFold(
+					corpusName, splitName, label, run, testIdx + 1, members));
+		}
+	}
+
+	/**
+	 * @param classifierEvaluationDao
+	 *            the DAO used to delete and save cross validation folds
+	 */
+	public void setClassifierEvaluationDao(
+			ClassifierEvaluationDao classifierEvaluationDao) {
+		this.classifierEvaluationDao = classifierEvaluationDao;
+	}
+
+	/**
+	 * @param kernelUtil
+	 *            the helper used to load instances
+	 */
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * In-memory variant: for every label, read the instance id - class map
+	 * stored under run 0 / fold 0 / train, and split it nRuns times into
+	 * folds. In the result runs and folds are numbered from 1; within a fold
+	 * the instances of that fold form the test set (key false) and all other
+	 * instances the training set (key true).
+	 */
+	@Override
+	public SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> generateRuns(
+			SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> labelToInstanceMap,
+			int nFolds, int nMinPerClass, Integer nSeed, int nRuns) {
+		SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> result = new TreeMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>>();
+		// one generator for all labels and runs
+		Random r = new Random(nSeed != null ? nSeed
+				: System.currentTimeMillis());
+		for (Map.Entry<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> labelEntry : labelToInstanceMap
+				.entrySet()) {
+			// the unsplit instance id - class map of this label
+			SortedMap<Long, String> instanceClassMap = labelEntry.getValue()
+					.get(0).get(0).get(true);
+			SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>> runMap = new TreeMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>();
+			result.put(labelEntry.getKey(), runMap);
+			for (int run = 1; run <= nRuns; run++) {
+				List<Set<Long>> folds = createFolds(nFolds, nMinPerClass, r,
+						instanceClassMap);
+				SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>> foldMap = new TreeMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>();
+				runMap.put(run, foldMap);
+				int foldNum = 0;
+				for (Set<Long> testIds : folds) {
+					foldNum++;
+					SortedMap<Long, String> trainSet = new TreeMap<Long, String>();
+					SortedMap<Long, String> testSet = new TreeMap<Long, String>();
+					SortedMap<Boolean, SortedMap<Long, String>> trainTestMap = new TreeMap<Boolean, SortedMap<Long, String>>();
+					trainTestMap.put(true, trainSet);
+					trainTestMap.put(false, testSet);
+					foldMap.put(foldNum, trainTestMap);
+					// members of the current fold are tested, the rest is
+					// trained on
+					for (Map.Entry<Long, String> instance : instanceClassMap
+							.entrySet()) {
+						long instanceId = instance.getKey();
+						if (testIds.contains(instanceId)) {
+							testSet.put(instanceId, instance.getValue());
+						} else {
+							trainSet.put(instanceId, instance.getValue());
+						}
+					}
+				}
+			}
+		}
+		return result;
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IRMetrics.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IRMetrics.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IRMetrics.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IRMetrics.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,95 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.text.MessageFormat;
+
+/**
+ * Counts of true/false positives and negatives together with the precision,
+ * recall and F1 derived from them.
+ */
+public class IRMetrics {
+	/**
+	 * per-thread formatter that prints a number with at most 3 fraction
+	 * digits; kept per thread so no formatter instance is shared.
+	 */
+	private static final ThreadLocal<MessageFormat> doubleFormat = new ThreadLocal<MessageFormat>() {
+
+		@Override
+		protected MessageFormat initialValue() {
+			return new MessageFormat("{0,number,#.###}");
+		}
+
+	};
+
+	/**
+	 * format a number with at most 3 fraction digits.
+	 * 
+	 * @param d
+	 *            the number to format
+	 * @return the formatted number
+	 */
+	public static String formatDouble(double d) {
+		// Double.valueOf instead of the deprecated boxing constructor
+		return doubleFormat.get().format(new Object[] { Double.valueOf(d) });
+	}
+
+	/** number of true positives */
+	int tp;
+	/** number of false positives */
+	int fp;
+	/** number of true negatives */
+	int tn;
+	/** number of false negatives */
+	int fn;
+
+	/**
+	 * create metrics with all counts set to 0.
+	 */
+	public IRMetrics() {
+		super();
+	}
+
+	/**
+	 * @param tp
+	 *            number of true positives
+	 * @param fp
+	 *            number of false positives
+	 * @param tn
+	 *            number of true negatives
+	 * @param fn
+	 *            number of false negatives
+	 */
+	public IRMetrics(int tp, int fp, int tn, int fn) {
+		super();
+		this.tp = tp;
+		this.fp = fp;
+		this.tn = tn;
+		this.fn = fn;
+	}
+
+	public int getTp() {
+		return tp;
+	}
+
+	public void setTp(int tp) {
+		this.tp = tp;
+	}
+
+	public int getFp() {
+		return fp;
+	}
+
+	public void setFp(int fp) {
+		this.fp = fp;
+	}
+
+	public int getTn() {
+		return tn;
+	}
+
+	public void setTn(int tn) {
+		this.tn = tn;
+	}
+
+	public int getFn() {
+		return fn;
+	}
+
+	public void setFn(int fn) {
+		this.fn = fn;
+	}
+
+	/**
+	 * @return tp / (tp + fp), or 0 if tp + fp is not positive
+	 */
+	public double getPrecision() {
+		int predictedPositive = tp + fp;
+		return predictedPositive > 0 ? (double) tp
+				/ (double) predictedPositive : 0;
+	}
+
+	/**
+	 * @return tp / (tp + fn), or 0 if tp + fn is not positive
+	 */
+	public double getRecall() {
+		int actualPositive = tp + fn;
+		return actualPositive > 0 ? (double) tp / (double) actualPositive : 0;
+	}
+
+	/**
+	 * @return harmonic mean of precision and recall, or 0 if their sum is
+	 *         not positive
+	 */
+	public double getF1() {
+		// compute both once instead of three times each
+		double precision = getPrecision();
+		double recall = getRecall();
+		double sum = precision + recall;
+		return sum > 0 ? 2 * precision * recall / sum : 0;
+	}
+
+	/**
+	 * return tab delimited ir metrics:<br/>
+	 * tp fp tn fn precision recall f1
+	 */
+	@Override
+	public String toString() {
+		StringBuilder b = new StringBuilder();
+		b.append(this.getTp()).append("\t").append(this.getFp()).append("\t")
+				.append(this.getTn()).append("\t").append(this.getFn())
+				.append("\t").append(formatDouble(this.getPrecision()))
+				.append("\t").append(formatDouble(this.getRecall()))
+				.append("\t").append(formatDouble(this.getF1()));
+		return b.toString();
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluator.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluator.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluator.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluator.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,30 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+
+import org.apache.ctakes.ytex.kernel.ImputedFeatureEvaluatorImpl.Parameters;
+
+
+/**
+ * Evaluates the features of a corpus; the evaluation is configured either by
+ * a property file or by a Parameters object.
+ */
+public interface ImputedFeatureEvaluator {
+	/**
+	 * the supported measures; each constant carries the lower case name
+	 * returned by {@link #getName()}.
+	 */
+	public enum MeasureType {
+		MUTUALINFO("mutualinfo"), INFOGAIN("infogain");
+		String name;
+
+		public String getName() {
+			return name;
+		}
+
+		MeasureType(String name) {
+			this.name = name;
+		}
+	};
+
+	// NOTE(review): the suffixes below are presumably appended to a measure
+	// name to build the type of a feature evaluation (e.g.
+	// "infogain-imputed") - confirm against the implementation
+	public static final String SUFFIX_PROP = "-propagated";
+	public static final String SUFFIX_IMPUTED = "-imputed";
+	public static final String SUFFIX_IMPUTED_FILTERED = "-imputed-filt";
+
+	/**
+	 * Evaluate the corpus described by a property file.
+	 * 
+	 * @param propFile
+	 *            name of the property file holding the evaluation settings
+	 * @return result flag of the evaluation; NOTE(review): presumably true
+	 *         on success - confirm against the implementation
+	 * @throws IOException
+	 *             presumably if the property file cannot be read
+	 */
+	public abstract boolean evaluateCorpus(String propFile) throws IOException;
+
+	/**
+	 * Evaluate the corpus described by the given parameters.
+	 * 
+	 * @param params
+	 *            the evaluation settings
+	 * @return result flag of the evaluation; NOTE(review): presumably true
+	 *         on success - confirm against the implementation
+	 */
+	boolean evaluateCorpus(Parameters params);
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluatorImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluatorImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluatorImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/ImputedFeatureEvaluatorImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,1318 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Properties;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
+import org.apache.ctakes.ytex.kernel.dao.ConceptDao;
+import org.apache.ctakes.ytex.kernel.model.ConcRel;
+import org.apache.ctakes.ytex.kernel.model.ConceptGraph;
+import org.apache.ctakes.ytex.kernel.model.CrossValidationFold;
+import org.apache.ctakes.ytex.kernel.model.FeatureEvaluation;
+import org.apache.ctakes.ytex.kernel.model.FeatureParentChild;
+import org.apache.ctakes.ytex.kernel.model.FeatureRank;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.RowCallbackHandler;
+import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
+import org.springframework.transaction.PlatformTransactionManager;
+
+import weka.core.ContingencyTables;
+
+/**
+ * Calculate the mutual information of each concept of a corpus wrt a concept
+ * graph and classification task (label) and possibly a fold. We calculate the
+ * following:
+ * <ul>
+ * <li>raw mutual information of each concept (infogain). We calculate the joint
+ * distribution of concepts (X) and document classes (Y), and compute the mutual
+ * information for each concept.
+ * <li>mutual information inherited by parents (infogain-parent). For each
+ * concept in the concept graph, we merge the joint distribution of child
+ * concepts. This is done recursively.
+ * <li>mutual information inherited by children from parents (infogain-child).
+ * We take the top n concepts and assign their children (entire subgraph) the
+ * mutual info of the parent.
+ * </ul>
+ * <p>
+ * The mutual information of each concept is stored in the feature_rank table.
+ * The related records in the feature_eval table have the following values:
+ * <ul>
+ * <li>type = infogain, infogain-parent, infogain-imputed, infogain-imputed-filt
+ * <li>feature_set_name = conceptSetName
+ * <li>param1 = conceptGraphName
+ * </ul>
+ * 
+ * How this works in broad strokes:
+ * <ul>
+ * <li> {@link #evaluateCorpus(Parameters)} load instances, iterate through
+ * labels
+ * <li>
+ * {@link #evaluateCorpusLabel(Parameters, ConceptGraph, InstanceData, String)}
+ * load concept - set[document] map for the specified label, iterate through
+ * folds
+ * <li>
+ * {@link #evaluateCorpusFold(Parameters, Map, ConceptGraph, InstanceData, String, Map, int)}
+ * create raw joint distribution of each concept, compute parent joint
+ * distributions, assign children mutual info of parents
+ * <li> {@link #completeJointDistroForFold(Map, Map, Set, Set, String)} computes
+ * raw joint distribution of each concept
+ * <li>
+ * {@link #propagateJointDistribution(Map, Parameters, String, int, ConceptGraph, Map)}
+ * recursively compute parent joint distribution by merging joint distro of
+ * children.
+ * <li>{@link #storeChildConcepts(Parameters, String, int, ConceptGraph)} take
+ * top ranked parent concepts, assign concepts in subtrees the mutual info of
+ * parents. Only concepts that exist in the corpus are added (depends on
+ * computing the infocontent of concepts with CorpusEvaluator)
+ * </ul>
+ * 
+ * 
+ * @author vijay
+ * 
+ */
+public class ImputedFeatureEvaluatorImpl implements ImputedFeatureEvaluator {
+
+	/**
+	 * Row callback that fills a map of concept id - bin - instance ids. Each
+	 * row is read as (concept id, instance id, bin) from columns 1 to 3.
+	 * 
+	 * @author vijay
+	 * 
+	 */
+	public class ConceptInstanceMapExtractor implements RowCallbackHandler {
+		ConceptGraph cg;
+		Map<String, Map<String, Set<Long>>> conceptInstanceMap;
+
+		ConceptInstanceMapExtractor(
+				Map<String, Map<String, Set<Long>>> conceptInstanceMap,
+				ConceptGraph cg) {
+			this.conceptInstanceMap = conceptInstanceMap;
+			this.cg = cg;
+		}
+
+		/**
+		 * add the instance of the current row to the set kept for its
+		 * concept and bin, creating the intermediate containers on first use.
+		 */
+		public void processRow(ResultSet rs) throws SQLException {
+			final String conceptId = rs.getString(1);
+			final long instanceId = rs.getLong(2);
+			final String bin = rs.getString(3);
+			// bins seen so far for this concept
+			Map<String, Set<Long>> bins = conceptInstanceMap.get(conceptId);
+			if (bins == null) {
+				// first row of this concept; 2 bins is the expected size
+				bins = new HashMap<String, Set<Long>>(2);
+				conceptInstanceMap.put(conceptId, bins);
+			}
+			// instances seen so far for this concept and bin
+			Set<Long> binInstances = bins.get(bin);
+			if (binInstances == null) {
+				binInstances = new HashSet<Long>();
+				bins.put(bin, binInstances);
+			}
+			binInstances.add(instanceId);
+		}
+	}
+
+	/**
+	 * joint distribution of concept (x) and class (y). The bins for x and y are
+	 * predetermined. Typical levels for x are 0/1 (absent/present) and -1/0/1
+	 * (negated/not present/affirmed).
+	 * 
+	 * @author vijay
+	 * 
+	 */
+	public static class JointDistribution {
+		/**
+		 * Merge several joint distributions into one. For each value of Y,
+		 * every X bin other than xMerge receives the instances that are in
+		 * that bin in all of the given distributions (intersection). The
+		 * xMerge bin starts with all instances of the y margin and loses
+		 * every instance that landed in one of the other bins.
+		 * <p>
+		 * The x and y values are taken from the first distribution. Relies on
+		 * getInstances returning the modifiable set of the requested cell.
+		 * 
+		 * @param jointDistros
+		 *            list of joint distribution tables to merge; must not be
+		 *            empty
+		 * @param yMargin
+		 *            map of y val - instance ids; must hold an entry for
+		 *            every y val
+		 * @param xMerge
+		 *            the x val that collects everything that doesn't land in
+		 *            any of the other bins
+		 * @return the merged joint distribution
+		 */
+		public static JointDistribution merge(
+				List<JointDistribution> jointDistros,
+				Map<String, Set<Long>> yMargin, String xMerge) {
+			JointDistribution first = jointDistros.get(0);
+			Set<String> xVals = first.xVals;
+			Set<String> yVals = first.yVals;
+			JointDistribution merged = new JointDistribution(xVals, yVals);
+			for (String y : yVals) {
+				// the merge bin starts out with every instance of this y
+				Set<Long> leftover = merged.getInstances(xMerge, y);
+				leftover.addAll(yMargin.get(y));
+				for (String x : xVals) {
+					if (x.equals(xMerge)) {
+						continue;
+					}
+					Set<Long> common = merged.getInstances(x, y);
+					// seed with the first table, intersect with the others
+					Iterator<JointDistribution> distros = jointDistros
+							.iterator();
+					common.addAll(distros.next().getInstances(x, y));
+					while (distros.hasNext()) {
+						common.retainAll(distros.next().getInstances(x, y));
+					}
+					// whatever landed in this bin leaves the merge bin
+					leftover.removeAll(common);
+				}
+			}
+			return merged;
+		}
+
+		/**
+		 * y*x table of cell sizes derived from {@link #jointDistroTable}.
+		 * Built on the first call to getContingencyTable() and reused after.
+		 */
+		protected double[][] contingencyTable;
+		/**
+		 * the entropy of X. Calculated once and returned as needed.
+		 */
+		protected Double entropyX = null;
+		/**
+		 * the entropy of X*Y. Calculated once and returned as needed.
+		 */
+		protected Double entropyXY = null;
+		/**
+		 * A y*x table where the cells hold the instance ids. We use the
+		 * instance ids instead of counts so we can merge the tables.
+		 */
+		protected SortedMap<String, SortedMap<String, Set<Long>>> jointDistroTable;
+		/**
+		 * the possible values of X (e.g. concept)
+		 */
+		protected Set<String> xVals;
+
+		/**
+		 * the possible values of Y (e.g. text)
+		 */
+		protected Set<String> yVals;
+
+		/**
+		 * Create a joint distribution table that has an empty instance set in
+		 * every (y, x) cell.
+		 *
+		 * @param xVals
+		 *            the possible x values (bins)
+		 * @param yVals
+		 *            the possible y values (bins)
+		 */
+		public JointDistribution(Set<String> xVals, Set<String> yVals) {
+			this.xVals = xVals;
+			this.yVals = yVals;
+			this.jointDistroTable = new TreeMap<String, SortedMap<String, Set<Long>>>();
+			for (String y : yVals) {
+				// one row per y value, one empty cell per x value
+				SortedMap<String, Set<Long>> row = new TreeMap<String, Set<Long>>();
+				for (String x : xVals) {
+					row.put(x, new HashSet<Long>());
+				}
+				this.jointDistroTable.put(y, row);
+			}
+		}
+
+		/**
+		 * Create a joint distribution table and fill it from the marginal
+		 * instance sets. For every y value in yMargin, each instance is put in
+		 * the cell of the first x value (in xMargin iteration order) whose
+		 * instance set contains it; instances found under no x value go to the
+		 * xLeftover cell.
+		 *
+		 * @param xVals
+		 *            the possible x values (bins)
+		 * @param yVals
+		 *            the possible y values (bins)
+		 * @param xMargin
+		 *            map of x value to the instances with that value
+		 * @param yMargin
+		 *            map of y value to the instances with that value
+		 * @param xLeftover
+		 *            the x value that receives the unassigned instances
+		 */
+		public JointDistribution(Set<String> xVals, Set<String> yVals,
+				Map<String, Set<Long>> xMargin, Map<String, Set<Long>> yMargin,
+				String xLeftover) {
+			// same empty-table setup as the two-argument constructor
+			this(xVals, yVals);
+			// iterate over 'rows' i.e. the class names
+			for (Map.Entry<String, Set<Long>> classEntry : yMargin.entrySet()) {
+				SortedMap<String, Set<Long>> row = jointDistroTable
+						.get(classEntry.getKey());
+				// working copy: instances of this class not yet put in a cell
+				Set<Long> unassigned = new HashSet<Long>(classEntry.getValue());
+				// iterate over 'columns' i.e. the values of x
+				for (Map.Entry<String, Set<Long>> xEntry : xMargin.entrySet()) {
+					Set<Long> cell = row.get(xEntry.getKey());
+					// copy the instances, keep only those still unassigned
+					cell.addAll(xEntry.getValue());
+					cell.retainAll(unassigned);
+					// they are assigned now
+					unassigned.removeAll(cell);
+				}
+				if (!unassigned.isEmpty()) {
+					// add the leftovers to the leftover bin
+					row.get(xLeftover).addAll(unassigned);
+				}
+			}
+		}
+
+		// /**
+		// * add an instance to the joint probability table
+		// *
+		// * @param x
+		// * @param y
+		// * @param instanceId
+		// */
+		// public void addInstance(String x, String y, int instanceId) {
+		// // add the current row to the bin matrix
+		// SortedMap<String, Set<Integer>> xMap = jointDistroTable.get(y);
+		// if (xMap == null) {
+		// xMap = new TreeMap<String, Set<Integer>>();
+		// jointDistroTable.put(y, xMap);
+		// }
+		// Set<Integer> instanceSet = xMap.get(x);
+		// if (instanceSet == null) {
+		// instanceSet = new HashSet<Integer>();
+		// xMap.put(x, instanceSet);
+		// }
+		// instanceSet.add(instanceId);
+		// }
+
+		// /**
+		// * finalize the joint probability table wrt the specified instances.
+		// If
+		// * we are doing this per fold, then not all instances are going to be
+		// in
+		// * each fold. Limit to the instances in the specified fold.
+		// * <p>
+		// * Also, we might not have filled in all the cells. if necessary, put
+		// * instances in the 'leftover' cell, fill it in based on the marginal
+		// * distribution of the instances wrt classes.
+		// *
+		// * @param yMargin
+		// * map of values of y to the instances with that value
+		// * @param xLeftover
+		// * the value of x to assign to the leftover instances
+		// */
+		// public JointDistribution complete(Map<String, Set<Integer>> xMargin,
+		// Map<String, Set<Integer>> yMargin, String xLeftover) {
+		// JointDistribution foldDistro = new JointDistribution(this.xVals,
+		// this.yVals);
+		// for (Map.Entry<String, Set<Integer>> yEntry : yMargin.entrySet()) {
+		// // iterate over 'rows' i.e. the class names
+		// String yName = yEntry.getKey();
+		// Set<Integer> yInst = new HashSet<Integer>(yEntry.getValue());
+		// // iterate over 'columns' i.e. the values of x
+		// for (Map.Entry<String, Set<Integer>> xEntry : this.jointDistroTable
+		// .get(yName).entrySet()) {
+		// // copy the instances
+		// Set<Integer> foldXInst = foldDistro.jointDistroTable.get(
+		// yName).get(xEntry.getKey());
+		// foldXInst.addAll(xEntry.getValue());
+		// // keep only the ones that are in this fold
+		// foldXInst.retainAll(yInst);
+		// // remove the instances for this value of x from the set of
+		// // all instances
+		// yInst.removeAll(foldXInst);
+		// }
+		// if (yInst.size() > 0) {
+		// // add the leftovers to the leftover bin
+		// foldDistro.jointDistroTable.get(yEntry.getKey())
+		// .get(xLeftover).addAll(yInst);
+		// }
+		// }
+		// return foldDistro;
+		// }
+
+		/**
+		 * Get the table of cell sizes, one row per y value and one column per
+		 * x value (both in the iteration order of yVals / xVals). The table is
+		 * built on the first call and the same array is returned afterwards.
+		 *
+		 * @return y*x array of instance counts
+		 */
+		public double[][] getContingencyTable() {
+			if (contingencyTable != null) {
+				return contingencyTable;
+			}
+			double[][] counts = new double[yVals.size()][xVals.size()];
+			contingencyTable = counts;
+			int row = 0;
+			for (String y : yVals) {
+				SortedMap<String, Set<Long>> cells = jointDistroTable.get(y);
+				int col = 0;
+				for (String x : xVals) {
+					counts[row][col] = cells.get(x).size();
+					col++;
+				}
+				row++;
+			}
+			return contingencyTable;
+		}
+
+		/**
+		 * Get the entropy of X: the cell sizes are summed per column over all
+		 * rows of the table, normalized by the total, and passed to
+		 * entropy(double[]). The value is computed on the first call and
+		 * cached in entropyX.
+		 *
+		 * @return entropy of X
+		 */
+		public double getEntropyX() {
+			if (entropyX != null) {
+				return entropyX;
+			}
+			double[] xCounts = new double[xVals.size()];
+			double total = 0;
+			for (Map<String, Set<Long>> row : jointDistroTable.values()) {
+				int col = 0;
+				for (Set<Long> cell : row.values()) {
+					double cellSize = (double) cell.size();
+					total += cellSize;
+					xCounts[col] += cellSize;
+					col++;
+				}
+			}
+			// counts -> probabilities
+			for (int k = 0; k < xCounts.length; k++) {
+				xCounts[k] /= total;
+			}
+			entropyX = entropy(xCounts);
+			return entropyX;
+		}
+
+		/**
+		 * Get the entropy of the joint distribution: every cell size of the
+		 * table is normalized by the total and the resulting probabilities are
+		 * passed to entropy(double[]). The value is computed on the first call
+		 * and cached in entropyXY.
+		 *
+		 * @return entropy of X*Y
+		 */
+		public double getEntropyXY() {
+			if (entropyXY != null) {
+				return entropyXY;
+			}
+			double[] cellProbs = new double[xVals.size() * yVals.size()];
+			double total = 0;
+			int next = 0;
+			for (Map<String, Set<Long>> row : jointDistroTable.values()) {
+				for (Set<Long> cell : row.values()) {
+					cellProbs[next] = (double) cell.size();
+					total += cellProbs[next];
+					next++;
+				}
+			}
+			// counts -> probabilities
+			for (int k = 0; k < cellProbs.length; k++) {
+				cellProbs[k] /= total;
+			}
+			entropyXY = entropy(cellProbs);
+			return entropyXY;
+		}
+
+		public double getInfoGain() {
+			return ContingencyTables.entropyOverColumns(getContingencyTable())
+					- ContingencyTables
+							.entropyConditionedOnRows(getContingencyTable());
+		}
+
+		public Set<Long> getInstances(String x, String y) {
+			return jointDistroTable.get(y).get(x);
+		}
+
+		public double getMutualInformation(double entropyY) {
+			return entropyY + this.getEntropyX() - this.getEntropyXY();
+		}
+
+		/**
+		 * print out joint distribution table
+		 */
+		public String toString() {
+			StringBuilder b = new StringBuilder();
+			b.append(this.getClass().getCanonicalName());
+			b.append(" [jointDistro=(");
+			Iterator<Entry<String, SortedMap<String, Set<Long>>>> yIter = this.jointDistroTable
+					.entrySet().iterator();
+			while (yIter.hasNext()) {
+				Entry<String, SortedMap<String, Set<Long>>> yEntry = yIter
+						.next();
+				Iterator<Entry<String, Set<Long>>> xIter = yEntry.getValue()
+						.entrySet().iterator();
+				while (xIter.hasNext()) {
+					Entry<String, Set<Long>> xEntry = xIter.next();
+					b.append(xEntry.getValue().size());
+					if (xIter.hasNext())
+						b.append(", ");
+				}
+				if (yIter.hasNext())
+					b.append("| ");
+			}
+			b.append(")]");
+			return b.toString();
+		}
+	}
+
+	/**
+	 * We are passing around quite a few parameters. It gets to be a pain, so
+	 * put everything in an object.
+	 * 
+	 * @author vijay
+	 * 
+	 */
+	public static class Parameters {
+		String classFeatureQuery;
+		String conceptGraphName;
+		String conceptSetName;
+		String corpusName;
+		String freqQuery;
+		double imputeWeight;
+
+		String labelQuery;
+		MeasureType measure;
+		double minInfo;
+		Double parentConceptEvalThreshold;
+		Integer parentConceptTopThreshold;
+		String splitName;
+		String xLeftover;
+		String xMerge;
+		Set<String> xVals;
+
+		public Parameters() {
+
+		}
+
+		public Parameters(Properties props) {
+			corpusName = props.getProperty("org.apache.ctakes.ytex.corpusName");
+			conceptGraphName = props.getProperty("org.apache.ctakes.ytex.conceptGraphName");
+			conceptSetName = props.getProperty("org.apache.ctakes.ytex.conceptSetName");
+			splitName = props.getProperty("org.apache.ctakes.ytex.splitName");
+			labelQuery = props.getProperty("instanceClassQuery");
+			classFeatureQuery = props.getProperty("org.apache.ctakes.ytex.conceptInstanceQuery");
+			freqQuery = props.getProperty("org.apache.ctakes.ytex.freqQuery");
+			minInfo = Double.parseDouble(props.getProperty("min.info", "1e-4"));
+			String xValStr = props.getProperty("org.apache.ctakes.ytex.xVals", "0,1");
+			xVals = new HashSet<String>();
+			xVals.addAll(Arrays.asList(xValStr.split(",")));
+			xLeftover = props.getProperty("org.apache.ctakes.ytex.xLeftover", "0");
+			xMerge = props.getProperty("org.apache.ctakes.ytex.xMerge", "1");
+			this.measure = MeasureType.valueOf(props.getProperty(
+					"org.apache.ctakes.ytex.measure", "INFOGAIN"));
+			parentConceptEvalThreshold = FileUtil.getDoubleProperty(props,
+					"org.apache.ctakes.ytex.parentConceptEvalThreshold", null);
+			parentConceptTopThreshold = parentConceptEvalThreshold == null ? FileUtil
+					.getIntegerProperty(props,
+							"org.apache.ctakes.ytex.parentConceptTopThreshold", 25) : null;
+			imputeWeight = FileUtil.getDoubleProperty(props,
+					"org.apache.ctakes.ytex.imputeWeight", 1d);
+		}
+
+		public String getClassFeatureQuery() {
+			return classFeatureQuery;
+		}
+
+		public String getConceptGraphName() {
+			return conceptGraphName;
+		}
+
+		public String getConceptSetName() {
+			return conceptSetName;
+		}
+
+		public String getCorpusName() {
+			return corpusName;
+		}
+
+		public String getFreqQuery() {
+			return freqQuery;
+		}
+
+		public double getImputeWeight() {
+			return imputeWeight;
+		}
+
+		public String getLabelQuery() {
+			return labelQuery;
+		}
+
+		public MeasureType getMeasure() {
+			return measure;
+		}
+
+		public double getMinInfo() {
+			return minInfo;
+		}
+
+		public Double getParentConceptEvalThreshold() {
+			return parentConceptEvalThreshold;
+		}
+
+		public Integer getParentConceptTopThreshold() {
+			return parentConceptTopThreshold;
+		}
+
+		public String getSplitName() {
+			return splitName;
+		}
+
+		public String getxLeftover() {
+			return xLeftover;
+		}
+
+		public String getxMerge() {
+			return xMerge;
+		}
+
+		public Set<String> getxVals() {
+			return xVals;
+		}
+	}
+
+	// /**
+	// * iterates through query results and computes infogain
+	// *
+	// * @author vijay
+	// *
+	// */
+	// public class JointDistroExtractor implements RowCallbackHandler {
+	// /**
+	// * key - fold
+	// * <p/>
+	// * value - map of concept id - joint distribution
+	// */
+	// private Map<String, JointDistribution> jointDistroMap;
+	// private Set<String> xVals;
+	// private Set<String> yVals;
+	// private Map<Integer, String> instanceClassMap;
+	//
+	// public JointDistroExtractor(
+	// Map<String, JointDistribution> jointDistroMap,
+	// Set<String> xVals, Set<String> yVals,
+	// Map<Integer, String> instanceClassMap) {
+	// super();
+	// this.xVals = xVals;
+	// this.yVals = yVals;
+	// this.jointDistroMap = jointDistroMap;
+	// this.instanceClassMap = instanceClassMap;
+	// }
+	//
+	// public void processRow(ResultSet rs) throws SQLException {
+	// int instanceId = rs.getInt(1);
+	// String conceptId = rs.getString(2);
+	// String x = rs.getString(3);
+	// String y = instanceClassMap.get(instanceId);
+	// JointDistribution distro = jointDistroMap.get(conceptId);
+	// if (distro == null) {
+	// distro = new JointDistribution(xVals, yVals);
+	// jointDistroMap.put(conceptId, distro);
+	// }
+	// distro.addInstance(x, y, instanceId);
+	// }
+	// }
+
+	/** logger shared by all instances of this class */
+	private static final Log log = LogFactory
+			.getLog(ImputedFeatureEvaluatorImpl.class);
+
+	protected static double entropy(double[] classProbs) {
+		double entropy = 0;
+		double log2 = Math.log(2);
+		for (double prob : classProbs) {
+			if (prob > 0)
+				entropy += prob * Math.log(prob) / log2;
+		}
+		return entropy * -1;
+	}
+
+	/**
+	 * calculate entropy from a list/array of probabilities
+	 * 
+	 * @param classProbs
+	 * @return
+	 */
+	protected static double entropy(Iterable<Double> classProbs) {
+		double entropy = 0;
+		double log2 = Math.log(2);
+		for (double prob : classProbs) {
+			if (prob > 0)
+				entropy += prob * Math.log(prob) / log2;
+		}
+		return entropy * -1;
+	}
+
+	@SuppressWarnings("static-access")
+	public static void main(String args[]) throws ParseException, IOException {
+		Options options = new Options();
+		options.addOption(OptionBuilder
+				.withArgName("prop")
+				.hasArg()
+				.isRequired()
+				.withDescription(
+						"property file with queries and other parameters. todo desc")
+				.create("prop"));
+		try {
+			CommandLineParser parser = new GnuParser();
+			CommandLine line = parser.parse(options, args);
+			if (!KernelContextHolder.getApplicationContext()
+					.getBean(ImputedFeatureEvaluator.class)
+					.evaluateCorpus(line.getOptionValue("prop"))) {
+				printHelp(options);
+			}
+		} catch (ParseException pe) {
+			printHelp(options);
+		}
+	}
+
+	/**
+	 * Print the command line usage for this class.
+	 *
+	 * @param options
+	 *            the options to describe
+	 */
+	private static void printHelp(Options options) {
+		String cmdLineSyntax = "java "
+				+ ImputedFeatureEvaluatorImpl.class.getName()
+				+ " calculate raw, propagated, and imputed infogain for each feature";
+		new HelpFormatter().printHelp(cmdLineSyntax, options);
+	}
+
+	/** used to delete, save and look up feature evaluations and cv folds */
+	protected ClassifierEvaluationDao classifierEvaluationDao;
+
+	/** used to load the concept graph */
+	protected ConceptDao conceptDao;
+
+	/** used by storeChildConcepts to get concept frequencies */
+	private InfoContentEvaluator infoContentEvaluator;
+
+	/** created by setDataSource */
+	protected JdbcTemplate jdbcTemplate;
+
+	/** used to load the instances for the label query */
+	protected KernelUtil kernelUtil;
+	/** created by setDataSource; runs the class feature query */
+	protected NamedParameterJdbcTemplate namedParamJdbcTemplate;
+
+	protected PlatformTransactionManager transactionManager;
+
+	/** base properties; evaluateCorpus(String) overlays the property file on them */
+	private Properties ytexProperties = null;
+
+	/**
+	 * Add the concept id of cr and, recursively, the concept ids of all of its
+	 * descendants to childConcepts.
+	 *
+	 * @param childConcepts
+	 *            set that receives the concept ids
+	 * @param cr
+	 *            root of the subtree to add
+	 */
+	private void addSubtree(Set<String> childConcepts, ConcRel cr) {
+		String conceptId = cr.getConceptID();
+		childConcepts.add(conceptId);
+		for (ConcRel child : cr.getChildren()) {
+			this.addSubtree(childConcepts, child);
+		}
+	}
+
+	private JointDistribution calcMergedJointDistribution(
+			Map<String, JointDistribution> conceptJointDistroMap,
+			Map<String, Integer> conceptDistMap, ConcRel cr,
+			Map<String, JointDistribution> rawJointDistroMap,
+			Map<String, Set<Long>> yMargin, String xMerge, double minInfo,
+			List<String> path) {
+		if (conceptJointDistroMap.containsKey(cr.getConceptID())) {
+			return conceptJointDistroMap.get(cr.getConceptID());
+		} else {
+			List<JointDistribution> distroList = new ArrayList<JointDistribution>(
+					cr.getChildren().size() + 1);
+			int distance = -1;
+			// if this concept is in the raw joint distro map, add it to the
+			// list of joint distributions to merge
+			if (rawJointDistroMap.containsKey(cr.getConceptID())) {
+				JointDistribution rawJointDistro = rawJointDistroMap.get(cr
+						.getConceptID());
+				distroList.add(rawJointDistro);
+				distance = 0;
+			}
+			// get the joint distributions of children
+			for (ConcRel crc : cr.getChildren()) {
+				List<String> pathChild = new ArrayList<String>(path.size() + 1);
+				pathChild.addAll(path);
+				pathChild.add(crc.getConceptID());
+				// recurse - get joint distribution of children
+				JointDistribution jdChild = calcMergedJointDistribution(
+						conceptJointDistroMap, conceptDistMap, crc,
+						rawJointDistroMap, yMargin, xMerge, minInfo, pathChild);
+				if (jdChild != null) {
+					distroList.add(jdChild);
+					if (distance != 0) {
+						// look at children's distance from raw data, add 1
+						int distChild = conceptDistMap.get(crc.getConceptID());
+						if (distance == -1 || (distChild + 1) < distance) {
+							distance = distChild + 1;
+						}
+					}
+				}
+			}
+			// merge the joint distributions
+			JointDistribution mergedDistro;
+			if (distroList.size() > 0) {
+				if (distroList.size() == 1) {
+					// only one joint distro - trivial merge
+					mergedDistro = distroList.get(0);
+				} else {
+					// multiple joint distros - merge them into a new one
+					mergedDistro = JointDistribution.merge(distroList, yMargin,
+							xMerge);
+				}
+				// if (log.isDebugEnabled()) {
+				// log.debug("path = " + path + ", distroList = " + distroList
+				// + ", distro = " + mergedDistro);
+				// }
+			} else {
+				// no joint distros to merge - null
+				mergedDistro = null;
+			}
+			// save this in the map
+			conceptJointDistroMap.put(cr.getConceptID(), mergedDistro);
+			if (distance > -1)
+				conceptDistMap.put(cr.getConceptID(), distance);
+			return mergedDistro;
+		}
+	}
+
+	/**
+	 * 
+	 */
+	private double calculateFoldEntropy(Map<String, Set<Long>> classCountMap) {
+		int total = 0;
+		List<Double> classProbs = new ArrayList<Double>(classCountMap.size());
+		// calculate total number of instances in this fold
+		for (Set<Long> instances : classCountMap.values()) {
+			total += instances.size();
+		}
+		// calculate per-class probability in this fold
+		for (Set<Long> instances : classCountMap.values()) {
+			classProbs.add((double) instances.size() / (double) total);
+		}
+		return entropy(classProbs);
+	}
+
+	/**
+	 * Create the joint distribution table of every concept wrt a fold.
+	 *
+	 * @param conceptInstanceMap
+	 *            concept id - (x value - instances)
+	 * @param yMargin
+	 *            y value - instances of the fold
+	 * @param xVals
+	 *            the possible x values
+	 * @param yVals
+	 *            the possible y values
+	 * @param xLeftover
+	 *            the x value that receives instances found under no x value
+	 * @return concept id - joint distribution
+	 */
+	private Map<String, JointDistribution> completeJointDistroForFold(
+			Map<String, Map<String, Set<Long>>> conceptInstanceMap,
+			Map<String, Set<Long>> yMargin, Set<String> xVals,
+			Set<String> yVals, String xLeftover) {
+		Map<String, JointDistribution> foldJointDistroMap = new HashMap<String, JointDistribution>(
+				conceptInstanceMap.size());
+		for (Map.Entry<String, Map<String, Set<Long>>> entry : conceptInstanceMap
+				.entrySet()) {
+			Map<String, Set<Long>> xMargin = entry.getValue();
+			JointDistribution distro = new JointDistribution(xVals, yVals,
+					xMargin, yMargin, xLeftover);
+			foldJointDistroMap.put(entry.getKey(), distro);
+		}
+		return foldJointDistroMap;
+	}
+
+	/**
+	 * Delete the feature evaluations before we insert them. Four evaluation
+	 * types are deleted: the measure name itself and the measure name with
+	 * each of the suffixes SUFFIX_PROP, SUFFIX_IMPUTED and
+	 * SUFFIX_IMPUTED_FILTERED.
+	 *
+	 * @param params
+	 *            supplies corpus, concept set, concept graph and measure
+	 * @param label
+	 *            the label
+	 * @param foldId
+	 *            the fold id
+	 */
+	private void deleteFeatureEval(Parameters params, String label, int foldId) {
+		String[] evaluationTypes = new String[] {
+				params.getMeasure().getName(),
+				params.getMeasure().getName() + SUFFIX_PROP,
+				params.getMeasure().getName() + SUFFIX_IMPUTED,
+				params.getMeasure().getName() + SUFFIX_IMPUTED_FILTERED };
+		for (int k = 0; k < evaluationTypes.length; k++) {
+			this.classifierEvaluationDao.deleteFeatureEvaluation(
+					params.getCorpusName(), params.getConceptSetName(), label,
+					evaluationTypes[k], foldId, 0d,
+					params.getConceptGraphName());
+		}
+	}
+
+	/*
+	 * (non-Javadoc)
+	 * 
+	 * @see org.apache.ctakes.ytex.kernel.CorpusLabelEvaluator#evaluateCorpus(java.lang.String,
+	 * java.lang.String, java.lang.String, java.lang.String, java.lang.String,
+	 * java.lang.Double, java.util.Set, java.lang.String, java.lang.String)
+	 */
+	@Override
+	public boolean evaluateCorpus(final Parameters params) {
+		if (!(params.getCorpusName() != null
+				&& params.getConceptGraphName() != null
+				&& params.getLabelQuery() != null && params
+				.getClassFeatureQuery() != null))
+			return false;
+		ConceptGraph cg = conceptDao.getConceptGraph(params
+				.getConceptGraphName());
+		InstanceData instanceData = kernelUtil.loadInstances(params
+				.getLabelQuery());
+		for (String label : instanceData.getLabelToInstanceMap().keySet()) {
+			evaluateCorpusLabel(params, cg, instanceData, label);
+		}
+		return true;
+	}
+
+	/**
+	 * Evaluate the corpus using the settings of a property file. The
+	 * ytexProperties of this object (if set) are used as base; entries of the
+	 * property file override them.
+	 *
+	 * @param propFile
+	 *            property file to load
+	 * @return result of evaluateCorpus(Parameters)
+	 * @throws IOException
+	 *             declared for loading the property file
+	 */
+	@Override
+	public boolean evaluateCorpus(String propFile) throws IOException {
+		Properties props = new Properties();
+		// put org.apache.ctakes.ytex properties in props. ytexProperties is
+		// null until setYtexProperties is called, and putAll(null) throws.
+		Properties baseProps = this.getYtexProperties();
+		if (baseProps != null) {
+			props.putAll(baseProps);
+		}
+		// override org.apache.ctakes.ytex properties with propfile
+		props.putAll(FileUtil.loadProperties(propFile, true));
+		return this.evaluateCorpus(new Parameters(props));
+	}
+
+	private void evaluateCorpusFold(Parameters params,
+			Map<String, Set<Long>> yMargin, ConceptGraph cg,
+			InstanceData instanceData, String label,
+			Map<String, Map<String, Set<Long>>> conceptInstanceMap, int foldId) {
+		if (log.isInfoEnabled())
+			log.info("evaluateCorpusFold() label = " + label + ", fold = "
+					+ foldId);
+		deleteFeatureEval(params, label, foldId);
+
+		// get the entropy of Y for this fold
+		double yEntropy = this.calculateFoldEntropy(yMargin);
+		// get the joint distribution of concepts and instances
+		Map<String, JointDistribution> rawJointDistro = this
+				.completeJointDistroForFold(conceptInstanceMap, yMargin, params
+						.getxVals(),
+						instanceData.getLabelToClassMap().get(label), params
+								.getxLeftover());
+		List<FeatureRank> listRawRanks = new ArrayList<FeatureRank>(
+				rawJointDistro.size());
+		FeatureEvaluation feRaw = saveFeatureEvaluation(rawJointDistro, params,
+				label, foldId, yEntropy, "", listRawRanks);
+		// propagate across graph and save
+		propagateJointDistribution(rawJointDistro, params, label, foldId, cg,
+				yMargin);
+		// store children of top concepts
+		storeChildConcepts(listRawRanks, params, label, foldId, cg, true);
+		storeChildConcepts(listRawRanks, params, label, foldId, cg, false);
+	}
+
+	/**
+	 * Evaluate the corpus on one label: load the concept - instance map of the
+	 * label once, then evaluate every fold of every run of the label.
+	 *
+	 * @param params
+	 *            evaluation settings
+	 * @param cg
+	 *            concept graph
+	 * @param instanceData
+	 *            supplies the runs and folds of the label
+	 * @param label
+	 *            the label
+	 */
+	private void evaluateCorpusLabel(Parameters params, ConceptGraph cg,
+			InstanceData instanceData, String label) {
+		if (log.isInfoEnabled()) {
+			log.info("evaluateCorpusLabel() label = " + label);
+		}
+		String featureQuery = params.getClassFeatureQuery();
+		Map<String, Map<String, Set<Long>>> conceptInstanceMap = loadConceptInstanceMap(
+				featureQuery, cg, label);
+		for (int runNo : instanceData.getLabelToInstanceMap().get(label)
+				.keySet()) {
+			for (int foldNo : instanceData.getLabelToInstanceMap().get(label)
+					.get(runNo).keySet()) {
+				int foldId = this.getFoldId(params, label, runNo, foldNo);
+				// map of class - [instance ids] for the fold's training set
+				Map<String, Set<Long>> yMargin = getFoldYMargin(instanceData,
+						label, runNo, foldNo);
+				evaluateCorpusFold(params, yMargin, cg, instanceData, label,
+						conceptInstanceMap, foldId);
+			}
+		}
+	}
+
+	/**
+	 * @return the classifier evaluation dao
+	 */
+	public ClassifierEvaluationDao getClassifierEvaluationDao() {
+		return this.classifierEvaluationDao;
+	}
+
+	/**
+	 * @return the concept dao
+	 */
+	public ConceptDao getConceptDao() {
+		return this.conceptDao;
+	}
+
+	/**
+	 * Get the data source of the JdbcTemplate created by setDataSource.
+	 *
+	 * @param ds
+	 *            not used by this method
+	 * @return the data source of jdbcTemplate
+	 */
+	public DataSource getDataSource(DataSource ds) {
+		return this.jdbcTemplate.getDataSource();
+	}
+
+	/**
+	 * Figure out the id of a cross validation fold.
+	 *
+	 * @param params
+	 *            supplies corpus name and split name
+	 * @param label
+	 *            the label
+	 * @param run
+	 *            the run
+	 * @param fold
+	 *            the fold
+	 * @return 0 unless both run and fold are greater than 0 and the fold is
+	 *         found by the dao; else the id of the fold found
+	 */
+	private int getFoldId(Parameters params, String label, int run, int fold) {
+		if (run <= 0 || fold <= 0) {
+			return 0;
+		}
+		CrossValidationFold cvFold = this.classifierEvaluationDao
+				.getCrossValidationFold(params.getCorpusName(),
+						params.getSplitName(), label, run, fold);
+		if (cvFold == null) {
+			log.warn("could not find cv fold, name=" + params.getCorpusName()
+					+ ", run=" + run + ", fold=" + fold);
+			return 0;
+		}
+		int foldId = cvFold.getCrossValidationFoldId();
+		return foldId;
+	}
+
+	/**
+	 * Invert the instance - class map stored under key true for the given
+	 * label, run and fold into a map of class - instance ids.
+	 *
+	 * @param instanceData
+	 *            the instance data
+	 * @param label
+	 *            the label
+	 * @param run
+	 *            the run
+	 * @param fold
+	 *            the fold
+	 * @return class - set of instance ids
+	 */
+	private Map<String, Set<Long>> getFoldYMargin(InstanceData instanceData,
+			String label, int run, int fold) {
+		Map<Long, String> instanceClassMap = instanceData
+				.getLabelToInstanceMap().get(label).get(run).get(fold)
+				.get(true);
+		Map<String, Set<Long>> yMargin = new HashMap<String, Set<Long>>();
+		for (Map.Entry<Long, String> entry : instanceClassMap.entrySet()) {
+			String className = entry.getValue();
+			Set<Long> members = yMargin.get(className);
+			if (members == null) {
+				// first instance of this class
+				members = new HashSet<Long>();
+				yMargin.put(className, members);
+			}
+			members.add(entry.getKey());
+		}
+		return yMargin;
+	}
+
+	/**
+	 * @return the info content evaluator
+	 */
+	public InfoContentEvaluator getInfoContentEvaluator() {
+		return this.infoContentEvaluator;
+	}
+
+	/**
+	 * @return the kernel util
+	 */
+	public KernelUtil getKernelUtil() {
+		return this.kernelUtil;
+	}
+
+	/**
+	 * @return the base properties; null if they have not been set
+	 */
+	public Properties getYtexProperties() {
+		return this.ytexProperties;
+	}
+
+	/**
+	 * Create a feature evaluation header. Corpus name, concept set name
+	 * (stored as feature set name) and concept graph name (stored as param2)
+	 * are taken from params.
+	 *
+	 * @param params
+	 *            evaluation settings
+	 * @param label
+	 *            the label
+	 * @param foldId
+	 *            the fold id
+	 * @param type
+	 *            the evaluation type
+	 * @return the new feature evaluation
+	 */
+	private FeatureEvaluation initFeatureEval(Parameters params, String label,
+			int foldId, String type) {
+		FeatureEvaluation evaluation = new FeatureEvaluation();
+		// what is evaluated
+		evaluation.setCorpusName(params.getCorpusName());
+		evaluation.setFeatureSetName(params.getConceptSetName());
+		evaluation.setLabel(label);
+		evaluation.setCrossValidationFoldId(foldId);
+		// how it is evaluated
+		evaluation.setEvaluationType(type);
+		evaluation.setParam2(params.getConceptGraphName());
+		return evaluation;
+	}
+
+	/**
+	 * Load the map of concept - instances by running the class feature query.
+	 * The rows are handled by a ConceptInstanceMapExtractor constructed with
+	 * the result map and the concept graph.
+	 *
+	 * @param classFeatureQuery
+	 *            the query to run
+	 * @param cg
+	 *            concept graph handed to the extractor
+	 * @param label
+	 *            passed as named parameter "label" unless null or empty
+	 * @return the map handed to the extractor
+	 */
+	private Map<String, Map<String, Set<Long>>> loadConceptInstanceMap(
+			String classFeatureQuery, ConceptGraph cg, String label) {
+		Map<String, Map<String, Set<Long>>> conceptInstanceMap = new HashMap<String, Map<String, Set<Long>>>();
+		Map<String, Object> queryArgs = new HashMap<String, Object>(1);
+		boolean hasLabel = label != null && label.length() > 0;
+		if (hasLabel) {
+			queryArgs.put("label", label);
+		}
+		ConceptInstanceMapExtractor extractor = new ConceptInstanceMapExtractor(
+				conceptInstanceMap, cg);
+		this.namedParamJdbcTemplate.query(classFeatureQuery, queryArgs,
+				extractor);
+		return conceptInstanceMap;
+	}
+
+	/**
+	 * 'complete' the joint distribution tables wrt a fold (yMargin). propagate
+	 * the joint distribution of all concepts recursively.
+	 * 
+	 * @param rawJointDistroMap
+	 * @param labelEval
+	 * @param cg
+	 * @param yMargin
+	 * @param xMerge
+	 * @param minInfo
+	 */
+	private FeatureEvaluation propagateJointDistribution(
+			Map<String, JointDistribution> rawJointDistroMap,
+			Parameters params, String label, int foldId, ConceptGraph cg,
+			Map<String, Set<Long>> yMargin) {
+		// get the entropy of Y for this fold
+		double yEntropy = this.calculateFoldEntropy(yMargin);
+		// allocate a map to hold the results of the propagation across the
+		// concept graph
+		Map<String, JointDistribution> conceptJointDistroMap = new HashMap<String, JointDistribution>(
+				cg.getConceptMap().size());
+		Map<String, Integer> conceptDistMap = new HashMap<String, Integer>();
+		// recurse
+		calcMergedJointDistribution(conceptJointDistroMap, conceptDistMap, cg
+				.getConceptMap().get(cg.getRoot()), rawJointDistroMap, yMargin,
+				params.getxMerge(), params.getMinInfo(),
+				Arrays.asList(new String[] { cg.getRoot() }));
+		List<FeatureRank> listPropRanks = new ArrayList<FeatureRank>(
+				conceptJointDistroMap.size());
+		return this.saveFeatureEvaluation(conceptJointDistroMap, params, label,
+				foldId, yEntropy, SUFFIX_PROP, listPropRanks);
+	}
+
+	/**
+	 * Evaluate every non-null joint distribution with the given measure
+	 * (mutual information for MUTUALINFO, information gain otherwise), add a
+	 * FeatureRank to featureRankList for every evaluation greater than 1e-3,
+	 * and sort the list with FeatureRank.FeatureRankDesc.
+	 *
+	 * @param measureType
+	 *            the measure to use
+	 * @param fe
+	 *            the feature evaluation the ranks belong to
+	 * @param rawJointDistro
+	 *            concept id - joint distribution
+	 * @param yEntropy
+	 *            entropy of Y, used for mutual information
+	 * @param featureRankList
+	 *            list that receives the ranks
+	 * @return result of FeatureRank.sortFeatureRankList
+	 */
+	private List<FeatureRank> rank(MeasureType measureType,
+			FeatureEvaluation fe,
+			Map<String, JointDistribution> rawJointDistro, double yEntropy,
+			List<FeatureRank> featureRankList) {
+		for (Map.Entry<String, JointDistribution> entry : rawJointDistro
+				.entrySet()) {
+			JointDistribution distro = entry.getValue();
+			if (distro == null) {
+				continue;
+			}
+			double score = MeasureType.MUTUALINFO.equals(measureType) ? distro
+					.getMutualInformation(yEntropy) : distro.getInfoGain();
+			if (score > 1e-3) {
+				featureRankList.add(new FeatureRank(fe, entry.getKey(), score));
+			}
+		}
+		return FeatureRank.sortFeatureRankList(featureRankList,
+				new FeatureRank.FeatureRankDesc());
+	}
+
+	/**
+	 * Rank the joint distributions and save the ranks under a new feature
+	 * evaluation whose type is the measure name plus suffix.
+	 *
+	 * @param rawJointDistro
+	 *            concept id - joint distribution
+	 * @param params
+	 *            evaluation settings
+	 * @param label
+	 *            the label
+	 * @param foldId
+	 *            the fold id
+	 * @param yEntropy
+	 *            entropy of Y
+	 * @param suffix
+	 *            appended to the measure name
+	 * @param listRawRanks
+	 *            list that receives the ranks
+	 * @return the feature evaluation that was saved
+	 */
+	private FeatureEvaluation saveFeatureEvaluation(
+			Map<String, JointDistribution> rawJointDistro, Parameters params,
+			String label, int foldId, double yEntropy, String suffix,
+			List<FeatureRank> listRawRanks) {
+		String evaluationType = params.getMeasure().getName() + suffix;
+		FeatureEvaluation fe = initFeatureEval(params, label, foldId,
+				evaluationType);
+		List<FeatureRank> ranked = rank(params.getMeasure(), fe,
+				rawJointDistro, yEntropy, listRawRanks);
+		this.classifierEvaluationDao.saveFeatureEvaluation(fe, ranked);
+		return fe;
+	}
+
+	/**
+	 * @param classifierEvaluationDao
+	 *            the classifier evaluation dao to use
+	 */
+	public void setClassifierEvaluationDao(
+			ClassifierEvaluationDao classifierEvaluationDao) {
+		this.classifierEvaluationDao = classifierEvaluationDao;
+	}
+
+	/**
+	 * @param conceptDao
+	 *            the concept dao to use
+	 */
+	public void setConceptDao(ConceptDao conceptDao) {
+		this.conceptDao = conceptDao;
+	}
+
+	public void setDataSource(DataSource ds) {
+		this.jdbcTemplate = new JdbcTemplate(ds);
+		this.namedParamJdbcTemplate = new NamedParameterJdbcTemplate(ds);
+	}
+
+	// private CorpusLabelEvaluation initCorpusLabelEval(CorpusEvaluation eval,
+	// String label, String splitName, int run, int fold) {
+	// Integer foldId = getFoldId(eval, label, splitName, run, fold);
+	// // see if the labelEval is already there
+	// CorpusLabelEvaluation labelEval = corpusDao.getCorpusLabelEvaluation(
+	// eval.getCorpusName(), eval.getConceptGraphName(),
+	// eval.getConceptSetName(), label, foldId);
+	// if (labelEval == null) {
+	// // not there - add it
+	// labelEval = new CorpusLabelEvaluation();
+	// labelEval.setCorpus(eval);
+	// labelEval.setFoldId(foldId);
+	// labelEval.setLabel(label);
+	// corpusDao.addCorpusLabelEval(labelEval);
+	// }
+	// return labelEval;
+	// }
+
+	/**
+	 * @param infoContentEvaluator
+	 *            the info content evaluator to use
+	 */
+	public void setInfoContentEvaluator(
+			InfoContentEvaluator infoContentEvaluator) {
+		this.infoContentEvaluator = infoContentEvaluator;
+	}
+
+	// /**
+	// * create the corpusEvaluation if it doesn't exist
+	// *
+	// * @param corpusName
+	// * @param conceptGraphName
+	// * @param conceptSetName
+	// * @return
+	// */
+	// private CorpusEvaluation initEval(String corpusName,
+	// String conceptGraphName, String conceptSetName) {
+	// CorpusEvaluation eval = this.corpusDao.getCorpus(corpusName,
+	// conceptGraphName, conceptSetName);
+	// if (eval == null) {
+	// eval = new CorpusEvaluation();
+	// eval.setConceptGraphName(conceptGraphName);
+	// eval.setConceptSetName(conceptSetName);
+	// eval.setCorpusName(corpusName);
+	// this.corpusDao.addCorpus(eval);
+	// }
+	// return eval;
+	// }
+
+	/**
+	 * @param kernelUtil
+	 *            the kernel util to use
+	 */
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	//
+	// private void saveLabelStatistic(String conceptID,
+	// JointDistribution distroMerged, JointDistribution distroRaw,
+	// CorpusLabelEvaluation labelEval, double yEntropy, double minInfo,
+	// int distance) {
+	// double miMerged = distroMerged.getMutualInformation(yEntropy);
+	// double miRaw = distroRaw != null ? distroRaw
+	// .getMutualInformation(yEntropy) : 0;
+	// if (miMerged > minInfo || miRaw > minInfo) {
+	// ConceptLabelStatistic stat = new ConceptLabelStatistic();
+	// stat.setCorpusLabel(labelEval);
+	// stat.setMutualInfo(miMerged);
+	// if (distroRaw != null)
+	// stat.setMutualInfoRaw(miRaw);
+	// stat.setConceptId(conceptID);
+	// stat.setDistance(distance);
+	// this.corpusDao.addLabelStatistic(stat);
+	// }
+	// }
+
+	/**
+	 * @param ytexProperties
+	 *            base properties; evaluateCorpus(String) copies them before
+	 *            applying the property file
+	 */
+	public void setYtexProperties(Properties ytexProperties) {
+		this.ytexProperties = ytexProperties;
+	}
+
+	/**
+	 * save the children of the 'top' parent concepts.
+	 * 
+	 * @param labelEval
+	 * @param parentConceptTopThreshold
+	 * @param parentConceptEvalThreshold
+	 * @param cg
+	 * @param bAll
+	 *            impute to all concepts/concepts actually in corpus. if we are
+	 *            imputing to all concepts, filter by infocontent (this includes
+	 *            hypernyms of concepts in the corpus). else only impute to
+	 *            concrete concepts in the corpus
+	 */
+	public void storeChildConcepts(List<FeatureRank> listRawRanks,
+			Parameters params, String label, int foldId, ConceptGraph cg,
+			boolean bAll) {
+		// only include concepts that actually occur in the corpus
+		Map<String, Double> conceptICMap = bAll ? classifierEvaluationDao
+				.getInfoContent(params.getCorpusName(),
+						params.getConceptGraphName(),
+						params.getConceptSetName()) : this.infoContentEvaluator
+				.getFrequencies(params.getFreqQuery());
+		// get the raw feature evaluations. The imputed feature evaluation is a
+		// mixture of the parent feature eval and the raw feature eval.
+		Map<String, Double> conceptRawEvalMap = new HashMap<String, Double>(
+				listRawRanks.size());
+		for (FeatureRank r : listRawRanks) {
+			conceptRawEvalMap.put(r.getFeatureName(), r.getEvaluation());
+		}
+		// this map will get filled with the links between parent and child
+		// concepts for imputation
+		Map<FeatureRank, Set<FeatureRank>> childParentMap = bAll ? null
+				: new HashMap<FeatureRank, Set<FeatureRank>>();
+		// .getFeatureRankEvaluations(params.getCorpusName(),
+		// params.getConceptSetName(), null,
+		// InfoContentEvaluator.INFOCONTENT, 0,
+		// params.getConceptGraphName());
+		// get the top parent concepts - use either top N, or those with a
+		// cutoff greater than the specified threshold
+		// List<ConceptLabelStatistic> listConceptStat =
+		// parentConceptTopThreshold != null ? this.corpusDao
+		// .getTopCorpusLabelStat(labelEval, parentConceptTopThreshold)
+		// : this.corpusDao.getThresholdCorpusLabelStat(labelEval,
+		// parentConceptMutualInfoThreshold);
+		String propagatedType = params.getMeasure().getName() + SUFFIX_PROP;
+		List<FeatureRank> listConceptStat = params
+				.getParentConceptTopThreshold() != null ? this.classifierEvaluationDao
+				.getTopFeatures(params.getCorpusName(),
+						params.getConceptSetName(), label, propagatedType,
+						foldId, 0, params.getConceptGraphName(),
+						params.getParentConceptTopThreshold())
+				: this.classifierEvaluationDao.getThresholdFeatures(
+						params.getCorpusName(), params.getConceptSetName(),
+						label, propagatedType, foldId, 0,
+						params.getConceptGraphName(),
+						params.getParentConceptEvalThreshold());
+		FeatureEvaluation fe = this.initFeatureEval(params, label, foldId,
+				params.getMeasure().getName()
+						+ (bAll ? SUFFIX_IMPUTED : SUFFIX_IMPUTED_FILTERED));
+		// map of concept id to children and the 'best' statistic
+		Map<String, FeatureRank> mapChildConcept = new HashMap<String, FeatureRank>();
+		// get all the children of the parent concepts
+		for (FeatureRank parentConcept : listConceptStat) {
+			updateChildren(parentConcept, mapChildConcept, fe, cg,
+					conceptICMap, conceptRawEvalMap, childParentMap,
+					params.getImputeWeight(), params.getMinInfo());
+		}
+		// save the imputed feature ranks
+		List<FeatureRank> features = new ArrayList<FeatureRank>(
+				mapChildConcept.values());
+		FeatureRank.sortFeatureRankList(features,
+				new FeatureRank.FeatureRankDesc());
+		this.classifierEvaluationDao.saveFeatureEvaluation(fe, features);
+		if (!bAll) {
+			// save the parent-child links
+			for (Map.Entry<FeatureRank, Set<FeatureRank>> childParentEntry : childParentMap
+					.entrySet()) {
+				FeatureRank child = childParentEntry.getKey();
+				for (FeatureRank parent : childParentEntry.getValue()) {
+					FeatureParentChild parchd = new FeatureParentChild();
+					parchd.setFeatureRankParent(parent);
+					parchd.setFeatureRankChild(child);
+					this.classifierEvaluationDao.saveFeatureParentChild(parchd);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Give the children of parentConcept an imputed evaluation and record them
+	 * in mapChildConcept.
+	 * <p>
+	 * The imputed evaluation is
+	 * <code>imputeWeight * parent evaluation + (1 - imputeWeight) * raw child
+	 * evaluation</code>. A child keeps the highest imputed evaluation seen
+	 * across all parents processed so far.
+	 *
+	 * @param parentConcept
+	 *            parent feature rank whose evaluation is imputed
+	 * @param mapChildConcept
+	 *            concept id to child feature rank; missing children are
+	 *            created here with an initial evaluation of 0
+	 * @param fe
+	 *            feature evaluation assigned to newly created child ranks
+	 * @param cg
+	 *            concept graph in which the parent concept is looked up
+	 * @param conceptICMap
+	 *            only children that are keys of this map are considered
+	 * @param conceptRawEvalMap
+	 *            raw evaluation per concept id
+	 * @param childParentMap
+	 *            child to parents links are added here; may be null, in which
+	 *            case no links are recorded
+	 * @param imputeWeight
+	 *            weight of the parent evaluation in the mixture
+	 * @param minInfo
+	 *            raw evaluation assumed for children missing from
+	 *            conceptRawEvalMap
+	 */
+	private void updateChildren(FeatureRank parentConcept,
+			Map<String, FeatureRank> mapChildConcept, FeatureEvaluation fe,
+			ConceptGraph cg, Map<String, Double> conceptICMap,
+			Map<String, Double> conceptRawEvalMap,
+			Map<FeatureRank, Set<FeatureRank>> childParentMap,
+			double imputeWeight, double minInfo) {
+		// addSubtree is defined elsewhere in this class; presumably it
+		// collects the concept ids of the subtree below the parent node
+		ConcRel parentNode = cg.getConceptMap().get(
+				parentConcept.getFeatureName());
+		Set<String> subtreeIds = new HashSet<String>();
+		addSubtree(subtreeIds, parentNode);
+		for (String conceptId : subtreeIds) {
+			// skip concepts that are not in the info content / frequency map
+			if (!conceptICMap.containsKey(conceptId)) {
+				continue;
+			}
+			// look up the child rank, creating it on first sight
+			FeatureRank childRank = mapChildConcept.get(conceptId);
+			if (childRank == null) {
+				childRank = new FeatureRank(fe, conceptId, 0d);
+				mapChildConcept.put(conceptId, childRank);
+			}
+			// raw evaluation of the child, falling back to minInfo
+			double rawEvaluation;
+			if (conceptRawEvalMap.containsKey(conceptId)) {
+				rawEvaluation = conceptRawEvalMap.get(conceptId);
+			} else {
+				rawEvaluation = minInfo;
+			}
+			// mixture of parent and raw evaluation; keep the best one
+			double parentShare = imputeWeight * parentConcept.getEvaluation();
+			double rawShare = (1 - imputeWeight) * rawEvaluation;
+			double imputedEvaluation = parentShare + rawShare;
+			if (chdIsWorse(childRank, imputedEvaluation)) {
+				childRank.setEvaluation(imputedEvaluation);
+			}
+			// record the child -> parent link when links are being collected
+			// NOTE(review): childRank is a HashMap key and is mutated via
+			// setEvaluation - confirm FeatureRank.hashCode/equals do not
+			// depend on the evaluation
+			if (childParentMap == null) {
+				continue;
+			}
+			Set<FeatureRank> linkedParents = childParentMap.get(childRank);
+			if (linkedParents == null) {
+				linkedParents = new HashSet<FeatureRank>(10);
+				childParentMap.put(childRank, linkedParents);
+			}
+			linkedParents.add(parentConcept);
+		}
+	}
+
+	/**
+	 * @return true if the child's current evaluation is lower than the
+	 *         candidate imputed evaluation
+	 */
+	private static boolean chdIsWorse(FeatureRank childRank,
+			double imputedEvaluation) {
+		return childRank.getEvaluation() < imputedEvaluation;
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluator.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluator.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluator.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluator.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,17 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.util.Map;
+
+/**
+ * Evaluates the information content of concepts in a corpus.
+ */
+public interface InfoContentEvaluator {
+
+	/** Name of the information content evaluation. */
+	String INFOCONTENT = "infocontent";
+
+	/**
+	 * Calculate information content for all concepts.
+	 *
+	 * @param freqQuery
+	 *            query used to obtain the concept frequencies
+	 * @param corpusName
+	 *            name of the corpus
+	 * @param conceptGraphName
+	 *            name of the concept graph
+	 * @param conceptSetName
+	 *            name of the concept set
+	 */
+	void evaluateCorpusInfoContent(String freqQuery, String corpusName,
+			String conceptGraphName, String conceptSetName);
+
+	/**
+	 * Get the frequencies produced by the given query.
+	 *
+	 * @param freqQuery
+	 *            query used to obtain the frequencies
+	 * @return map of string key to frequency
+	 */
+	Map<String, Double> getFrequencies(String freqQuery);
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluatorImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluatorImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluatorImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InfoContentEvaluatorImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,253 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
+import org.apache.ctakes.ytex.kernel.dao.ConceptDao;
+import org.apache.ctakes.ytex.kernel.model.ConcRel;
+import org.apache.ctakes.ytex.kernel.model.ConceptGraph;
+import org.apache.ctakes.ytex.kernel.model.FeatureEvaluation;
+import org.apache.ctakes.ytex.kernel.model.FeatureRank;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.RowCallbackHandler;
+
+
+/**
+ * Calculate the information content of each concept in a corpus wrt the
+ * specified concept graph. Properties:
+ * <ul>
+ * <li>org.apache.ctakes.ytex.conceptGraphName - required - name of conceptGraph. @see ConceptDao
+ * <li>org.apache.ctakes.ytex.corpusName - required - name of corpus
+ * <li>org.apache.ctakes.ytex.conceptSetName - optional - you may want to experiment with
+ * different sets of concepts from a corpus, e.g. concepts from certain
+ * sections, or different ways of counting concepts.
+ * <li>org.apache.ctakes.ytex.freqQuery - required - query to obtain raw concept frequencies for the corpus
+ * </ul>
+ * To execute, supply the path to a properties file with the required
+ * <code>-prop</code> option; its entries are merged into the
+ * <code>ytexProperties</code> bean before the required properties are checked.
+ * The original notes state that system properties (-D options) override the
+ * properties file; that handling is not implemented in this class itself.
+ * <p>
+ * The information content of each concept is stored in the feature_rank table.
+ * The related record in the feature_eval table has
+ * <ul>
+ * <li>type = infocontent
+ * <li>feature_set_name = conceptSetName
+ * <li>param2 = conceptGraphName
+ * </ul>
+ *
+ * @author vijay
+ *
+ */
+public class InfoContentEvaluatorImpl implements InfoContentEvaluator {
+	private static final String PROP_CONCEPT_GRAPH_NAME = "org.apache.ctakes.ytex.conceptGraphName";
+	private static final String PROP_CORPUS_NAME = "org.apache.ctakes.ytex.corpusName";
+	private static final String PROP_CONCEPT_SET_NAME = "org.apache.ctakes.ytex.conceptSetName";
+	private static final String PROP_FREQ_QUERY = "org.apache.ctakes.ytex.freqQuery";
+	/** properties that must be present before the evaluation is started */
+	private static final String[] REQUIRED_PROPERTIES = {
+			PROP_CONCEPT_GRAPH_NAME, PROP_CORPUS_NAME, PROP_FREQ_QUERY };
+
+	/**
+	 * Command line entry point. Exits with status 0 after a successful
+	 * evaluation and with status 1 if the command line cannot be parsed or a
+	 * required property is missing.
+	 *
+	 * @param args
+	 *            command line; <code>-prop &lt;property file&gt;</code> is
+	 *            required
+	 * @throws IOException
+	 */
+	@SuppressWarnings("static-access")
+	public static void main(String[] args) throws IOException {
+		Options options = new Options();
+		options.addOption(OptionBuilder
+				.withArgName("property file")
+				.hasArg()
+				.isRequired()
+				.withDescription(
+						"property file with queries and other parameters. todo desc")
+				.create("prop"));
+		try {
+			CommandLineParser parser = new GnuParser();
+			CommandLine line = parser.parse(options, args);
+			Properties props = (Properties) KernelContextHolder
+					.getApplicationContext().getBean("ytexProperties");
+			Properties propsArgs = FileUtil.loadProperties(
+					line.getOptionValue("prop"), true);
+			props.putAll(propsArgs);
+			// collect the missing required properties so they can be reported
+			List<String> missing = new ArrayList<String>();
+			for (String key : REQUIRED_PROPERTIES) {
+				if (!props.containsKey(key)) {
+					missing.add(key);
+				}
+			}
+			if (!missing.isEmpty()) {
+				System.err.println("error: required parameter not specified");
+				System.err.println("missing: " + missing);
+				System.exit(1);
+			} else {
+				InfoContentEvaluator corpusEvaluator = KernelContextHolder
+						.getApplicationContext().getBean(
+								InfoContentEvaluator.class);
+				corpusEvaluator.evaluateCorpusInfoContent(
+						props.getProperty(PROP_FREQ_QUERY),
+						props.getProperty(PROP_CORPUS_NAME),
+						props.getProperty(PROP_CONCEPT_GRAPH_NAME),
+						props.getProperty(PROP_CONCEPT_SET_NAME));
+				System.exit(0);
+			}
+		} catch (ParseException pe) {
+			printHelp(options);
+			System.exit(1);
+		}
+	}
+
+	/**
+	 * Print the command line usage.
+	 *
+	 * @param options
+	 *            options to describe
+	 */
+	private static void printHelp(Options options) {
+		HelpFormatter formatter = new HelpFormatter();
+		formatter.printHelp("java " + InfoContentEvaluatorImpl.class.getName()
+				+ " calculate information content of corpus wrt concept graph",
+				options);
+	}
+
+	private ClassifierEvaluationDao classifierEvaluationDao;
+	private ConceptDao conceptDao;
+	private JdbcTemplate jdbcTemplate;
+
+	/**
+	 * Compute the information content of every concept reachable from the
+	 * root of the concept graph and save it as a feature evaluation of type
+	 * {@link InfoContentEvaluator#INFOCONTENT}.
+	 * <p>
+	 * The frequency of a concept is its raw frequency plus the frequencies of
+	 * its children (see
+	 * {@link #getFrequency(ConcRel, Map, Map)}); the information content is
+	 * <code>-log2(concept frequency / frequency of the root)</code>. Concepts
+	 * with a frequency of 0 are not saved.
+	 *
+	 * @param freqQuery
+	 *            query returning concept id and raw frequency
+	 * @param corpusName
+	 *            name of the corpus, set on the feature evaluation
+	 * @param conceptGraphName
+	 *            name of the concept graph to load; stored as param2 of the
+	 *            feature evaluation
+	 * @param conceptSetName
+	 *            feature set name of the feature evaluation; may be null
+	 * @see InfoContentEvaluator#evaluateCorpusInfoContent(java.lang.String,
+	 *      java.lang.String, java.lang.String, java.lang.String)
+	 */
+	@Override
+	public void evaluateCorpusInfoContent(final String freqQuery,
+			final String corpusName, final String conceptGraphName,
+			final String conceptSetName) {
+		ConceptGraph cg = conceptDao.getConceptGraph(conceptGraphName);
+		// delete the existing evaluation (presumably the one written by a
+		// previous run with the same names) before saving the new one
+		classifierEvaluationDao.deleteFeatureEvaluation(corpusName,
+				conceptSetName, null, INFOCONTENT, 0, 0d, conceptGraphName);
+		FeatureEvaluation eval = new FeatureEvaluation();
+		eval.setCorpusName(corpusName);
+		if (conceptSetName != null) {
+			eval.setFeatureSetName(conceptSetName);
+		}
+		eval.setEvaluationType(INFOCONTENT);
+		eval.setParam2(conceptGraphName);
+		Map<String, Double> rawFreq = getFrequencies(freqQuery);
+		// map of cui to cumulative frequency
+		Map<String, Double> conceptFreq = new HashMap<String, Double>(cg
+				.getConceptMap().size());
+		// recurse through the tree; the root's cumulative frequency is the
+		// total used to normalize all other frequencies
+		double totalFreq = getFrequency(cg.getConceptMap().get(cg.getRoot()),
+				conceptFreq, rawFreq);
+		List<FeatureRank> featureRankList = new ArrayList<FeatureRank>(
+				conceptFreq.size());
+		// update information content: -ln(p) / ln(2) = -log2(p)
+		double log2inv = -1d / Math.log(2);
+		for (Map.Entry<String, Double> cfreq : conceptFreq.entrySet()) {
+			if (cfreq.getValue() > 0) {
+				FeatureRank featureRank = new FeatureRank(eval, cfreq.getKey(),
+						log2inv * Math.log(cfreq.getValue() / totalFreq));
+				featureRankList.add(featureRank);
+			}
+		}
+		// the rank is irrelevant, but rank the features anyways
+		featureRankList = FeatureRank.sortFeatureRankList(featureRankList,
+				new FeatureRank.FeatureRankDesc());
+		classifierEvaluationDao.saveFeatureEvaluation(eval, featureRankList);
+	}
+
+	/**
+	 * @return the dao used to delete and save feature evaluations
+	 */
+	public ClassifierEvaluationDao getClassifierEvaluationDao() {
+		return classifierEvaluationDao;
+	}
+
+	/**
+	 * @return the dao used to load the concept graph
+	 */
+	public ConceptDao getConceptDao() {
+		return conceptDao;
+	}
+
+	/**
+	 * @return the data source of the JDBC template, or null if
+	 *         {@link #setDataSource(DataSource)} has not been called yet
+	 */
+	public DataSource getDataSource() {
+		return this.jdbcTemplate != null ? this.jdbcTemplate.getDataSource()
+				: null;
+	}
+
+	/**
+	 * Kept for backward compatibility with the original signature.
+	 *
+	 * @param ds
+	 *            ignored
+	 * @return see {@link #getDataSource()}
+	 * @deprecated the argument was never used; call {@link #getDataSource()}
+	 */
+	@Deprecated
+	public DataSource getDataSource(DataSource ds) {
+		return getDataSource();
+	}
+
+	/**
+	 * get the frequency of each term in the corpus.
+	 *
+	 * @param freqQuery
+	 *            query returns 2 columns. 1st column - concept id (string), 2nd
+	 *            column - frequency (double)
+	 * @return map of concept id to frequency; if the query returns a concept
+	 *         id more than once the last row wins
+	 */
+	@Override
+	public Map<String, Double> getFrequencies(String freqQuery) {
+		// get the raw frequency
+		final Map<String, Double> rawFreq = new HashMap<String, Double>();
+		jdbcTemplate.query(freqQuery, new RowCallbackHandler() {
+
+			@Override
+			public void processRow(ResultSet rs) throws SQLException {
+				rawFreq.put(rs.getString(1), rs.getDouble(2));
+			}
+		});
+		return rawFreq;
+	}
+
+	/**
+	 * recursively sum frequency of parent and all its childrens' frequencies.
+	 * Results are memoized in conceptFreq, so each concept is computed once;
+	 * a concept reachable through several children still contributes once
+	 * per child to the parent's sum.
+	 *
+	 * @param parent
+	 *            parent node
+	 * @param conceptFreq
+	 *            results stored here
+	 * @param rawFreq
+	 *            raw frequencies here; concepts missing from this map count
+	 *            as 0
+	 * @return double sum of concept frequency in the subtree with parent as
+	 *         root
+	 */
+	double getFrequency(ConcRel parent, Map<String, Double> conceptFreq,
+			Map<String, Double> rawFreq) {
+		double dFreq = 0d;
+		if (conceptFreq.containsKey(parent.getConceptID())) {
+			// already computed on another path through the graph
+			dFreq = conceptFreq.get(parent.getConceptID());
+		} else {
+			// get raw freq
+			dFreq = rawFreq.containsKey(parent.getConceptID()) ? rawFreq
+					.get(parent.getConceptID()) : 0d;
+			// recurse
+			for (ConcRel child : parent.getChildren()) {
+				dFreq += getFrequency(child, conceptFreq, rawFreq);
+			}
+			conceptFreq.put(parent.getConceptID(), dFreq);
+		}
+		return dFreq;
+	}
+
+	/**
+	 * @param classifierEvaluationDao
+	 *            dao used to delete and save feature evaluations
+	 */
+	public void setClassifierEvaluationDao(
+			ClassifierEvaluationDao classifierEvaluationDao) {
+		this.classifierEvaluationDao = classifierEvaluationDao;
+	}
+
+	/**
+	 * @param conceptDao
+	 *            dao used to load the concept graph
+	 */
+	public void setConceptDao(ConceptDao conceptDao) {
+		this.conceptDao = conceptDao;
+	}
+
+	/**
+	 * @param ds
+	 *            data source wrapped in a new JDBC template that runs the
+	 *            frequency query
+	 */
+	public void setDataSource(DataSource ds) {
+		this.jdbcTemplate = new JdbcTemplate(ds);
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InstanceData.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InstanceData.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InstanceData.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/InstanceData.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,82 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+/**
+ * data structure to store instance ids, their classes, folds, runs, and labels.
+ *
+ * @author vijay
+ *
+ */
+public class InstanceData {
+	/**
+	 * labels - class
+	 */
+	SortedMap<String, SortedSet<String>> labelToClassMap = new TreeMap<String, SortedSet<String>>();
+	/**
+	 * map of labels - runs - folds - train/test - instances - class for test
+	 * instances
+	 */
+	SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> labelToInstanceMap = new TreeMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>>();
+
+	/**
+	 * @return the label - run - fold - train/test - instance id - class map
+	 */
+	public SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> getLabelToInstanceMap() {
+		return labelToInstanceMap;
+	}
+
+	/**
+	 * @param labelToInstanceMap
+	 *            the label - run - fold - train/test - instance id - class map
+	 */
+	public void setLabelToInstanceMap(
+			SortedMap<String, SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>> labelToInstanceMap) {
+		this.labelToInstanceMap = labelToInstanceMap;
+	}
+
+	/**
+	 * @return the label - classes map
+	 */
+	public SortedMap<String, SortedSet<String>> getLabelToClassMap() {
+		return labelToClassMap;
+	}
+
+	/**
+	 * @param labelToClassMap
+	 *            the label - classes map
+	 */
+	public void setLabelToClassMap(
+			SortedMap<String, SortedSet<String>> labelToClassMap) {
+		this.labelToClassMap = labelToClassMap;
+	}
+
+	/**
+	 * get all the instance ids for the specified scope
+	 *
+	 * @param label
+	 *            if null, then all instance ids of all labels (run and fold
+	 *            are ignored), else if run & fold = 0, then all instance ids
+	 *            for this label.
+	 * @param run
+	 * @param fold
+	 *            if run & fold != 0, then all instance ids for the specified
+	 *            fold
+	 * @return sorted set of instance ids, never null. Empty if the label, run
+	 *         or fold is unknown, or if exactly one of run and fold is 0.
+	 */
+	public SortedSet<Long> getAllInstanceIds(String label, int run, int fold) {
+		SortedSet<Long> instanceIds = new TreeSet<Long>();
+		if (label == null) {
+			// all labels: return here so that the run/fold lookup below is
+			// never attempted with a null label
+			for (String labelKey : this.getLabelToInstanceMap().keySet()) {
+				instanceIds.addAll(getAllInstanceIds(labelKey, 0, 0));
+			}
+			return instanceIds;
+		}
+		SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>> runs = this
+				.getLabelToInstanceMap().get(label);
+		if (runs == null) {
+			// unknown label
+			return instanceIds;
+		}
+		if (fold == 0 && run == 0) {
+			// every run and fold of this label
+			for (SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>> folds : runs
+					.values()) {
+				if (folds != null) {
+					for (SortedMap<Boolean, SortedMap<Long, String>> trainTest : folds
+							.values()) {
+						addInstanceIds(instanceIds, trainTest);
+					}
+				}
+			}
+		} else if (fold != 0 && run != 0) {
+			// a single fold of a single run
+			SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>> folds = runs
+					.get(run);
+			if (folds != null) {
+				addInstanceIds(instanceIds, folds.get(fold));
+			}
+		}
+		return instanceIds;
+	}
+
+	/**
+	 * add the instance ids of the train and test partitions of a fold to
+	 * instanceIds; missing (null) maps are skipped.
+	 */
+	private static void addInstanceIds(SortedSet<Long> instanceIds,
+			SortedMap<Boolean, SortedMap<Long, String>> trainTest) {
+		if (trainTest == null) {
+			return;
+		}
+		for (SortedMap<Long, String> inst : trainTest.values()) {
+			if (inst != null) {
+				instanceIds.addAll(inst.keySet());
+			}
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluator.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluator.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluator.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluator.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,17 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.ctakes.ytex.kernel.model.ConceptGraph;
+
+
+/**
+ * Evaluates the intrinsic information content of a concept graph.
+ */
+public interface IntrinsicInfoContentEvaluator {
+
+	/** Name of the intrinsic information content evaluation. */
+	String INTRINSIC_INFOCONTENT = "intrinsic-infocontent";
+
+	/**
+	 * Evaluate intrinsic information content using the given properties.
+	 *
+	 * @param props
+	 *            configuration of the evaluation; the keys that are read are
+	 *            defined by the implementation
+	 * @throws IOException
+	 */
+	void evaluateIntrinsicInfoContent(Properties props) throws IOException;
+
+	/**
+	 * Evaluate intrinsic information content for the given concept graph.
+	 *
+	 * @param conceptGraphName
+	 *            name of the concept graph
+	 * @param conceptGraphDir
+	 *            directory of the concept graph - presumably where graph
+	 *            files are read or written; confirm against the
+	 *            implementation
+	 * @param cg
+	 *            the concept graph
+	 * @throws IOException
+	 */
+	void evaluateIntrinsicInfoContent(String conceptGraphName,
+			String conceptGraphDir, ConceptGraph cg) throws IOException;
+
+}
\ No newline at end of file



Mime
View raw message