ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vjapa...@apache.org
Subject svn commit: r1551254 [19/26] - in /ctakes/branches/ytex: ctakes-ytex-res/ ctakes-ytex-res/.settings/ ctakes-ytex-res/src/ ctakes-ytex-res/src/main/ ctakes-ytex-res/src/main/resources/ ctakes-ytex-res/src/main/resources/org/ ctakes-ytex-res/src/main/res...
Date Mon, 16 Dec 2013 16:30:40 GMT
Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,511 @@
+package org.apache.ctakes.ytex.kernel;
+
+import gnu.trove.iterator.TIntIterator;
+import gnu.trove.set.TIntSet;
+import gnu.trove.set.hash.TIntHashSet;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.lang.ref.SoftReference;
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.WeakHashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
+import org.apache.ctakes.ytex.kernel.dao.ConceptDao;
+import org.apache.ctakes.ytex.kernel.model.ConcRel;
+import org.apache.ctakes.ytex.kernel.model.ConceptGraph;
+import org.apache.ctakes.ytex.kernel.model.FeatureEvaluation;
+import org.apache.ctakes.ytex.kernel.model.FeatureRank;
+
+
+public class IntrinsicInfoContentEvaluatorImpl implements
+		IntrinsicInfoContentEvaluator {
+	/**
+	 * Mutable holder for the values tracked per concept while computing the
+	 * intrinsic information content: the concept itself, its leaf count and
+	 * its subsumer count. Both counts are 0 until they are set.
+	 */
+	public static class IntrinsicICInfo {
+		/** concept these counts belong to */
+		private ConcRel concept;
+		/** leaf count; 0 until set */
+		private int leafCount;
+		/** subsumer count; 0 until set */
+		private int subsumerCount;
+
+		public IntrinsicICInfo(ConcRel concept) {
+			this.concept = concept;
+		}
+
+		public ConcRel getConcept() {
+			return this.concept;
+		}
+
+		public void setConcept(ConcRel concept) {
+			this.concept = concept;
+		}
+
+		public int getLeafCount() {
+			return this.leafCount;
+		}
+
+		public void setLeafCount(int leafCount) {
+			this.leafCount = leafCount;
+		}
+
+		public int getSubsumerCount() {
+			return this.subsumerCount;
+		}
+
+		public void setSubsumerCount(int subsumerCount) {
+			this.subsumerCount = subsumerCount;
+		}
+	}
+
+	private static final Log log = LogFactory
+			.getLog(IntrinsicInfoContentEvaluatorImpl.class);
+	// multiplying a natural logarithm by this factor converts it to base 2
+	private static final double log2adjust = 1d / Math.log(2);
+
+	/**
+	 * Command line entry point. Takes the "ytexProperties" bean from the kernel
+	 * application context, overlays the JVM system properties on it and
+	 * evaluates the intrinsic information content for the configured concept
+	 * graph. Exits with status 1 if org.apache.ctakes.ytex.conceptGraphName is
+	 * not specified, with status 0 after a successful evaluation.
+	 * 
+	 * @param args
+	 *            not used
+	 * @throws IOException
+	 *             propagated from the evaluation
+	 */
+	public static void main(String[] args) throws IOException {
+		Properties props = (Properties) KernelContextHolder
+				.getApplicationContext().getBean("ytexProperties");
+		props.putAll(System.getProperties());
+		if (props.containsKey("org.apache.ctakes.ytex.conceptGraphName")) {
+			IntrinsicInfoContentEvaluator evaluator = KernelContextHolder
+					.getApplicationContext().getBean(
+							IntrinsicInfoContentEvaluator.class);
+			evaluator.evaluateIntrinsicInfoContent(props);
+			System.exit(0);
+		} else {
+			System.err.println("error: org.apache.ctakes.ytex.conceptGraphName not specified");
+			System.exit(1);
+		}
+	}
+
+	private ClassifierEvaluationDao classifierEvaluationDao;
+
+	private ConceptDao conceptDao;
+
+	/**
+	 * Compute the intrinsic information content of a concept:
+	 * 
+	 * <pre>
+	 * IC(c) = log2(maxLeaves + 1) - log2(leafCount(c) / subsumerCount(c) + 1)
+	 * </pre>
+	 * 
+	 * @param icInfo
+	 *            leaf and subsumer counts of the concept
+	 * @param maxLeaves
+	 *            number of leaves in the concept graph
+	 * @return the information content, or -1 if either logarithm is not a
+	 *         number (e.g. leaf count and subsumer count are both 0)
+	 */
+	private double computeIC(IntrinsicICInfo icInfo, int maxLeaves) {
+		// log2(|leaves(c)|/|subsumers(c)| + 1)
+		double denom = log2adjust
+				* Math.log((double) icInfo.getLeafCount()
+						/ (double) icInfo.getSubsumerCount() + 1d);
+		// log2(max_leaves + 1)
+		double num = log2adjust * Math.log((double) maxLeaves + 1d);
+		// NaN never compares equal to anything, itself included, so it has to
+		// be detected with Double.isNaN - an == comparison is always false
+		if (Double.isNaN(denom) || Double.isNaN(num)) {
+			log.error("IC = NaN for " + icInfo.getConcept().getConceptID()
+					+ ", leafCount=" + icInfo.getLeafCount()
+					+ ", subsumerCount = " + icInfo.getSubsumerCount());
+			return -1d;
+		} else
+			return num - denom;
+	}
+
+	/**
+	 * recursively collect the leaves of a concept. fill in the icInfoMap with
+	 * leaf counts as we go along
+	 * 
+	 * @param concept
+	 *            concept for which we should get the leaves
+	 * @param leafCache
+	 *            cache of concept's leaves, indexed by node index. the entries
+	 *            are soft references, so the garbage collector may clear them
+	 * @param icInfoMap
+	 *            to be updated with leaf counts
+	 * @param cg
+	 *            concept graph; used to map node indices back to concept ids
+	 *            when writing the debug output
+	 * @param w
+	 *            writer for debug output, null if no output is desired
+	 * @param visitedNodes
+	 *            list of nodes that have already been visited - we don't need
+	 *            to revisit them when getting the leaves. may be null
+	 * @return node indices of the leaves collected for the concept (children
+	 *         that were already visited are skipped); for a leaf, the set
+	 *         containing just its own node index
+	 * @throws IOException
+	 *             if writing to w fails
+	 */
+	private TIntSet getLeaves(ConcRel concept,
+			SoftReference<TIntSet>[] leafCache,
+			Map<String, IntrinsicICInfo> icInfoMap, ConceptGraph cg,
+			BufferedWriter w, TIntSet visitedNodes) throws IOException {
+		// look in cache. dereference the soft reference only once and keep a
+		// strong reference to the result: the garbage collector may clear the
+		// reference between two get() calls, so testing one call and
+		// returning another could hand null to the caller
+		SoftReference<TIntSet> refLeaves = leafCache[concept.getNodeIndex()];
+		TIntSet cachedLeaves = refLeaves != null ? refLeaves.get() : null;
+		if (cachedLeaves != null) {
+			return cachedLeaves;
+		}
+		// not in cache - compute recursively
+		TIntSet leaves = new TIntHashSet();
+		leafCache[concept.getNodeIndex()] = new SoftReference<TIntSet>(leaves);
+		if (concept.isLeaf()) {
+			// for leaves, just add the node index
+			leaves.add(concept.getNodeIndex());
+		} else {
+			IntrinsicICInfo icInfo = icInfoMap.get(concept.getConceptID());
+			// have we already computed the leaf count for this node?
+			// if yes, then we can ignore previously visited nodes
+			// if no, then compute it now and revisit previously visited nodes
+			// if we have to
+			boolean needLeaves = (icInfo != null && icInfo.getLeafCount() == 0);
+			TIntSet visitedNodesLocal = visitedNodes;
+			if (needLeaves || visitedNodesLocal == null) {
+				// allocate a set to keep track of nodes we've already visited
+				// so that we don't revisit them. if we have already computed
+				// this node's leaf count then we reuse whatever the caller gave
+				// us if non null, else allocate a new one.
+				// if we haven't already computed this node's leaf count,
+				// allocate a new set to avoid duplications in the traversal for
+				// this node
+				visitedNodesLocal = new TIntHashSet();
+			}
+			// for inner nodes, recurse
+			for (ConcRel child : concept.getChildren()) {
+				// if we've already visited a node, then don't bother adding
+				// that node's leaves - we already have them
+				if (!visitedNodesLocal.contains(child.getNodeIndex())) {
+					leaves.addAll(getLeaves(child, leafCache, icInfoMap, cg, w,
+							visitedNodesLocal));
+				}
+			}
+			// add this node to the set of visited nodes so we know not to
+			// revisit. This is only of importance if the caller gave us
+			// a non-empty set.
+			if (visitedNodes != null && visitedNodes != visitedNodesLocal) {
+				visitedNodes.add(concept.getNodeIndex());
+				visitedNodes.addAll(visitedNodesLocal);
+			}
+			// update the leaf count if we haven't done so already
+			if (needLeaves) {
+				icInfo.setLeafCount(leaves.size());
+				// output leaves if desired:
+				// concept id <tab> leaf count <tab> space separated leaf ids
+				if (w != null) {
+					w.write(concept.getConceptID());
+					w.write("\t");
+					w.write(Integer.toString(leaves.size()));
+					w.write("\t");
+					TIntIterator iter = leaves.iterator();
+					while (iter.hasNext()) {
+						w.write(cg.getConceptList().get(iter.next())
+								.getConceptID());
+						w.write(" ");
+					}
+					w.newLine();
+				}
+			}
+		}
+		return leaves;
+	}
+
+	// /**
+	// * add/update icInfoMap entry for concept with the concept's leaf count
+	// *
+	// * @param concept
+	// * @param icInfoMap
+	// * @param w
+	// * @param subsumerMap
+	// * @throws IOException
+	// */
+	// private void computeLeafCount(ConcRel concept,
+	// Map<String, IntrinsicICInfo> icInfoMap,
+	// SoftReference<TIntSet>[] leafCache, ConceptGraph cg,
+	// BufferedWriter w) throws IOException {
+	// // see if we already computed this
+	// IntrinsicICInfo icInfo = icInfoMap.get(concept.getConceptID());
+	// if (icInfo != null && icInfo.getLeafCount() > 0) {
+	// return;
+	// }
+	// // if not, figure it out
+	// if (icInfo == null) {
+	// icInfo = new IntrinsicICInfo(concept);
+	// icInfoMap.put(concept.getConceptID(), icInfo);
+	// }
+	// // for leaves the default (0) is correct
+	// if (!concept.isLeaf()) {
+	// TIntSet leaves = this.getLeaves(concept, leafCache);
+	// icInfo.setLeafCount(leaves.size());
+	// if (w != null) {
+	// w.write(concept.getConceptID());
+	// w.write("\t");
+	// w.write(Integer.toString(leaves.size()));
+	// w.write("\t");
+	// TIntIterator iter = leaves.iterator();
+	// while (iter.hasNext()) {
+	// w.write(cg.getConceptList().get(iter.next()).getConceptID());
+	// w.write(" ");
+	// }
+	// w.newLine();
+	// }
+	// }
+	// // recurse to parents
+	// for (ConcRel parent : concept.getParents()) {
+	// computeLeafCount(parent, icInfoMap, leafCache, cg, w);
+	// }
+	// }
+
+	/**
+	 * add/update icInfoMap entry for concept with the concept's subsumer count
+	 * 
+	 * @param concept
+	 * @param icInfoMap
+	 * @param subsumerMap
+	 * @param w
+	 * @throws IOException
+	 */
+	private void computeSubsumerCount(ConcRel concept,
+			Map<String, IntrinsicICInfo> icInfoMap,
+			Map<String, Set<String>> subsumerMap, short[] depthArray,
+			BufferedWriter w) throws IOException {
+		// see if we already computed this
+		IntrinsicICInfo icInfo = icInfoMap.get(concept.getConceptID());
+		if (icInfo != null && icInfo.getSubsumerCount() > 0) {
+			return;
+		}
+		// if not, figure it out
+		if (icInfo == null) {
+			icInfo = new IntrinsicICInfo(concept);
+			icInfoMap.put(concept.getConceptID(), icInfo);
+		}
+		Set<String> subsumers = this.getSubsumers(concept, subsumerMap,
+				depthArray);
+		if (w != null) {
+			w.write(concept.getConceptID());
+			w.write("\t");
+			w.write(Integer.toString(subsumers.size()));
+			w.write("\t");
+			w.write(subsumers.toString());
+			w.newLine();
+		}
+		icInfo.setSubsumerCount(subsumers.size());
+		// recursively compute the children's subsumer counts
+		for (ConcRel child : concept.getChildren()) {
+			computeSubsumerCount(child, icInfoMap, subsumerMap, depthArray, w);
+		}
+	}
+
+	/**
+	 * Evaluate the intrinsic information content for the concept graph named
+	 * by the property org.apache.ctakes.ytex.conceptGraphName. The directory
+	 * for debug output is taken from org.apache.ctakes.ytex.conceptGraphDir
+	 * and defaults to the java.io.tmpdir system property.
+	 * 
+	 * @param props
+	 *            properties holding the settings above
+	 * @throws IOException
+	 *             propagated from the evaluation
+	 */
+	@Override
+	public void evaluateIntrinsicInfoContent(final Properties props)
+			throws IOException {
+		final String graphName = props
+				.getProperty("org.apache.ctakes.ytex.conceptGraphName");
+		final String tmpDir = System.getProperty("java.io.tmpdir");
+		final String graphDir = props.getProperty(
+				"org.apache.ctakes.ytex.conceptGraphDir", tmpDir);
+		final ConceptGraph graph = this.conceptDao.getConceptGraph(graphName);
+		evaluateIntrinsicInfoContent(graphName, graphDir, graph);
+	}
+
+	/**
+	 * Compute the intrinsic information content of every concept reachable
+	 * from the root of the concept graph, in four steps: subsumer counts (and
+	 * depths), the set of all leaves, per concept leaf counts, and finally the
+	 * information content itself, which is set on the graph's concepts by
+	 * storeIntrinsicIC. Each of the first three steps writes a debug file if
+	 * getOutputFile returns a writer for it.
+	 * 
+	 * @param conceptGraphName
+	 *            name of the concept graph
+	 * @param conceptGraphDir
+	 *            directory for the debug files
+	 * @param cg
+	 *            concept graph to evaluate
+	 * @throws IOException
+	 *             if opening or writing a debug file fails
+	 */
+	@Override
+	public void evaluateIntrinsicInfoContent(String conceptGraphName,
+			String conceptGraphDir, ConceptGraph cg) throws IOException {
+		log.info("computing subsumer counts");
+		// compute the subsumer count
+		Map<String, IntrinsicICInfo> icInfoMap = new HashMap<String, IntrinsicICInfo>();
+		Map<String, Set<String>> subsumerMap = new WeakHashMap<String, Set<String>>();
+		short[] depthArray = new short[cg.getConceptList().size()];
+		BufferedWriter w = null;
+		try {
+			w = this.getOutputFile(conceptGraphName, conceptGraphDir,
+					"subsumer");
+			computeSubsumerCount(cg.getConceptMap().get(cg.getRoot()),
+					icInfoMap, subsumerMap, depthArray, w);
+		} finally {
+			closeQuietly(w, "subsumer");
+			// forget the closed writer so that it is not closed a second
+			// time if opening the next output file fails
+			w = null;
+		}
+		// the subsumer sets are not needed anymore - let them be collected
+		subsumerMap = null;
+		log.info("computing max leaves");
+		// get the leaves in this concept graph
+		Set<String> leafSet = null;
+		try {
+			w = this.getOutputFile(conceptGraphName, conceptGraphDir, "allleaf");
+			leafSet = this.getAllLeaves(cg, w);
+		} finally {
+			closeQuietly(w, "allleaf");
+			w = null;
+		}
+		log.info("computing leaf counts");
+		@SuppressWarnings("unchecked")
+		SoftReference<TIntSet>[] leafCache = (SoftReference<TIntSet>[]) Array
+				.newInstance((new SoftReference<TIntSet>(new TIntHashSet()))
+						.getClass(), cg.getConceptList().size());
+		// compute leaf count of all concepts in this graph
+		try {
+			w = this.getOutputFile(conceptGraphName, conceptGraphDir, "leaf");
+			this.getLeaves(cg.getConceptMap().get(cg.getRoot()), leafCache,
+					icInfoMap, cg, w, null);
+		} finally {
+			closeQuietly(w, "leaf");
+			w = null;
+		}
+		leafCache = null;
+		log.info("storing intrinsic ic");
+		storeIntrinsicIC(conceptGraphName, leafSet.size(), icInfoMap,
+				depthArray, cg);
+		log.info("finished computing intrinsic ic");
+	}
+
+	/**
+	 * Close a debug output writer. A failure to close is logged instead of
+	 * thrown so that it cannot mask an exception from the computation, but it
+	 * is not silently dropped either: close() flushes the buffer, so a failure
+	 * here means the debug file is incomplete.
+	 * 
+	 * @param w
+	 *            writer to close, may be null
+	 * @param type
+	 *            type of the output file, used in the log message
+	 */
+	private static void closeQuietly(BufferedWriter w, String type) {
+		if (w == null) {
+			return;
+		}
+		try {
+			w.close();
+		} catch (IOException e) {
+			log.warn("error closing " + type + " output file", e);
+		}
+	}
+
+	private BufferedWriter getOutputFile(final String conceptGraphName,
+			final String conceptGraphDir, String type) throws IOException {
+		if ("true".equalsIgnoreCase(System
+				.getProperty("org.apache.ctakes.ytex.ic.debug", "false"))) {
+			return new BufferedWriter(new FileWriter(FileUtil.addFilenameToDir(
+					conceptGraphDir, conceptGraphName + "-" + type + ".txt")));
+		} else
+			return null;
+	}
+
+	/**
+	 * Collect the ids of all concepts of the graph that are leaves.
+	 * 
+	 * @param cg
+	 *            concept graph
+	 * @param w
+	 *            if not null, one line with the number of leaves, a tab and
+	 *            the leaf ids is written to it
+	 * @return concept ids of the leaves
+	 * @throws IOException
+	 *             if writing to w fails
+	 */
+	public Set<String> getAllLeaves(ConceptGraph cg, BufferedWriter w)
+			throws IOException {
+		final Set<String> leafIds = new HashSet<String>();
+		for (ConcRel candidate : cg.getConceptMap().values()) {
+			if (candidate.isLeaf()) {
+				leafIds.add(candidate.getConceptID());
+			}
+		}
+		if (w == null) {
+			return leafIds;
+		}
+		w.write(Integer.toString(leafIds.size()));
+		w.write("\t");
+		w.write(leafIds.toString());
+		w.newLine();
+		return leafIds;
+	}
+
+	/**
+	 * @return dao used by storeIntrinsicIC to delete and save feature
+	 *         evaluations
+	 */
+	public ClassifierEvaluationDao getClassifierEvaluationDao() {
+		return classifierEvaluationDao;
+	}
+
+	/**
+	 * @return dao used to load the concept graph by name
+	 */
+	public ConceptDao getConceptDao() {
+		return conceptDao;
+	}
+
+	// private TIntSet getLeaves(ConcRel concept,
+	// SoftReference<TIntSet>[] leafCache) {
+	// // look in cache
+	// SoftReference<TIntSet> refLeaves = leafCache[concept.getNodeIndex()];
+	// if (refLeaves != null && refLeaves.get() != null) {
+	// return refLeaves.get();
+	// }
+	// // not in cache - compute recursively
+	// TIntSet leaves = new TIntHashSet();
+	// leafCache[concept.getNodeIndex()] = new SoftReference<TIntSet>(leaves);
+	// if (concept.isLeaf()) {
+	// // for leaves, just add the concept id
+	// leaves.add(concept.getNodeIndex());
+	// } else {
+	// // for inner nodes, recurse
+	// for (ConcRel child : concept.getChildren()) {
+	// leaves.addAll(getLeaves(child, leafCache));
+	// }
+	// }
+	// return leaves;
+	// }
+
+	/**
+	 * recursively compute the subsumers of a concept: the concept itself plus
+	 * the subsumers of all its parents. as a side effect, fill in the depth of
+	 * the concept (1 + the largest parent depth, so 1 for a concept without
+	 * parents) if depthArray does not have it yet
+	 * 
+	 * @param concept
+	 *            concept to get the subsumers for
+	 * @param subsumerMap
+	 *            cache of concept id to subsumers. the caller passes a
+	 *            WeakHashMap, so entries can disappear at any time
+	 * @param depthArray
+	 *            depth per node index; 0 means not computed yet
+	 * @return ids of the concept's subsumers, including the concept itself
+	 */
+	private Set<String> getSubsumers(ConcRel concept,
+			Map<String, Set<String>> subsumerMap, short depthArray[]) {
+		// look in cache. use a single get() and test the result: the keys of
+		// the cache are only weakly referenced, so an entry found by
+		// containsKey() may already be gone when get() is called afterwards,
+		// which would make us return null
+		Set<String> cachedSubsumers = subsumerMap.get(concept.getConceptID());
+		if (cachedSubsumers != null)
+			return cachedSubsumers;
+		// not in cache - compute recursively
+		Set<String> subsumers = new HashSet<String>();
+		boolean calcDepth = depthArray[concept.getNodeIndex()] == 0;
+		short parentMaxDepth = 0;
+		if (concept.getParents() != null && !concept.getParents().isEmpty()) {
+			// parents - recurse
+			for (ConcRel parent : concept.getParents()) {
+				subsumers.addAll(getSubsumers(parent, subsumerMap, depthArray));
+				// get the deepest parent
+				if (calcDepth) {
+					short parentDepth = depthArray[parent.getNodeIndex()];
+					if (parentDepth > parentMaxDepth)
+						parentMaxDepth = parentDepth;
+				}
+			}
+		}
+		if (calcDepth)
+			depthArray[concept.getNodeIndex()] = (short) (parentMaxDepth + 1);
+		// add the concept itself to the set of subsumers
+		subsumers.add(concept.getConceptID());
+		// add this to the cache - copy the key so that this can be gc'ed as
+		// needed
+		subsumerMap.put(new String(concept.getConceptID()), subsumers);
+		return subsumers;
+	}
+
+	/**
+	 * @param classifierEvaluationDao
+	 *            dao used by storeIntrinsicIC to delete and save feature
+	 *            evaluations
+	 */
+	public void setClassifierEvaluationDao(
+			ClassifierEvaluationDao classifierEvaluationDao) {
+		this.classifierEvaluationDao = classifierEvaluationDao;
+	}
+
+	/**
+	 * @param conceptDao
+	 *            dao used to load the concept graph by name
+	 */
+	public void setConceptDao(ConceptDao conceptDao) {
+		this.conceptDao = conceptDao;
+	}
+
+	private void storeIntrinsicIC(String conceptGraphName, int maxLeaves,
+			Map<String, IntrinsicICInfo> icInfoMap, short depthArray[],
+			ConceptGraph cg) {
+		FeatureEvaluation fe = new FeatureEvaluation();
+		fe.setEvaluationType("intrinsic-infocontent");
+		fe.setParam2(conceptGraphName);
+		List<FeatureRank> listFeatureRank = new ArrayList<FeatureRank>(
+				icInfoMap.size());
+		double maxIC = 0d;
+		short maxDepth = 0;
+		for (IntrinsicICInfo icInfo : icInfoMap.values()) {
+			ConcRel cr = icInfo.getConcept();
+			short depth = depthArray[cr.getNodeIndex()];
+			cr.setDepth(depth);
+			if (depth > maxDepth)
+				maxDepth = depth;
+			double ic = computeIC(icInfo, maxLeaves);
+			cr.setIntrinsicInfoContent(ic);
+			if (ic > maxIC)
+				maxIC = ic;
+			if (log.isDebugEnabled())
+				log.debug(icInfo.getConcept().getConceptID() + "=" + ic);
+			listFeatureRank.add(new FeatureRank(fe, icInfo.getConcept()
+					.getConceptID(), ic, depthArray[icInfo.getConcept()
+					.getNodeIndex()]));
+		}
+		cg.setDepthMax(maxDepth);
+		cg.setIntrinsicICMax(maxIC);
+		if ("true".equalsIgnoreCase(System
+				.getProperty("org.apache.ctakes.ytex.ic.debug", "false"))) {
+			this.classifierEvaluationDao.deleteFeatureEvaluation(null, null,
+					null, fe.getEvaluationType(), null, 0d, conceptGraphName);
+			this.classifierEvaluationDao.saveFeatureEvaluation(fe,
+					listFeatureRank);
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelContextHolder.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelContextHolder.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelContextHolder.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelContextHolder.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,19 @@
+package org.apache.ctakes.ytex.kernel;
+
+import org.springframework.context.ApplicationContext;
+import org.springframework.context.access.ContextSingletonBeanFactoryLocator;
+
+/**
+ * Static holder for the kernel application context. The context is looked up
+ * once, when this class is initialized, as the bean factory named
+ * "kernelApplicationContext" of the kernelBeanRefContext.xml bean reference
+ * context.
+ */
+public class KernelContextHolder {
+	static ApplicationContext kernelApplicationContext = lookupContext();
+
+	/**
+	 * Look up the "kernelApplicationContext" bean factory via the singleton
+	 * bean factory locator.
+	 */
+	private static ApplicationContext lookupContext() {
+		final String beanRefContext = "classpath*:org/apache/ctakes/ytex/kernelBeanRefContext.xml";
+		return (ApplicationContext) ContextSingletonBeanFactoryLocator
+				.getInstance(beanRefContext)
+				.useBeanFactory("kernelApplicationContext").getFactory();
+	}
+
+	/**
+	 * @return the kernel application context
+	 */
+	public static ApplicationContext getApplicationContext() {
+		return kernelApplicationContext;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtil.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtil.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtil.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtil.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,80 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.BufferedWriter;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.InvalidPropertiesFormatException;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedSet;
+
+import org.apache.ctakes.ytex.kernel.model.KernelEvaluation;
+
+import com.google.common.collect.BiMap;
+
+
+/**
+ * Helper operations shared by the kernel tools: loading properties and
+ * instances, loading gram matrices, generating folds and exporting class ids.
+ */
+public interface KernelUtil {
+
+	/**
+	 * load the properties stored in a file into props.
+	 * 
+	 * @param propertyFile
+	 *            name of the file to read
+	 * @param props
+	 *            properties to load into
+	 * @throws FileNotFoundException
+	 *             if the file cannot be opened
+	 * @throws InvalidPropertiesFormatException
+	 *             if an xml property file is not valid
+	 * @throws IOException
+	 *             if reading the file fails
+	 */
+	void loadProperties(String propertyFile, Properties props)
+			throws FileNotFoundException, IOException,
+			InvalidPropertiesFormatException;
+
+	/**
+	 * run a query that returns the columns
+	 * 
+	 * <pre>
+	 * [instance id] [class name] [train/test boolean optional] [label optional] [fold optional] [run optional]
+	 * </pre>
+	 * 
+	 * and return a map of
+	 * 
+	 * <pre>
+	 * [label, [run, [fold, [train/test , [instance id, class]]]]]
+	 * </pre>
+	 * 
+	 * <ul>
+	 * <li>if label not defined, will be ""
+	 * <li>if run not defined, will be 0
+	 * <li>if fold not defined, will be 0
+	 * <li>if train not defined, will be 1
+	 * </ul>
+	 * 
+	 * @param strQuery
+	 *            query to run
+	 * @return the instances returned by the query
+	 */
+	InstanceData loadInstances(String strQuery);
+
+	/**
+	 * fill a gram matrix with the similarities stored for a kernel
+	 * evaluation.
+	 * 
+	 * @param kernelEvaluation
+	 *            kernel evaluation to take the similarities from
+	 * @param trainInstanceLabelMap
+	 *            instance ids; their sort order defines the matrix indices
+	 * @param trainGramMatrix
+	 *            matrix to fill
+	 */
+	void fillGramMatrix(final KernelEvaluation kernelEvaluation,
+			final SortedSet<Long> trainInstanceLabelMap,
+			final double[][] trainGramMatrix);
+
+	/**
+	 * load the gram matrix of the kernel evaluation identified by the
+	 * parameters.
+	 * 
+	 * @param instanceIds
+	 *            instance ids; their sort order defines the matrix indices
+	 * @return the gram matrix. NOTE(review): KernelUtilImpl returns null if no
+	 *         matching kernel evaluation is found - callers presumably have to
+	 *         check for that
+	 */
+	double[][] loadGramMatrix(SortedSet<Long> instanceIds, String name,
+			String splitName, String experiment, String label, int run,
+			int fold, double param1, String param2);
+
+	/**
+	 * generate folds from the label to instance map. use properties specified
+	 * in props to generate folds.
+	 * 
+	 * @param instanceLabel
+	 *            instances to generate the folds for
+	 * @param props
+	 *            settings for the fold generation
+	 */
+	void generateFolds(InstanceData instanceLabel, Properties props);
+
+	/**
+	 * @param labelToClasMap
+	 *            classes per label
+	 * @param labelToClassIndexMap
+	 *            class to index map per label. presumably filled from
+	 *            labelToClasMap - the implementation is not part of this
+	 *            interface
+	 */
+	void fillLabelToClassToIndexMap(
+			Map<String, SortedSet<String>> labelToClasMap,
+			Map<String, BiMap<String, Integer>> labelToClassIndexMap);
+
+	/**
+	 * export the class id to class name map.
+	 * 
+	 * @param outdir
+	 *            directory to export to
+	 * @param classIdMap
+	 *            class name to class id
+	 * @param label
+	 *            label the classes belong to
+	 * @throws IOException
+	 *             if the export fails
+	 */
+	void exportClassIds(String outdir, Map<String, Integer> classIdMap,
+			String label) throws IOException;
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtilImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtilImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtilImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/KernelUtilImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,407 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectOutputStream;
+import java.lang.reflect.InvocationTargetException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.InvalidPropertiesFormatException;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.beanutils.BeanUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
+import org.apache.ctakes.ytex.kernel.dao.KernelEvaluationDao;
+import org.apache.ctakes.ytex.kernel.model.CrossValidationFold;
+import org.apache.ctakes.ytex.kernel.model.KernelEvaluation;
+import org.apache.ctakes.ytex.kernel.model.KernelEvaluationInstance;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.RowCallbackHandler;
+import org.springframework.transaction.PlatformTransactionManager;
+import org.springframework.transaction.TransactionStatus;
+import org.springframework.transaction.support.TransactionCallback;
+import org.springframework.transaction.support.TransactionTemplate;
+
+import com.google.common.collect.BiMap;
+import com.google.common.collect.HashBiMap;
+
+
+public class KernelUtilImpl implements KernelUtil {
+	private static final Log log = LogFactory.getLog(KernelUtilImpl.class);
+	private ClassifierEvaluationDao classifierEvaluationDao;
+
+	private JdbcTemplate jdbcTemplate = null;
+
+	private KernelEvaluationDao kernelEvaluationDao = null;
+	private PlatformTransactionManager transactionManager;
+	private FoldGenerator foldGenerator = null;
+
+	/**
+	 * @return the configured fold generator, null if none has been set
+	 */
+	public FoldGenerator getFoldGenerator() {
+		return foldGenerator;
+	}
+
+	/**
+	 * @param foldGenerator
+	 *            fold generator to use
+	 */
+	public void setFoldGenerator(FoldGenerator foldGenerator) {
+		this.foldGenerator = foldGenerator;
+	}
+
+	private Map<Long, Integer> createInstanceIdToIndexMap(
+			SortedSet<Long> instanceIDs) {
+		Map<Long, Integer> instanceIdToIndexMap = new HashMap<Long, Integer>(
+				instanceIDs.size());
+		int i = 0;
+		for (Long instanceId : instanceIDs) {
+			instanceIdToIndexMap.put(instanceId, i);
+			i++;
+		}
+		return instanceIdToIndexMap;
+	}
+
+	/**
+	 * Fill the gram matrix with the stored similarities of a kernel
+	 * evaluation. Row and column of an instance are its position in the sorted
+	 * instance id set; each similarity is stored symmetrically, pairs with an
+	 * instance outside the set are ignored, and diagonal cells that are still
+	 * 0 afterwards are set to 1.
+	 * 
+	 * @param kernelEvaluation
+	 *            kernel evaluation to take the similarities from
+	 * @param trainInstanceLabelMap
+	 *            sorted instance ids
+	 * @param trainGramMatrix
+	 *            matrix to fill
+	 */
+	@Override
+	public void fillGramMatrix(final KernelEvaluation kernelEvaluation,
+			final SortedSet<Long> trainInstanceLabelMap,
+			final double[][] trainGramMatrix) {
+		// instance id -> gram matrix index
+		final Map<Long, Integer> indexByInstanceId = createInstanceIdToIndexMap(trainInstanceLabelMap);
+
+		for (Map.Entry<Long, Integer> entry : indexByInstanceId.entrySet()) {
+			final int row = entry.getValue();
+			final long rowInstanceId = entry.getKey();
+			// load the evaluations of each instance in a transaction of its
+			// own - don't want too many objects in hibernate session
+			final TransactionTemplate txTemplate = new TransactionTemplate(
+					this.transactionManager);
+			txTemplate
+					.setPropagationBehavior(TransactionTemplate.PROPAGATION_REQUIRES_NEW);
+			txTemplate.execute(new TransactionCallback<Object>() {
+				@Override
+				public Object doInTransaction(TransactionStatus status) {
+					final List<KernelEvaluationInstance> evaluations = getKernelEvaluationDao()
+							.getAllKernelEvaluationsForInstance(
+									kernelEvaluation, rowInstanceId);
+					for (KernelEvaluationInstance evaluation : evaluations) {
+						// the other instance of the pair
+						final long otherInstanceId;
+						if (rowInstanceId != evaluation.getInstanceId1()) {
+							otherInstanceId = evaluation.getInstanceId1();
+						} else {
+							otherInstanceId = evaluation.getInstanceId2();
+						}
+						final Integer col = indexByInstanceId
+								.get(otherInstanceId);
+						if (col == null) {
+							// not part of this matrix
+							continue;
+						}
+						final double similarity = evaluation.getSimilarity();
+						trainGramMatrix[row][col] = similarity;
+						trainGramMatrix[col][row] = similarity;
+					}
+					return null;
+				}
+			});
+		}
+		// put 1's in the diagonal of the gram matrix where nothing was stored
+		for (int d = 0; d < trainGramMatrix.length; d++) {
+			if (trainGramMatrix[d][d] == 0) {
+				trainGramMatrix[d][d] = 1;
+			}
+		}
+	}
+
+	/**
+	 * @return dao used to look up cross validation folds
+	 */
+	public ClassifierEvaluationDao getClassifierEvaluationDao() {
+		return classifierEvaluationDao;
+	}
+
+	/**
+	 * @return the data source of the jdbc template created by setDataSource.
+	 *         NOTE(review): jdbcTemplate is null until setDataSource has been
+	 *         called, so calling this earlier fails with a
+	 *         NullPointerException
+	 */
+	public DataSource getDataSource() {
+		return jdbcTemplate.getDataSource();
+	}
+
+	/**
+	 * @return dao used to load kernel evaluations and their instances
+	 */
+	public KernelEvaluationDao getKernelEvaluationDao() {
+		return kernelEvaluationDao;
+	}
+
+	/**
+	 * @return transaction manager used by fillGramMatrix for its per instance
+	 *         transactions
+	 */
+	public PlatformTransactionManager getTransactionManager() {
+		return transactionManager;
+	}
+
+	@Override
+	public double[][] loadGramMatrix(SortedSet<Long> instanceIds, String name,
+			String splitName, String experiment, String label, int run,
+			int fold, double param1, String param2) {
+		int foldId = 0;
+		double[][] gramMatrix = null;
+		if (run != 0 && fold != 0) {
+			CrossValidationFold f = this.classifierEvaluationDao
+					.getCrossValidationFold(name, splitName, label, run, fold);
+			if (f != null)
+				foldId = f.getCrossValidationFoldId();
+		}
+		KernelEvaluation kernelEval = this.kernelEvaluationDao.getKernelEval(
+				name, experiment, label, foldId, param1, param2);
+		if (kernelEval == null) {
+			log.warn("could not find kernelEvaluation.  name=" + name
+					+ ", experiment=" + experiment + ", label=" + label
+					+ ", fold=" + fold + ", run=" + run);
+		} else {
+			gramMatrix = new double[instanceIds.size()][instanceIds.size()];
+			fillGramMatrix(kernelEval, instanceIds, gramMatrix);
+		}
+		return gramMatrix;
+	}
+
+	/**
+	 * this can be very large - avoid loading the entire jdbc ResultSet into
+	 * memory
+	 */
+	@Override
+	public InstanceData loadInstances(String strQuery) {
+		final InstanceData instanceLabel = new InstanceData();
+		PreparedStatement s = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		try {
+			// jdbcTemplate.query(strQuery, new RowCallbackHandler() {
+			RowCallbackHandler ch = new RowCallbackHandler() {
+
+				@Override
+				public void processRow(ResultSet rs) throws SQLException {
+					String label = "";
+					int run = 0;
+					int fold = 0;
+					boolean train = true;
+					long instanceId = rs.getLong(1);
+					String className = rs.getString(2);
+					if (rs.getMetaData().getColumnCount() >= 3)
+						train = rs.getBoolean(3);
+					if (rs.getMetaData().getColumnCount() >= 4) {
+						label = rs.getString(4);
+						if (label == null)
+							label = "";
+					}
+					if (rs.getMetaData().getColumnCount() >= 5)
+						fold = rs.getInt(5);
+					if (rs.getMetaData().getColumnCount() >= 6)
+						run = rs.getInt(6);
+					// get runs for label
+					SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>> runToInstanceMap = instanceLabel
+							.getLabelToInstanceMap().get(label);
+					if (runToInstanceMap == null) {
+						runToInstanceMap = new TreeMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>>();
+						instanceLabel.getLabelToInstanceMap().put(label,
+								runToInstanceMap);
+					}
+					// get folds for run
+					SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>> foldToInstanceMap = runToInstanceMap
+							.get(run);
+					if (foldToInstanceMap == null) {
+						foldToInstanceMap = new TreeMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>();
+						runToInstanceMap.put(run, foldToInstanceMap);
+					}
+					// get train/test set for fold
+					SortedMap<Boolean, SortedMap<Long, String>> ttToClassMap = foldToInstanceMap
+							.get(fold);
+					if (ttToClassMap == null) {
+						ttToClassMap = new TreeMap<Boolean, SortedMap<Long, String>>();
+						foldToInstanceMap.put(fold, ttToClassMap);
+					}
+					// get instances for train/test set
+					SortedMap<Long, String> instanceToClassMap = ttToClassMap
+							.get(train);
+					if (instanceToClassMap == null) {
+						instanceToClassMap = new TreeMap<Long, String>();
+						ttToClassMap.put(train, instanceToClassMap);
+					}
+					// set the instance class
+					instanceToClassMap.put(instanceId, className);
+					// add the class to the labelToClassMap
+					SortedSet<String> labelClasses = instanceLabel
+							.getLabelToClassMap().get(label);
+					if (labelClasses == null) {
+						labelClasses = new TreeSet<String>();
+						instanceLabel.getLabelToClassMap().put(label,
+								labelClasses);
+					}
+					if (!labelClasses.contains(className))
+						labelClasses.add(className);
+				}
+			};
+			conn = this.jdbcTemplate.getDataSource().getConnection();
+			s = conn.prepareStatement(strQuery,
+					java.sql.ResultSet.TYPE_FORWARD_ONLY,
+					java.sql.ResultSet.CONCUR_READ_ONLY);
+			if ("MySQL".equals(conn.getMetaData().getDatabaseProductName())) {
+				s.setFetchSize(Integer.MIN_VALUE);
+			} else if (s.getClass().getName()
+					.equals("com.microsoft.sqlserver.jdbc.SQLServerStatement")) {
+				try {
+					BeanUtils.setProperty(s, "responseBuffering", "adaptive");
+				} catch (IllegalAccessException e) {
+					log.warn("error setting responseBuffering", e);
+				} catch (InvocationTargetException e) {
+					log.warn("error setting responseBuffering", e);
+				}
+			}
+			rs = s.executeQuery();
+			while (rs.next()) {
+				ch.processRow(rs);
+			}
+		} catch (SQLException j) {
+			log.error("loadInstances failed", j);
+			throw new RuntimeException(j);
+		} finally {
+			if (rs != null) {
+				try {
+					rs.close();
+				} catch (SQLException e) {
+				}
+			}
+			if (s != null) {
+				try {
+					s.close();
+				} catch (SQLException e) {
+				}
+			}
+			if (conn != null) {
+				try {
+					conn.close();
+				} catch (SQLException e) {
+				}
+			}
+		}
+		return instanceLabel;
+	}
+
+	/*
+	 * (non-Javadoc)
+	 * 
+	 * @see org.apache.ctakes.ytex.kernel.DataExporter#loadProperties(java.lang.String,
+	 * java.util.Properties)
+	 */
+	@Override
+	public void loadProperties(String propertyFile, Properties props)
+			throws FileNotFoundException, IOException,
+			InvalidPropertiesFormatException {
+		InputStream in = null;
+		try {
+			in = new FileInputStream(propertyFile);
+			if (propertyFile.endsWith(".xml"))
+				props.loadFromXML(in);
+			else
+				props.load(in);
+		} finally {
+			if (in != null) {
+				in.close();
+			}
+		}
+	}
+
+	/**
+	 * Inject the classifier evaluation DAO.
+	 * 
+	 * @param classifierEvaluationDao
+	 *            the DAO stored in this instance
+	 */
+	public void setClassifierEvaluationDao(ClassifierEvaluationDao classifierEvaluationDao) {
+		this.classifierEvaluationDao = classifierEvaluationDao;
+	}
+
+	/**
+	 * Inject the data source. The data source is not stored directly; a new
+	 * {@link JdbcTemplate} wrapping it replaces the current template.
+	 * 
+	 * @param dataSource
+	 *            data source to wrap
+	 */
+	public void setDataSource(DataSource dataSource) {
+		final JdbcTemplate template = new JdbcTemplate(dataSource);
+		this.jdbcTemplate = template;
+	}
+
+	public void setKernelEvaluationDao(KernelEvaluationDao kernelEvaluationDao) {
+		this.kernelEvaluationDao = kernelEvaluationDao;
+	}
+
+	/**
+	 * Inject the transaction manager.
+	 * 
+	 * @param transactionManager
+	 *            the transaction manager stored in this instance
+	 */
+	public void setTransactionManager(PlatformTransactionManager transactionManager) {
+		this.transactionManager = transactionManager;
+	}
+
+	/**
+	 * Replace the label-to-instance map of <code>instanceLabel</code> with the
+	 * result of the fold generator.
+	 * <p>
+	 * Properties read: <code>folds</code> (required), <code>runs</code>
+	 * (default 1), <code>minPerClass</code> (default 0) and <code>rand</code>
+	 * (optional random number seed; <code>null</code> is passed to the fold
+	 * generator when the key is absent).
+	 * 
+	 * @param instanceLabel
+	 *            instance data whose label-to-instance map is regenerated
+	 * @param props
+	 *            fold generation parameters
+	 */
+	@Override
+	public void generateFolds(InstanceData instanceLabel, Properties props) {
+		final int folds = Integer.parseInt(props.getProperty("folds"));
+		final int runs = Integer.parseInt(props.getProperty("runs", "1"));
+		final int minPerClass = Integer.parseInt(props.getProperty(
+				"minPerClass", "0"));
+		Integer randomNumberSeed = null;
+		if (props.containsKey("rand")) {
+			randomNumberSeed = Integer.parseInt(props.getProperty("rand"));
+		}
+		instanceLabel.setLabelToInstanceMap(foldGenerator.generateRuns(
+				instanceLabel.getLabelToInstanceMap(), folds, minPerClass,
+				randomNumberSeed, runs));
+	}
+
+	/**
+	 * assign numeric indices to string class names.
+	 * <p>
+	 * For every label a new {@link BiMap} is created and stored in
+	 * <code>labelToClassIndexMap</code>. Class names that parse as an integer
+	 * keep that integer as their index. All other class names are numbered
+	 * sequentially starting at 1, in the iteration order of the sorted set,
+	 * skipping any index that is already taken by a numeric class name. A
+	 * numeric class name whose value is already taken (e.g. "01" after "1")
+	 * is numbered sequentially as well. Without the skipping a mix such as
+	 * {"1", "a"} would map both names to 1, which a BiMap rejects with an
+	 * IllegalArgumentException.
+	 * 
+	 * @param labelToClasMap
+	 *            label - set of class names for the label
+	 * @param labelToClassIndexMap
+	 *            filled by this method: label - (class name - class index)
+	 */
+	@Override
+	public void fillLabelToClassToIndexMap(
+			Map<String, SortedSet<String>> labelToClasMap,
+			Map<String, BiMap<String, Integer>> labelToClassIndexMap) {
+		for (Map.Entry<String, SortedSet<String>> labelToClass : labelToClasMap
+				.entrySet()) {
+			BiMap<String, Integer> classToIndexMap = HashBiMap.create();
+			labelToClassIndexMap.put(labelToClass.getKey(), classToIndexMap);
+			// pass 1: reserve the indices of numeric class names
+			for (String className : labelToClass.getValue()) {
+				Integer classNumber = null;
+				try {
+					classNumber = Integer.parseInt(className);
+				} catch (NumberFormatException fe) {
+					// not numeric - gets a sequential index in pass 2
+				}
+				if (classNumber != null
+						&& !classToIndexMap.containsValue(classNumber)) {
+					classToIndexMap.put(className, classNumber);
+				}
+			}
+			// pass 2: number the remaining class names, skipping taken indices
+			int nIndex = 1;
+			for (String className : labelToClass.getValue()) {
+				if (classToIndexMap.containsKey(className)) {
+					continue;
+				}
+				while (classToIndexMap.containsValue(nIndex)) {
+					nIndex++;
+				}
+				classToIndexMap.put(className, nIndex++);
+			}
+		}
+	}
+
+	/**
+	 * export the class id to class name map to the file
+	 * <code>class.properties</code>, scoped by output directory and label. In
+	 * the written file the class id is the key and the class name the value.
+	 * 
+	 * @param outdir
+	 *            output directory passed to FileUtil.getScopedFileName
+	 * @param classIdMap
+	 *            class name - class id
+	 * @param label
+	 *            label passed to FileUtil.getScopedFileName
+	 * @throws IOException
+	 *             if the file cannot be written
+	 */
+	public void exportClassIds(String outdir, Map<String, Integer> classIdMap,
+			String label) throws IOException {
+		final String filename = FileUtil.getScopedFileName(outdir, label, null,
+				null, "class.properties");
+		// invert the map: id (as string) -> name
+		final Properties idToName = new Properties();
+		for (Map.Entry<String, Integer> nameAndId : classIdMap.entrySet()) {
+			final String id = nameAndId.getValue().toString();
+			idToName.put(id, nameAndId.getKey());
+		}
+		final BufferedWriter out = new BufferedWriter(new FileWriter(filename));
+		try {
+			idToName.store(out, "class id to class name map");
+		} finally {
+			out.close();
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/OrderedPair.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/OrderedPair.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/OrderedPair.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/OrderedPair.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,59 @@
+package org.apache.ctakes.ytex.kernel;
+
+/**
+ * simple pair class used for cache keys where the result is symmetric, e.g.
+ * similarity measures. The two members are stored in their natural order, so
+ * that <code>new OrderedPair(a, b)</code> and <code>new OrderedPair(b, a)</code>
+ * are equal and have the same hash code.
+ * <p>
+ * <code>null</code> members are allowed; a <code>null</code> member is always
+ * stored first and <code>compareTo</code> is never invoked on or with
+ * <code>null</code>.
+ * 
+ * @author vijay
+ * 
+ * @param <T>
+ *            member type
+ */
+public class OrderedPair<T extends Comparable<T>> {
+	private final T o1;
+	private final T o2;
+
+	/**
+	 * @param o1
+	 *            one member of the pair, may be null
+	 * @param o2
+	 *            the other member of the pair, may be null
+	 */
+	public OrderedPair(T o1, T o2) {
+		super();
+		// null sorts first; only compare when both members are non-null
+		final boolean inOrder = o1 == null
+				|| (o2 != null && o1.compareTo(o2) <= 0);
+		if (inOrder) {
+			this.o1 = o1;
+			this.o2 = o2;
+		} else {
+			this.o1 = o2;
+			this.o2 = o1;
+		}
+	}
+
+	@Override
+	public int hashCode() {
+		final int prime = 31;
+		int result = 1;
+		result = prime * result + ((o1 == null) ? 0 : o1.hashCode());
+		result = prime * result + ((o2 == null) ? 0 : o2.hashCode());
+		return result;
+	}
+
+	@Override
+	public boolean equals(Object obj) {
+		if (this == obj)
+			return true;
+		if (obj == null)
+			return false;
+		if (getClass() != obj.getClass())
+			return false;
+		@SuppressWarnings("unchecked")
+		OrderedPair<T> other = (OrderedPair<T>) obj;
+		if (o1 == null) {
+			if (other.o1 != null)
+				return false;
+		} else if (!o1.equals(other.o1))
+			return false;
+		if (o2 == null) {
+			if (other.o2 != null)
+				return false;
+		} else if (!o2.equals(other.o2))
+			return false;
+		return true;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/Pair.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/Pair.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/Pair.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/Pair.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,48 @@
+package org.apache.ctakes.ytex.kernel;
+
+/**
+ * simple object pair implementation. Unlike an ordered pair the position of
+ * the members matters: (a, b) is not equal to (b, a). Either member may be
+ * null.
+ * 
+ * @author vijay
+ * 
+ * @param <T>
+ *            member type
+ */
+public class Pair<T> {
+	T o1;
+	T o2;
+
+	public Pair(T o1, T o2) {
+		this.o1 = o1;
+		this.o2 = o2;
+	}
+
+	/**
+	 * Two pairs are equal when they are of the same class and their members
+	 * are pairwise equal (or both null).
+	 */
+	@Override
+	public boolean equals(Object obj) {
+		if (this == obj) {
+			return true;
+		}
+		if (obj == null || getClass() != obj.getClass()) {
+			return false;
+		}
+		final Pair<?> that = (Pair<?>) obj;
+		final boolean firstSame = (o1 == null) ? that.o1 == null : o1
+				.equals(that.o1);
+		if (!firstSame) {
+			return false;
+		}
+		return (o2 == null) ? that.o2 == null : o2.equals(that.o2);
+	}
+
+	@Override
+	public int hashCode() {
+		final int h1 = (o1 != null) ? o1.hashCode() : 0;
+		final int h2 = (o2 != null) ? o2.hashCode() : 0;
+		return 31 * (31 + h1) + h2;
+	}
+}
+	

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/RRFtoWideTab.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/RRFtoWideTab.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/RRFtoWideTab.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/RRFtoWideTab.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,68 @@
+package org.apache.ctakes.ytex.kernel;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+
+/**
+ * Convert RRF files to UCS2 tab-delimited format for import into SQL Server.
+ * This adds the Unicode Byte Order Marker to the output file.
+ * <p>
+ * Input is read as UTF-8, output is written as UTF-16LE. For every line a
+ * single trailing '|' is removed and the remaining '|' characters are replaced
+ * by tabs. Empty lines are passed through as empty lines.
+ * 
+ * @author vijay
+ */
+public class RRFtoWideTab {
+
+	/**
+	 * @param args
+	 *            optional input file (args[0]) and output file (args[1]). With
+	 *            exactly one or two arguments the input is read from args[0],
+	 *            otherwise from standard input. With exactly two arguments the
+	 *            output is written to args[1], otherwise to standard output.
+	 * @throws Exception
+	 *             if a file cannot be opened, read or written
+	 */
+	public static void main(String[] args) throws Exception {
+		// only streams opened here are closed; stdin/stdout are left open
+		final boolean bCloseFis = args.length == 1 || args.length == 2;
+		final boolean bCloseFos = args.length == 2;
+		InputStream fis = null;
+		OutputStream fos = null;
+		try {
+			if (bCloseFis) {
+				fis = new FileInputStream(args[0]);
+			} else {
+				fis = System.in;
+			}
+			if (bCloseFos) {
+				fos = new FileOutputStream(args[1]);
+			} else {
+				fos = System.out;
+			}
+			// add the byte order mark (UTF-16 little endian)
+			fos.write(new byte[] { (byte) 0xFF, (byte) 0xFE });
+			// convert from utf8 to utf16
+			BufferedReader r = new BufferedReader(new InputStreamReader(fis,
+					"UTF8"));
+			BufferedWriter w = new BufferedWriter(new OutputStreamWriter(fos,
+					"UTF-16LE"));
+			for (String s = r.readLine(); s != null; s = r.readLine()) {
+				// chop off the trailing '|'; endsWith is safe for empty lines
+				if (s.endsWith("|")) {
+					s = s.substring(0, s.length() - 1);
+				}
+				// replace | with tab
+				w.write(s.replace('|', '\t'));
+				w.newLine();
+			}
+			w.flush();
+		} finally {
+			try {
+				if (bCloseFis && fis != null) {
+					fis.close();
+				}
+			} finally {
+				if (bCloseFos && fos != null) {
+					fos.close();
+				}
+			}
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SVMLinFormatterFactory.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SVMLinFormatterFactory.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SVMLinFormatterFactory.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SVMLinFormatterFactory.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,202 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.semil.SemiLFormatterFactory.SemiLDataFormatter;
+
+
+import com.google.common.base.Strings;
+
+/**
+ * Factory for {@link SVMLinDataFormatter}. For each train/test pair the
+ * formatter creates the following files:
+ * <ul>
+ * <li>[prefix]class.txt - as in semil: instance id \t train/test flag \t target
+ * class id</li>
+ * <li>[prefix]code.properties - map of codes to classes. currently only do a
+ * one-against-all coding</li>
+ * <li>[prefix]code[n]_label.txt - for each class</li>
+ * </ul>
+ * 
+ * @author vijay
+ * 
+ */
+public class SVMLinFormatterFactory implements SparseDataFormatterFactory {
+	public static class SVMLinDataFormatter extends SemiLDataFormatter {
+		private static final Log log = LogFactory
+				.getLog(SVMLinDataFormatter.class);
+
+		public SVMLinDataFormatter(KernelUtil kernelUtil) {
+			super(kernelUtil);
+		}
+
+		/**
+		 * Export the attribute names, then write one libsvm-style line per
+		 * instance id of <code>sparseData</code> to the scoped file
+		 * <code>data.txt</code>.
+		 */
+		@Override
+		protected void exportData(SparseData sparseData, String label,
+				Integer run, Integer fold) throws IOException {
+			exportAttributeNames(sparseData, label, run, fold);
+			String filename = FileUtil.getScopedFileName(outdir, label, run,
+					fold, "data.txt");
+			BufferedWriter wData = null;
+			try {
+				wData = new BufferedWriter(new FileWriter(filename));
+				for (long instanceId : sparseData.getInstanceIds()) {
+					// get line with sparse attribute indices and values
+					SortedMap<Integer, Double> instanceValues = getSparseLineValues(
+							sparseData, numericAttributeMap,
+							nominalAttributeMap, instanceId);
+					// write the line
+					writeLibsvmLine(wData, instanceValues);
+				}
+			} finally {
+				if (wData != null) {
+					wData.close();
+				}
+			}
+		}
+
+		/**
+		 * recode the classes. the codes are bits in some sort of class coding
+		 * scheme. this creates one-against-all codes.
+		 * <p>
+		 * creates [scope]code.properties. file to write the codes to. When
+		 * parsing results, we will read this properties file.
+		 * <p>
+		 * creates one class label file per class for one-against-all
+		 * classification: the class itself is coded 1, other labeled classes
+		 * -1, unlabeled instances (class id 0) stay 0. With exactly two
+		 * classes only the first class is exported and the second one is
+		 * recorded as <code>classOther</code>.
+		 * <p>
+		 * Nothing is written when fewer than two distinct non-zero class ids
+		 * are present.
+		 * 
+		 * @param label
+		 *            label, used for file names and the class weight key
+		 * @param run
+		 *            run, used for file names
+		 * @param fold
+		 *            fold, used for file names
+		 * @param trainInstanceIdToClass
+		 *            map of training instance id to class id
+		 * @param codeToClassNameMap
+		 *            map of class id to class name
+		 * @throws IOException
+		 */
+		private void exportOneAgainstAllCodes(String label, Integer run,
+				Integer fold, SortedMap<Long, Integer> trainInstanceIdToClass,
+				Map<Integer, String> codeToClassNameMap) throws IOException {
+			// file to write the map between codes and classes
+			String classFileName = FileUtil.getScopedFileName(outdir, label,
+					run, fold, "code.properties");
+			SortedSet<Integer> classIds = new TreeSet<Integer>();
+			classIds.addAll(trainInstanceIdToClass.values());
+			// class id 0 marks unlabeled instances, it is not a class
+			classIds.remove(0);
+			// if there is only 1 class, abort
+			if (classIds.size() < 2) {
+				log.warn("<2 classes, skipping export for label " + label
+						+ " run " + run + " fold " + fold);
+				return;
+			}
+			Properties props = new Properties();
+			StringBuilder bCodeList = new StringBuilder();
+			Integer[] classIdArray = classIds.toArray(new Integer[0]);
+			for (int i = 0; i < classIdArray.length; i++) {
+				int classId = classIdArray[i];
+				String className = codeToClassNameMap.get(classId);
+				// recode the instances
+				SortedMap<Long, Integer> mapRecodedInstanceIdToClass = new TreeMap<Long, Integer>();
+				for (Map.Entry<Long, Integer> instanceIdToClassEntry : trainInstanceIdToClass
+						.entrySet()) {
+					int trainClassId = instanceIdToClassEntry.getValue();
+					int codedClassId = 0; // default to unlabeled
+					if (trainClassId == classId) {
+						codedClassId = 1;
+					} else if (trainClassId != 0) {
+						codedClassId = -1;
+					}
+					mapRecodedInstanceIdToClass.put(
+							instanceIdToClassEntry.getKey(), codedClassId);
+				}
+				String labelFileBaseName = FileUtil.getScopedFileName(outdir,
+						label, run, fold, "class" + className + ".txt");
+				exportLabel(labelFileBaseName, mapRecodedInstanceIdToClass);
+				// add the map from code to class
+				props.setProperty(labelFileBaseName + ".class",
+						Integer.toString(classId));
+				props.setProperty(labelFileBaseName + ".className", className);
+				// add the key to the classWeights.properties file that will
+				// have the positive class fraction. the key is of the form
+				// label<label>_class<class>
+				props.setProperty("classrel",
+						formatWeightKey(label, className));
+				// add the code to the list of codes
+				bCodeList.append(labelFileBaseName).append(",");
+				// if there are just 2 classes, stop here
+				if (classIdArray.length == 2) {
+					props.setProperty("classOther",
+							Integer.toString(classIdArray[1]));
+					props.setProperty("classOtherName",
+							codeToClassNameMap.get(classIdArray[1]));
+					break;
+				}
+			}
+			props.setProperty("codes", bCodeList.toString());
+			Writer w = null;
+			try {
+				w = new BufferedWriter(new FileWriter(classFileName));
+				props.store(w, "oneAgainstAll");
+			} finally {
+				if (w != null) {
+					try {
+						w.close();
+					} catch (Exception e) {
+						// store() already flushed; report but do not fail
+						log.warn("error closing " + classFileName, e);
+					}
+				}
+			}
+		}
+
+		/**
+		 * Export the data for the fold if the export scope is the fold, then
+		 * build the training class map and export the one-against-all codes.
+		 */
+		@Override
+		public void initializeFold(SparseData sparseData, String label,
+				Integer run, Integer fold,
+				SortedMap<Boolean, SortedMap<Long, String>> foldInstanceLabelMap)
+				throws IOException {
+			if (SCOPE_FOLD.equals(this.exportProperties.getProperty(SCOPE))) {
+				exportData(sparseData, label, run, fold);
+			}
+			String idFileName = FileUtil.getScopedFileName(outdir, label, run,
+					fold, "id.txt");
+			SortedMap<Long, Integer> trainInstanceIdToClass = super
+					.getTrainingClassMap(idFileName,
+							foldInstanceLabelMap.get(true),
+							foldInstanceLabelMap.get(false),
+							this.labelToClassIndexMap.get(label),
+							sparseData.getInstanceIds());
+			exportOneAgainstAllCodes(label, run, fold, trainInstanceIdToClass,
+					this.labelToClassIndexMap.get(label).inverse());
+		}
+	}
+
+	private KernelUtil kernelUtil;
+
+	/**
+	 * @return a new formatter that uses this factory's KernelUtil
+	 */
+	@Override
+	public SparseDataFormatter getFormatter() {
+		return new SVMLinDataFormatter(kernelUtil);
+	}
+
+	public KernelUtil getKernelUtil() {
+		return kernelUtil;
+	}
+
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * Build the class weight key for a label/class combination.
+	 * 
+	 * @param label
+	 *            label, may be null or empty
+	 * @param className
+	 *            class name
+	 * @return <code>label[label]_class[className]</code>, or just
+	 *         <code>class[className]</code> when the label is null or empty
+	 */
+	public static String formatWeightKey(String label, String className) {
+		return (Strings.isNullOrEmpty(label) ? "" : "label" + label
+				+ "_")
+				+ "class" + className;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SimSvcContextHolder.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SimSvcContextHolder.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SimSvcContextHolder.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SimSvcContextHolder.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,19 @@
+package org.apache.ctakes.ytex.kernel;
+
+import org.springframework.context.ApplicationContext;
+import org.springframework.context.access.ContextSingletonBeanFactoryLocator;
+
+/**
+ * Holds the application context looked up once, at class initialization, via
+ * the bean reference context simSvcBeanRefContext.xml.
+ */
+public class SimSvcContextHolder {
+	/** location of the bean reference context on the classpath */
+	private static final String BEAN_REF_CONTEXT = "classpath*:org/apache/ctakes/ytex/simSvcBeanRefContext.xml";
+	/** name of the bean factory looked up in the bean reference context */
+	private static final String FACTORY_KEY = "kernelApplicationContext";
+
+	static ApplicationContext kernelApplicationContext = null;
+	static {
+		kernelApplicationContext = (ApplicationContext) ContextSingletonBeanFactoryLocator
+				.getInstance(BEAN_REF_CONTEXT).useBeanFactory(FACTORY_KEY)
+				.getFactory();
+	}
+
+	/**
+	 * @return the application context loaded by the static initializer
+	 */
+	public static ApplicationContext getApplicationContext() {
+		return kernelApplicationContext;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseData.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseData.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseData.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseData.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,86 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+/**
+ * Data structure populated by SparseDataExporter that has all the instance
+ * attributes needed for exporting to various formats. Plain holder: every
+ * field starts out as an empty collection and is exposed through a getter and
+ * a setter.
+ * 
+ * @author vijay
+ * 
+ */
+public class SparseData {
+	/**
+	 * the instance ids in this sparse data set
+	 */
+	SortedSet<Long> instanceIds = new TreeSet<Long>();
+	/**
+	 * instance nominal attribute values: instance id - (attribute name -
+	 * value)
+	 */
+	Map<Long, SortedMap<String, String>> instanceNominalWords = new HashMap<Long, SortedMap<String, String>>();
+	/**
+	 * map of instance id to map of attribute name - value pairs
+	 */
+	Map<Long, SortedMap<String, Double>> instanceNumericWords = new HashMap<Long, SortedMap<String, Double>>();
+	/**
+	 * nominal attribute names and values
+	 */
+	SortedMap<String, SortedSet<String>> nominalWordValueMap = new TreeMap<String, SortedSet<String>>();
+	/**
+	 * numeric attribute labels
+	 */
+	SortedSet<String> numericWords = new TreeSet<String>();
+
+	// ---- instance ids ----
+
+	public SortedSet<Long> getInstanceIds() {
+		return this.instanceIds;
+	}
+
+	public void setInstanceIds(SortedSet<Long> instanceIds) {
+		this.instanceIds = instanceIds;
+	}
+
+	// ---- nominal attributes ----
+
+	public Map<Long, SortedMap<String, String>> getInstanceNominalWords() {
+		return this.instanceNominalWords;
+	}
+
+	public void setInstanceNominalWords(
+			Map<Long, SortedMap<String, String>> instanceNominalWords) {
+		this.instanceNominalWords = instanceNominalWords;
+	}
+
+	public SortedMap<String, SortedSet<String>> getNominalWordValueMap() {
+		return this.nominalWordValueMap;
+	}
+
+	public void setNominalWordValueMap(
+			SortedMap<String, SortedSet<String>> nominalWordValueMap) {
+		this.nominalWordValueMap = nominalWordValueMap;
+	}
+
+	// ---- numeric attributes ----
+
+	public Map<Long, SortedMap<String, Double>> getInstanceNumericWords() {
+		return this.instanceNumericWords;
+	}
+
+	public void setInstanceNumericWords(
+			Map<Long, SortedMap<String, Double>> instanceNumericWords) {
+		this.instanceNumericWords = instanceNumericWords;
+	}
+
+	public SortedSet<String> getNumericWords() {
+		return this.numericWords;
+	}
+
+	public void setNumericWords(SortedSet<String> numericWords) {
+		this.numericWords = numericWords;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporter.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporter.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporter.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporter.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,28 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+import java.util.InvalidPropertiesFormatException;
+import java.util.Properties;
+
+public interface SparseDataExporter {
+	
+	public enum ScopeEnum {
+		LABEL("label"),
+		FOLD("fold");
+		private String scope;
+		ScopeEnum(String scope) {
+			this.scope = scope;
+		}
+		public String getScope() {
+			return scope;
+		}
+	}
+
+	public abstract void exportData(String propertiesFile, String format)
+			throws IOException, InvalidPropertiesFormatException;
+
+	public abstract void exportData(Properties props,
+			SparseDataFormatter formatter, BagOfWordsDecorator bDecorator)
+			throws IOException;
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporterImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporterImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporterImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataExporterImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,442 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.InvalidPropertiesFormatException;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.RowCallbackHandler;
+import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
+import org.springframework.jdbc.core.simple.SimpleJdbcTemplate;
+import org.springframework.transaction.TransactionStatus;
+import org.springframework.transaction.support.TransactionCallback;
+import org.springframework.transaction.support.TransactionTemplate;
+
+public class SparseDataExporterImpl implements SparseDataExporter {
+
+	private static final Log log = LogFactory
+			.getLog(SparseDataExporterImpl.class);
+
+	@SuppressWarnings("static-access")
+	public static void main(String args[]) throws IOException {
+		Options options = new Options();
+		options.addOption(OptionBuilder
+				.withArgName("prop")
+				.hasArg()
+				.isRequired()
+				.withDescription(
+						"property file with queries and other parameters.")
+				.create("prop"));
+		options.addOption(OptionBuilder.withArgName("type").hasArg()
+				.isRequired()
+				.withDescription("export format; valid values: weka, libsvm")
+				.create("type"));
+		if (args.length == 0)
+			printHelp(options);
+		else {
+			try {
+				CommandLineParser parser = new GnuParser();
+				CommandLine line = parser.parse(options, args);
+				String propFile = line.getOptionValue("prop");
+				String format = line.getOptionValue("type");
+				SparseDataExporter exporter = KernelContextHolder
+						.getApplicationContext().getBean(
+								SparseDataExporter.class);
+				exporter.exportData(propFile, format);
+			} catch (ParseException pe) {
+				printHelp(options);
+			}
+		}
+	}
+
+	private static void printHelp(Options options) {
+		HelpFormatter formatter = new HelpFormatter();
+
+		formatter.printHelp("java " + SparseDataExporterImpl.class.getName()
+				+ " export sparse data", options);
+	}
+
+	protected JdbcTemplate jdbcTemplate;
+	protected KernelUtil kernelUtil;
+
+	protected NamedParameterJdbcTemplate namedJdbcTemplate;
+
+	protected Map<String, SparseDataFormatterFactory> nameToFormatterMap = new HashMap<String, SparseDataFormatterFactory>();
+
+	protected SimpleJdbcTemplate simpleJdbcTemplate;
+
+	protected TransactionTemplate txTemplateNew;
+
+	/**
+	 * Default constructor; collaborators are assigned after construction.
+	 */
+	public SparseDataExporterImpl() {
+	}
+
+	/**
+	 * Record a nominal attribute value for an instance: registers the instance
+	 * id, stores the value under the attribute name for that instance, and
+	 * adds the value to the set of known values of the attribute.
+	 * 
+	 * @param sparseData
+	 *            data structure to update
+	 * @param instanceId
+	 *            instance the value belongs to
+	 * @param word
+	 *            attribute name
+	 * @param wordValue
+	 *            attribute value
+	 */
+	protected void addNominalWordToInstance(SparseData sparseData,
+			long instanceId, String word, String wordValue) {
+		// a set ignores duplicates, no need to check first
+		sparseData.getInstanceIds().add(instanceId);
+		SortedMap<String, String> attributesOfInstance = sparseData
+				.getInstanceNominalWords().get(instanceId);
+		SortedSet<String> knownValues = sparseData.getNominalWordValueMap()
+				.get(word);
+		if (attributesOfInstance == null) {
+			attributesOfInstance = new TreeMap<String, String>();
+			sparseData.getInstanceNominalWords().put(instanceId,
+					attributesOfInstance);
+		}
+		if (knownValues == null) {
+			knownValues = new TreeSet<String>();
+			sparseData.getNominalWordValueMap().put(word, knownValues);
+		}
+		attributesOfInstance.put(word, wordValue);
+		knownValues.add(wordValue);
+	}
+
+	/**
+	 * Record a numeric attribute value for an instance: registers the instance
+	 * id, stores the value under the attribute name for that instance, and
+	 * adds the attribute name to the set of numeric attribute names.
+	 * 
+	 * @param sparseData
+	 *            data structure to update
+	 * @param instanceId
+	 *            instance the value belongs to
+	 * @param word
+	 *            attribute name
+	 * @param wordValue
+	 *            attribute value
+	 */
+	protected void addNumericWordToInstance(SparseData sparseData,
+			long instanceId, String word, double wordValue) {
+		// a set ignores duplicates, no need to check first
+		sparseData.getInstanceIds().add(instanceId);
+		SortedMap<String, Double> valuesOfInstance = sparseData
+				.getInstanceNumericWords().get(instanceId);
+		if (valuesOfInstance == null) {
+			valuesOfInstance = new TreeMap<String, Double>();
+			sparseData.getInstanceNumericWords().put(instanceId,
+					valuesOfInstance);
+		}
+		valuesOfInstance.put(word, wordValue);
+		sparseData.getNumericWords().add(word);
+	}
+
+	/**
+	 * Export the instances in <code>instanceLabel</code> through the given
+	 * formatter, walking label - run - fold - train/test set.
+	 * <p>
+	 * The property <code>scope</code> decides when the sparse data is loaded:
+	 * once up front when the property is absent, once per label when it is
+	 * "label", once per fold when it is "fold". The formatter callbacks are
+	 * invoked in the order initializeExport, then per label initializeLabel,
+	 * per fold initializeFold, exportFold for each train/test set, clearFold,
+	 * and finally clearLabel.
+	 * 
+	 * @param instanceLabel
+	 *            instances to export
+	 * @param formatter
+	 *            formatter that writes the data
+	 * @param properties
+	 *            export properties: scope, numericWordQuery, nominalWordQuery,
+	 *            prepareScript, prepareScriptDelimiter (default ";")
+	 * @param bDecorator
+	 *            passed through to loadData
+	 * @throws IOException
+	 */
+	public void exportData(InstanceData instanceLabel,
+			SparseDataFormatter formatter, Properties properties,
+			BagOfWordsDecorator bDecorator) throws IOException {
+		String scope = properties.getProperty("scope", null);
+		SparseData sparseData = null;
+		// no scope: load all data once, before anything is exported
+		if (scope == null) {
+			sparseData = this.loadData(instanceLabel,
+					properties.getProperty("numericWordQuery"),
+					properties.getProperty("nominalWordQuery"),
+					properties.getProperty("prepareScript"),
+					properties.getProperty("prepareScriptDelimiter", ";"),
+					bDecorator, null, null, null);
+		}
+		// sparseData is still null here when a scope is set
+		formatter.initializeExport(instanceLabel, properties, sparseData);
+		for (String label : instanceLabel.getLabelToInstanceMap().keySet()) {
+			// label scope: reload the data for every label
+			if ("label".equals(scope)) {
+				sparseData = this.loadData(instanceLabel,
+						properties.getProperty("numericWordQuery"),
+						properties.getProperty("nominalWordQuery"),
+						properties.getProperty("prepareScript"),
+						properties.getProperty("prepareScriptDelimiter", ";"),
+						bDecorator, label, null, null);
+			}
+			formatter
+					.initializeLabel(label, instanceLabel
+							.getLabelToInstanceMap().get(label), properties,
+							sparseData);
+			for (int run : instanceLabel.getLabelToInstanceMap().get(label)
+					.keySet()) {
+				for (int fold : instanceLabel.getLabelToInstanceMap()
+						.get(label).get(run).keySet()) {
+					// only log when there is something to tell apart
+					if (log.isInfoEnabled()
+							&& (label.length() > 0 || run > 0 || fold > 0))
+						log.info("exporting, label " + label + " run " + run
+								+ " fold " + fold);
+					// fold scope: reload the data for every fold
+					if ("fold".equals(scope)) {
+						sparseData = this.loadData(instanceLabel, properties
+								.getProperty("numericWordQuery"), properties
+								.getProperty("nominalWordQuery"), properties
+								.getProperty("prepareScript"), properties
+								.getProperty("prepareScriptDelimiter", ";"),
+								bDecorator, label, fold, run);
+					}
+					formatter.initializeFold(sparseData, label, run, fold,
+							instanceLabel.getLabelToInstanceMap().get(label)
+									.get(run).get(fold));
+					for (boolean train : instanceLabel.getLabelToInstanceMap()
+							.get(label).get(run).get(fold).keySet()) {
+						// exportFold gets null instead of 0 for run and fold;
+						// initializeFold above gets the raw values
+						formatter.exportFold(sparseData, instanceLabel
+								.getLabelToInstanceMap().get(label).get(run)
+								.get(fold).get(train), train, label,
+								0 == run ? null : run, 0 == fold ? null : fold);
+					}
+					formatter.clearFold();
+				}
+			}
+			formatter.clearLabel();
+		}
+	}
+
+	/*
+	 * (non-Javadoc)
+	 * 
+	 * Loads the instances with the query in property "instanceClassQuery",
+	 * generates folds when the property "folds" is present, then exports.
+	 * 
+	 * @see org.apache.ctakes.ytex.kernel.SparseDataExporter#exportData(java.util.Properties,
+	 * org.apache.ctakes.ytex.kernel.SparseDataFormatter, org.apache.ctakes.ytex.kernel.BagOfWordsDecorator)
+	 */
+	@Override
+	public void exportData(Properties props, SparseDataFormatter formatter,
+			BagOfWordsDecorator bDecorator) throws IOException {
+		final String instanceClassQuery = props
+				.getProperty("instanceClassQuery");
+		final InstanceData instanceLabel = this.getKernelUtil().loadInstances(
+				instanceClassQuery);
+		final boolean foldsRequested = props.containsKey("folds");
+		if (foldsRequested) {
+			this.getKernelUtil().generateFolds(instanceLabel, props);
+		}
+		this.exportData(instanceLabel, formatter, props, bDecorator);
+	}
+
+	/**
+	 * Load the export properties from a file and run the export with the
+	 * formatter registered under the given format name.
+	 * 
+	 * @param propertiesFile
+	 *            handed to {@link KernelUtil#loadProperties} to fill the
+	 *            export properties.
+	 * @param format
+	 *            name of the formatter; lower-cased (locale independent)
+	 *            before it is looked up in the name-to-formatter map.
+	 * @throws IOException
+	 *             propagated from property loading or from the export.
+	 * @throws IllegalArgumentException
+	 *             if <code>format</code> is null or no formatter factory is
+	 *             registered under that name.
+	 * @see org.apache.ctakes.ytex.kernel.SparseDataExporter#exportData(java.lang.String,
+	 *      java.lang.String)
+	 */
+	@Override
+	public void exportData(String propertiesFile, String format)
+			throws IOException, InvalidPropertiesFormatException {
+		Properties props = new Properties();
+		this.getKernelUtil().loadProperties(propertiesFile, props);
+		if (format == null) {
+			throw new IllegalArgumentException("format must not be null");
+		}
+		// Locale.ROOT: the default-locale toLowerCase() maps 'I' to a dotless
+		// i under e.g. a Turkish locale, which would break the map lookup.
+		SparseDataFormatterFactory factory = nameToFormatterMap.get(format
+				.toLowerCase(java.util.Locale.ROOT));
+		if (factory == null) {
+			// fail with a usable message instead of a NullPointerException
+			throw new IllegalArgumentException("unknown format: " + format
+					+ ", known formats: " + nameToFormatterMap.keySet());
+		}
+		this.exportData(props, factory.getFormatter(), null);
+	}
+
+	/**
+	 * Return the data source held by the JDBC template.
+	 * 
+	 * @param ds
+	 *            not read by this method.
+	 * @return the data source reported by <code>jdbcTemplate</code>.
+	 */
+	public DataSource getDataSource(DataSource ds) {
+		final DataSource current = this.jdbcTemplate.getDataSource();
+		return current;
+	}
+
+	/**
+	 * @return the kernel utility used to load instances and properties and
+	 *         to generate folds.
+	 */
+	public KernelUtil getKernelUtil() {
+		return this.kernelUtil;
+	}
+
+	/**
+	 * @return the map from format name to formatter factory (the same
+	 *         instance that is held by this object, not a copy).
+	 */
+	public Map<String, SparseDataFormatterFactory> getNameToFormatterMap() {
+		return this.nameToFormatterMap;
+	}
+
+	/**
+	 * run the prepare script if defined.
+	 * 
+	 * @param prepareScript
+	 *            sequence of sql statements to be executed with named params.
+	 * @param prepareScriptDelimiter
+	 *            delimiter separating the sql statements.
+	 * @param params
+	 *            for named parameters in sql statements.
+	 */
+	protected void prepare(final String prepareScript,
+			final String prepareScriptDelimiter,
+			final Map<String, Object> params) {
+		if (prepareScript != null && prepareScript.length() > 0) {
+			String[] statements = prepareScript.split(prepareScriptDelimiter);
+			// throw out empty lines
+			for (String sql : statements) {
+				if (sql != null && sql.trim().length() > 0) {
+					this.namedJdbcTemplate.update(sql, params);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Run the prepare script and then the nominal word query inside one
+	 * <code>txTemplateNew</code> callback, adding every result row to the
+	 * sparse data.
+	 * 
+	 * @param sql
+	 *            result set has 3 columns. 1st column - instance id (read as
+	 *            long). 2nd column - word. 3rd column - word value (read as
+	 *            string).
+	 * @param prepareScript
+	 *            statements executed before the query; see
+	 *            {@link #prepare(String, String, Map)}.
+	 * @param prepareScriptDelimiter
+	 *            delimiter separating the statements of the prepare script.
+	 * @param sparseData
+	 *            receives each row via <code>addNominalWordToInstance</code>.
+	 * @param params
+	 *            named parameters for the prepare script and the query.
+	 */
+	protected void getNominalInstanceWords(final String sql,
+			final String prepareScript, final String prepareScriptDelimiter,
+			final SparseData sparseData, final Map<String, Object> params) {
+		txTemplateNew.execute(new TransactionCallback<Object>() {
+
+			@Override
+			public Object doInTransaction(TransactionStatus txStatus) {
+				// the prepare script must run in the same callback as the
+				// query that depends on it
+				prepare(prepareScript, prepareScriptDelimiter, params);
+				namedJdbcTemplate.query(sql, params, new RowCallbackHandler() {
+
+					@Override
+					public void processRow(ResultSet rs) throws SQLException {
+						long instanceId = rs.getLong(1);
+						String word = rs.getString(2);
+						String wordValue = rs.getString(3);
+						addNominalWordToInstance(sparseData, instanceId, word,
+								wordValue);
+					}
+				});
+				// nothing to return; the results are collected in sparseData
+				return null;
+			}
+		});
+	}
+
+	/**
+	 * Run the prepare script and then the numeric word query inside one
+	 * <code>txTemplateNew</code> callback, adding every result row to the
+	 * sparse data.
+	 * 
+	 * @param sql
+	 *            result 1st column: instance id (read as long), 2nd column:
+	 *            word, 3rd column: numeric word value (read as double).
+	 * @param prepareScript
+	 *            statements executed before the query; see
+	 *            {@link #prepare(String, String, Map)}.
+	 * @param prepareScriptDelimiter
+	 *            delimiter separating the statements of the prepare script.
+	 * @param sparseData
+	 *            receives each row via <code>addNumericWordToInstance</code>.
+	 * @param params
+	 *            named parameters for the prepare script and the query.
+	 */
+	protected void getNumericInstanceWords(final String sql,
+			final String prepareScript, final String prepareScriptDelimiter,
+			final SparseData sparseData, final Map<String, Object> params) {
+		txTemplateNew.execute(new TransactionCallback<Object>() {
+
+			@Override
+			public Object doInTransaction(TransactionStatus txStatus) {
+				prepare(prepareScript, prepareScriptDelimiter, params);
+				final RowCallbackHandler rowHandler = new RowCallbackHandler() {
+
+					@Override
+					public void processRow(ResultSet rs) throws SQLException {
+						// columns are read in result set order: id, word,
+						// value
+						addNumericWordToInstance(sparseData, rs.getLong(1),
+								rs.getString(2), rs.getDouble(3));
+					}
+				};
+				namedJdbcTemplate.query(sql, params, rowHandler);
+				return null;
+			}
+
+		});
+	}
+
+	/**
+	 * @return the transaction template in which the prepare script and the
+	 *         word queries are executed.
+	 */
+	public TransactionTemplate getTxTemplateNew() {
+		return this.txTemplateNew;
+	}
+
+	/**
+	 * 
+	 * @param instanceLabel
+	 *            instance data: label - fold - instance id - class map
+	 * @param instanceNumericWordQuery
+	 *            query to get numeric attributes
+	 * @param instanceNominalWordQuery
+	 *            query to get nominal attributes
+	 * @param prepareScript
+	 *            prepare script to be executed in same tx as instance attribute
+	 *            queries
+	 * @param prepareScriptDelimiter
+	 *            delimiter for statements in prepare script
+	 * @param bDecorator
+	 *            decorator to add attributes
+	 * @param label
+	 * @param fold
+	 * @param run
+	 * @return
+	 */
+	protected SparseData loadData(InstanceData instanceLabel,
+			String instanceNumericWordQuery, String instanceNominalWordQuery,
+			String prepareScript, String prepareScriptDelimiter,
+			BagOfWordsDecorator bDecorator, String label, Integer fold,
+			Integer run) {
+		SparseData sparseData = new SparseData();
+		Map<String, Object> params = new HashMap<String, Object>();
+		if (label != null && label.length() > 0)
+			params.put("label", label);
+		if (fold != null && fold != 0)
+			params.put("fold", fold);
+		if (run != null && run != 0)
+			params.put("run", run);
+		// load numeric attributes
+		if (instanceNumericWordQuery != null
+				&& instanceNumericWordQuery.trim().length() > 0)
+			this.getNumericInstanceWords(instanceNumericWordQuery,
+					prepareScript, prepareScriptDelimiter, sparseData, params);
+		// added to support adding gram matrix index in GramMatrixExporter
+		if (bDecorator != null)
+			bDecorator.decorateNumericInstanceWords(
+					sparseData.getInstanceNumericWords(),
+					sparseData.getNumericWords());
+		// load nominal attributes
+		if (instanceNominalWordQuery != null
+				&& instanceNominalWordQuery.trim().length() > 0)
+			this.getNominalInstanceWords(instanceNominalWordQuery,
+					prepareScript, prepareScriptDelimiter, sparseData, params);
+		if (bDecorator != null)
+			bDecorator.decorateNominalInstanceWords(
+					sparseData.getInstanceNominalWords(),
+					sparseData.getNominalWordValueMap());
+		return sparseData;
+	}
+
+	/**
+	 * Create the three JDBC templates (plain, simple and named-parameter)
+	 * on top of the given data source.
+	 * 
+	 * @param ds
+	 *            data source all templates are created with.
+	 */
+	public void setDataSource(DataSource ds) {
+		this.jdbcTemplate = new JdbcTemplate(ds);
+		this.simpleJdbcTemplate = new SimpleJdbcTemplate(ds);
+		this.namedJdbcTemplate = new NamedParameterJdbcTemplate(ds);
+	}
+
+	/**
+	 * @param kernelUtil
+	 *            kernel utility used to load instances and properties and to
+	 *            generate folds.
+	 */
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * @param nameToFormatterMap
+	 *            map from format name to formatter factory. Lookups use the
+	 *            lower-cased format name.
+	 */
+	public void setNameToFormatterMap(
+			Map<String, SparseDataFormatterFactory> nameToFormatterMap) {
+		this.nameToFormatterMap = nameToFormatterMap;
+	}
+
+	/**
+	 * @param txTemplateNew
+	 *            transaction template in which the prepare script and the
+	 *            word queries are executed.
+	 */
+	public void setTxTemplateNew(TransactionTemplate txTemplateNew) {
+		this.txTemplateNew = txTemplateNew;
+	}
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatter.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatter.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatter.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatter.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,89 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.IOException;
+import java.util.Properties;
+import java.util.SortedMap;
+
+/**
+ * Stateful callback used by the sparse data exporter to write sparse data in
+ * one specific format. An instance is created, called for each label, fold,
+ * training and test set, and then thrown away.
+ * <p>
+ * Within a label the exporter calls, per run and fold,
+ * {@link #initializeFold}, then {@link #exportFold} for each train/test set,
+ * then {@link #clearFold}; {@link #clearLabel} follows once all folds of the
+ * label are done.
+ * 
+ * @author vijay
+ * 
+ */
+public interface SparseDataFormatter {
+
+	/**
+	 * Property key that selects the scope.
+	 */
+	String SCOPE = "scope";
+	/**
+	 * Scope value: fold.
+	 */
+	String SCOPE_FOLD = "fold";
+	/**
+	 * Scope value: label.
+	 */
+	String SCOPE_LABEL = "label";
+	/**
+	 * Value {@code instance_id}. SparseMatrix adds the instance_id attribute
+	 * to the matrix. This is a reserved attribute name.
+	 */
+	String ATTR_INSTANCE_ID = "instance_id";
+
+	/**
+	 * Initialize the export - called once.
+	 * 
+	 * @param instanceLabel
+	 *            instance data to be exported
+	 * @param properties
+	 *            export properties
+	 * @param sparseData
+	 *            sparse data available at this point
+	 * @throws IOException
+	 */
+	void initializeExport(InstanceData instanceLabel, Properties properties,
+			SparseData sparseData) throws IOException;
+
+	/**
+	 * Initialize data structures for a label.
+	 * 
+	 * @param label
+	 *            label to be exported
+	 * @param labelInstances
+	 *            nested maps for this label; going by the exporter's loops the
+	 *            levels are run, fold, train flag, and finally a map from a
+	 *            long id to a string (presumably instance id to class)
+	 * @param properties
+	 *            export properties
+	 * @param sparseData
+	 *            sparse data available at this point
+	 * @throws IOException
+	 */
+	void initializeLabel(
+			String label,
+			SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>> labelInstances,
+			Properties properties, SparseData sparseData) throws IOException;
+
+	/**
+	 * Initialize data structures for the fold that will be exported. Called
+	 * before the fold's train/test sets are exported.
+	 * 
+	 * @param sparseData
+	 *            data for the fold
+	 * @param label
+	 *            label being exported
+	 * @param run
+	 *            run being exported
+	 * @param fold
+	 *            fold being exported
+	 * @param foldInstanceLabelMap
+	 *            the fold's maps, keyed by train flag
+	 * @throws IOException
+	 */
+	void initializeFold(SparseData sparseData, String label, Integer run,
+			Integer fold,
+			SortedMap<Boolean, SortedMap<Long, String>> foldInstanceLabelMap)
+			throws IOException;
+
+	/**
+	 * Export one train or test set of a fold. Called once per train/test
+	 * set, 2x per fold.
+	 * 
+	 * @param sparseData
+	 *            data for the fold
+	 * @param sortedMap
+	 *            the entries of the set being exported
+	 * @param train
+	 *            train flag of the set being exported
+	 * @param label
+	 *            label being exported
+	 * @param run
+	 *            run being exported; the exporter passes null for run 0
+	 * @param fold
+	 *            fold being exported; the exporter passes null for fold 0
+	 * @throws IOException
+	 */
+	void exportFold(SparseData sparseData,
+			SortedMap<Long, String> sortedMap, boolean train, String label,
+			Integer run, Integer fold) throws IOException;
+
+	/**
+	 * Clear all data structures set up during initializeFold.
+	 */
+	void clearFold();
+
+	/**
+	 * Counterpart of initializeLabel; called after all folds of a label have
+	 * been exported.
+	 */
+	void clearLabel();
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatterFactory.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatterFactory.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatterFactory.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SparseDataFormatterFactory.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,8 @@
+package org.apache.ctakes.ytex.kernel;
+
+
+/**
+ * Supplies {@link SparseDataFormatter} instances to the exporter.
+ */
+public interface SparseDataFormatterFactory {
+
+	/**
+	 * @return the formatter to use for an export. NOTE(review): presumably a
+	 *         fresh instance per call, since formatters are stateful -
+	 *         confirm against the implementations.
+	 */
+	SparseDataFormatter getFormatter();
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SvmlinEvaluationParser.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SvmlinEvaluationParser.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SvmlinEvaluationParser.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/SvmlinEvaluationParser.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,201 @@
+package org.apache.ctakes.ytex.kernel;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Properties;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.kernel.model.SVMClassifierEvaluation;
+
+import com.google.common.collect.BiMap;
+
+
+/**
+ * Parses the output of an svmlin run and stores it as a classifier
+ * evaluation.
+ */
+public class SvmlinEvaluationParser extends BaseClassifierEvaluationParser {
+	private static final Log log = LogFactory
+			.getLog(SvmlinEvaluationParser.class);
+	/** extracts the single-digit value of the <code>-A</code> option */
+	public static Pattern pAlgo = Pattern.compile("-A\\s+(\\d)");
+	/** extracts the value of the <code>-W</code> option */
+	public static Pattern pLambdaW = Pattern.compile("-W\\s+([\\d\\.eE-]+)");
+	/** extracts the value of the <code>-U</code> option */
+	public static Pattern pLambaU = Pattern.compile("-U\\s+([\\d\\.eE-]+)");
+	/** suffix stripped from a code name to get the base of its outputs file */
+	private static final String CODE_SUFFIX = ".txt";
+
+	/**
+	 * parse directory. Nothing is done unless
+	 * <code>options.properties</code> in the output directory is readable.
+	 * Files used:
+	 * <ul>
+	 * <li>options.properties (output dir) - properties file with needed
+	 * parameter settings (see ParseOption)
+	 * <li>[base]id.txt (data dir) - instance class info
+	 * <li>[base]code.properties (data dir) - codes and their class names
+	 * <li>[code base].outputs (output dir) - predictions, one file per code
+	 * </ul>
+	 * 
+	 * @param dataDir
+	 *            directory with the exported data
+	 * @param outputDir
+	 *            directory with the svmlin output
+	 * @throws IOException
+	 *             if a file cannot be read or is inconsistent
+	 */
+	@Override
+	public void parseDirectory(File dataDir, File outputDir) throws IOException {
+		String optionsFile = outputDir.getPath() + File.separator
+				+ "options.properties";
+		if (checkFileRead(optionsFile)) {
+			// read options.properties
+			Properties props = this.loadProps(outputDir);
+			SVMClassifierEvaluation eval = new SVMClassifierEvaluation();
+			// set algorithm
+			eval.setAlgorithm("svmlin");
+			// parse results
+			parseResults(dataDir, outputDir, eval, props);
+		}
+	}
+
+	/**
+	 * Fill the evaluation from the file base name, the properties and the
+	 * svmlin command line, then parse and store the predictions.
+	 * 
+	 * @param dataDir
+	 *            directory with the exported data
+	 * @param outputDir
+	 *            directory with the svmlin output
+	 * @param eval
+	 *            evaluation to populate
+	 * @param props
+	 *            contents of options.properties
+	 * @throws IOException
+	 */
+	private void parseResults(File dataDir, File outputDir,
+			SVMClassifierEvaluation eval, Properties props) throws IOException {
+		// parse fold, run, label from file base name
+		String fileBaseName = this.getFileBaseName(props);
+		initClassifierEvaluation(fileBaseName, eval);
+		// initialize common properties
+		initClassifierEvaluationFromProperties(props, eval);
+		// parse options from command line
+		String options = props
+				.getProperty(ParseOption.EVAL_LINE.getOptionKey());
+		if (options != null) {
+			// -A is stored in the kernel field, -W in cost, -U in gamma
+			eval.setKernel(parseIntOption(pAlgo, options));
+			if (eval.getKernel() == null)
+				eval.setKernel(1);
+			eval.setCost(parseDoubleOption(pLambdaW, options));
+			eval.setGamma(parseDoubleOption(pLambaU, options));
+		}
+		// parse predictions
+		if (fileBaseName != null && fileBaseName.length() > 0) {
+			List<InstanceClassInfo> listClassInfo = loadInstanceClassInfo(
+					dataDir, fileBaseName + "id.txt");
+			// process .output files
+			if (listClassInfo != null) {
+				BiMap<Integer, String> classIdToNameMap = loadClassIdMap(
+						dataDir, eval.getLabel());
+				parseSvmlinOutput(dataDir, outputDir, eval, fileBaseName,
+						props, listClassInfo, classIdToNameMap);
+				// save the classifier evaluation
+				storeSemiSupervised(props, eval, classIdToNameMap);
+			}
+		} else {
+			log.warn("couldn't parse directory; kernel.label.base not defined. Dir: "
+					+ outputDir);
+		}
+
+	}
+
+	/**
+	 * support multi-class classification: read the predictions for every
+	 * code listed in code.properties and pick a class per instance.
+	 * 
+	 * @param dataDir
+	 *            directory containing [fileBaseName]code.properties
+	 * @param outputDir
+	 *            directory containing the .outputs files
+	 * @param eval
+	 *            evaluation to update with the predictions
+	 * @param fileBaseName
+	 *            prefix of the code.properties file
+	 * @param props
+	 *            contents of options.properties
+	 * @param listClassInfo
+	 *            instances in the order of the prediction files
+	 * @param classIdToNameMap
+	 *            class id to class name; its inverse is used for the update
+	 * @throws IOException
+	 *             if code.properties lists no codes, a code is too short to
+	 *             carry the .txt suffix, a code has no class name, or a
+	 *             prediction file cannot be read
+	 */
+	private void parseSvmlinOutput(File dataDir, File outputDir,
+			SVMClassifierEvaluation eval, String fileBaseName,
+			Properties props, List<InstanceClassInfo> listClassInfo,
+			BiMap<Integer, String> classIdToNameMap) throws IOException {
+		Properties codeProps = FileUtil.loadProperties(
+				dataDir.getAbsolutePath() + "/" + fileBaseName
+						+ "code.properties", false);
+		String codeList = codeProps.getProperty("codes", "");
+		// "".split(",") yields one empty string, not an empty array, so the
+		// blank case has to be rejected before splitting
+		if (codeList.trim().length() == 0) {
+			throw new IOException("invalid code.properties: " + fileBaseName);
+		}
+		String[] codes = codeList.split(",");
+		if (codes.length == 0) {
+			throw new IOException("invalid code.properties: " + fileBaseName);
+		}
+		SortedMap<String, double[]> codeToPredictionMap = new TreeMap<String, double[]>();
+		String otherClassName = null;
+		if (codes.length == 1) {
+			otherClassName = codeProps.getProperty("classOtherName");
+		}
+		for (String code : codes) {
+			// determine class for given code
+			String className = codeProps.getProperty(code + ".className");
+			if (className == null || code.length() < CODE_SUFFIX.length()) {
+				throw new IOException("invalid code.properties: "
+						+ fileBaseName + ", code: " + code);
+			}
+			String codeBase = code.substring(0,
+					code.length() - CODE_SUFFIX.length());
+			// read predictions for given class
+			codeToPredictionMap.put(
+					className,
+					readPredictions(outputDir.getAbsolutePath() + "/" + codeBase
+							+ ".outputs", listClassInfo.size()));
+		}
+		// iterate over predictions for each instance, figure out which class is
+		// the winner
+		String[] classPredictions = new String[listClassInfo.size()];
+		if (otherClassName != null) {
+			// single code: positive prediction means the code's class,
+			// anything else the 'other' class
+			Map.Entry<String, double[]> classToPred = codeToPredictionMap
+					.entrySet().iterator().next();
+			for (int i = 0; i < classPredictions.length; i++) {
+				classPredictions[i] = classToPred.getValue()[i] > 0 ? classToPred
+						.getKey() : otherClassName;
+			}
+		} else {
+			for (int i = 0; i < classPredictions.length; i++) {
+				classPredictions[i] = pickWinningClass(codeToPredictionMap, i);
+			}
+		}
+		boolean storeUnlabeled = YES.equalsIgnoreCase(props.getProperty(
+				ParseOption.STORE_UNLABELED.getOptionKey(),
+				ParseOption.STORE_UNLABELED.getDefaultValue()));
+		updateSemiSupervisedPredictions(eval, listClassInfo, storeUnlabeled,
+				classPredictions, classIdToNameMap.inverse());
+	}
+
+	/**
+	 * pick the class with the highest prediction for one instance. On equal
+	 * predictions the class that sorts last by name wins.
+	 * 
+	 * @param classToPredictions
+	 *            class name to predictions, indexed by instance
+	 * @param instanceIndex
+	 *            index of the instance
+	 * @return name of the winning class, null if the map is empty
+	 */
+	private static String pickWinningClass(
+			SortedMap<String, double[]> classToPredictions, int instanceIndex) {
+		String winner = null;
+		double winnerScore = 0d;
+		for (Map.Entry<String, double[]> classToPred : classToPredictions
+				.entrySet()) {
+			double score = classToPred.getValue()[instanceIndex];
+			// Double.compare gives the same ordering as a map keyed by
+			// Double; '>=' lets a later class replace an equal score
+			if (winner == null || Double.compare(score, winnerScore) >= 0) {
+				winner = classToPred.getKey();
+				winnerScore = score;
+			}
+		}
+		return winner;
+	}
+
+	/**
+	 * read the predictions, one number per line.
+	 * 
+	 * @param predict
+	 *            path of the predictions file
+	 * @param expectedSize
+	 *            number of predictions the file should hold
+	 * @return array of length expectedSize with the predictions in file
+	 *         order
+	 * @throws FileNotFoundException
+	 *             if the file does not exist
+	 * @throws IOException
+	 *             if the file holds more lines than expected, or fewer than
+	 *             expectedSize - 1 lines
+	 */
+	private double[] readPredictions(String predict, int expectedSize)
+			throws FileNotFoundException, IOException {
+		BufferedReader outputReader = null;
+		try {
+			double predictions[] = new double[expectedSize];
+			int i = 0;
+			String prediction = null;
+			outputReader = new BufferedReader(new FileReader(predict));
+			while ((prediction = outputReader.readLine()) != null) {
+				if (i < expectedSize)
+					predictions[i++] = (Double.parseDouble(prediction));
+				else
+					throw new IOException(predict
+							+ ":  more predictions than expected");
+			}
+			// NOTE(review): a file with exactly expectedSize - 1 lines is
+			// accepted and the last prediction stays 0.0. This looks like an
+			// off-by-one (i < expectedSize); confirm before tightening.
+			if (i < expectedSize - 1)
+				throw new IOException(predict
+						+ ":  less predictions than expected");
+			return predictions;
+		} finally {
+			if (outputReader != null) {
+				try {
+					outputReader.close();
+				} catch (Exception ignore) {
+					// best effort: a failure to close must not mask the
+					// result or an exception from the try block
+				}
+			}
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/dao/ClassifierEvaluationDao.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/dao/ClassifierEvaluationDao.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/dao/ClassifierEvaluationDao.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/dao/ClassifierEvaluationDao.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,99 @@
+package org.apache.ctakes.ytex.kernel.dao;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.ytex.kernel.metric.ConceptInfo;
+import org.apache.ctakes.ytex.kernel.model.ClassifierEvaluation;
+import org.apache.ctakes.ytex.kernel.model.CrossValidationFold;
+import org.apache.ctakes.ytex.kernel.model.FeatureEvaluation;
+import org.apache.ctakes.ytex.kernel.model.FeatureParentChild;
+import org.apache.ctakes.ytex.kernel.model.FeatureRank;
+
+
+public interface ClassifierEvaluationDao {
+
+	public abstract void saveClassifierEvaluation(ClassifierEvaluation eval,
+			Map<Integer, String> irClassMap, boolean saveInstanceEval);
+
+	public abstract void saveFold(CrossValidationFold fold);
+
+	public abstract void deleteCrossValidationFoldByName(String name,
+			String splitName);
+
+	public abstract void saveFeatureEvaluation(
+			FeatureEvaluation featureEvaluation, List<FeatureRank> features);
+
+	public abstract void deleteFeatureEvaluationByNameAndType(
+			String corpusName, String featureSetName, String type);
+
+	/**
+	 * 
+	 * @param eval
+	 *            evaluation to save
+	 * @param saveInstanceEval
+	 *            save instance level evaluations - default false
+	 * @param saveIRStats
+	 *            save IR statistics - default true
+	 * @param excludeTargetClassId
+	 *            for semi-supervised learners, don't want to include the
+	 *            unlabeled instances in computation of ir statistics. this
+	 *            specifies the class id of the unlabeled instances (default 0)
+	 */
+	public void saveClassifierEvaluation(ClassifierEvaluation eval,
+			Map<Integer, String> irClassMap, boolean saveInstanceEval,
+			boolean saveIRStats, Integer excludeTargetClassId);
+
+	public abstract CrossValidationFold getCrossValidationFold(
+			String corpusName, String splitName, String label, int run, int fold);
+
+	public List<FeatureRank> getTopFeatures(String corpusName,
+			String featureSetName, String label, String type, Integer foldId,
+			double param1, String param2, Integer parentConceptTopThreshold);
+
+	public List<FeatureRank> getThresholdFeatures(String corpusName,
+			String featureSetName, String label, String type, Integer foldId,
+			double param1, String param2,
+			double parentConceptEvaluationThreshold);
+
+	public abstract void deleteFeatureEvaluation(String corpusName,
+			String featureSetName, String label, String evaluationType,
+			Integer foldId, Double param1, String param2);
+
+	public abstract Map<String, Double> getFeatureRankEvaluations(
+			String corpusName, String featureSetName, String label,
+			String evaluationType, Integer foldId, double param1, String param2);
+
+	public abstract Map<String, Double> getFeatureRankEvaluations(
+			Set<String> featureNames, String corpusName, String featureSetName,
+			String label, String evaluationType, Integer foldId, double param1,
+			String param2);
+
+	public abstract Map<String, FeatureRank> getFeatureRanks(
+			Set<String> featureNames, String corpusName, String featureSetName,
+			String label, String evaluationType, Integer foldId, double param1,
+			String param2);
+
+	public abstract List<Object[]> getCorpusCuiTuis(String corpusName,
+			String conceptGraphName, String conceptSetName);
+
+	public abstract Map<String, Double> getInfoContent(String corpusName,
+			String conceptGraphName, String conceptSet);
+
+	public abstract List<ConceptInfo> getIntrinsicInfoContent(
+			String conceptGraphName);
+
+	public abstract void saveFeatureParentChild(FeatureParentChild parchd);
+
+	public abstract List<FeatureRank> getImputedFeaturesByPropagatedCutoff(
+			String corpusName, String conceptSetName, String label,
+			String evaluationType, String conceptGraphName,
+			String propEvaluationType, int propRankCutoff);
+
+	public abstract Double getMaxFeatureEvaluation(String corpusName,
+			String featureSetName, String label, String evaluationType,
+			Integer foldId, double param1, String param2);
+
+	public abstract Integer getMaxDepth(String conceptGraphName);
+}
\ No newline at end of file



Mime
View raw message