ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vjapa...@apache.org
Subject svn commit: r1551254 [23/26] - in /ctakes/branches/ytex: ctakes-ytex-res/ ctakes-ytex-res/.settings/ ctakes-ytex-res/src/ ctakes-ytex-res/src/main/ ctakes-ytex-res/src/main/resources/ ctakes-ytex-res/src/main/resources/org/ ctakes-ytex-res/src/main/res...
Date Mon, 16 Dec 2013 16:30:40 GMT
Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,524 @@
+package org.apache.ctakes.ytex.kernel.pagerank;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.OptionGroup;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.kernel.KernelContextHolder;
+import org.apache.ctakes.ytex.kernel.dao.ConceptDao;
+import org.apache.ctakes.ytex.kernel.model.ConcRel;
+import org.apache.ctakes.ytex.kernel.model.ConceptGraph;
+
+
+public class PageRankServiceImpl implements PageRankService {
+	private static final Log log = LogFactory.getLog(PageRankServiceImpl.class);
+
+	/**
+	 * Run pagerank iterations over the dense (array) score representation
+	 * until the scores stabilize or the iteration limit is reached. Scores
+	 * start out as all zeros.
+	 * 
+	 * @param dampingVector
+	 *            per-node jump weights keyed by node index, or null for
+	 *            uniform damping; handed to pagerankIter unchanged
+	 * @param cg
+	 *            concept graph to rank
+	 * @param iter
+	 *            maximum number of iterations
+	 * @param threshold
+	 *            stop as soon as the distance between two successive score
+	 *            vectors is less than or equal to this value
+	 * @param dampingFactor
+	 *            weight given to the scores propagated along links
+	 * @return score per node, indexed by node index
+	 */
+	private double[] rankInternal(Map<Integer, Double> dampingVector,
+			ConceptGraph cg, int iter, double threshold, double dampingFactor) {
+		// number of nodes; also the length of the score vector
+		int nodeCount = cg.getConceptList().size();
+		double N = (double) nodeCount;
+		double scoresCurrent[] = new double[nodeCount];
+		double diff = 1d;
+		for (int i = 0; i < iter; i++) {
+			double[] scoresOld = scoresCurrent;
+			long timeBegin = 0;
+			if (log.isDebugEnabled()) {
+				timeBegin = System.currentTimeMillis();
+			}
+			scoresCurrent = pagerankIter(scoresOld, dampingVector, cg,
+					dampingFactor, N);
+			if (log.isDebugEnabled()) {
+				log.debug("iter " + i + " "
+						+ Long.toString(System.currentTimeMillis() - timeBegin));
+			}
+			// converged: the scores moved by no more than the threshold
+			diff = difference(scoresOld, scoresCurrent);
+			if (diff <= threshold)
+				break;
+		}
+		if (log.isDebugEnabled() && diff > threshold) {
+			log.debug("did not converge, diff = " + diff + ", dampingVector = "
+					+ dampingVector);
+		}
+		return scoresCurrent;
+	}
+
+	/**
+	 * Squared Euclidean distance between two sparse vectors. A key that is
+	 * missing from one of the maps is treated as a zero component.
+	 * 
+	 * @param a
+	 *            first sparse vector
+	 * @param b
+	 *            second sparse vector
+	 * @return sum over all components of (a_i - b_i)^2; note that no square
+	 *         root is taken
+	 */
+	private <T> double difference(Map<T, Double> a, Map<T, Double> b) {
+		double sumSquares = 0d;
+		// components present in a; b contributes zero where it has no entry
+		for (Map.Entry<T, Double> entryA : a.entrySet()) {
+			Double valueB = b.get(entryA.getKey());
+			double delta = entryA.getValue()
+					- (valueB != null ? valueB.doubleValue() : 0d);
+			sumSquares += Math.pow(delta, 2);
+		}
+		// components present only in b
+		for (Map.Entry<T, Double> entryB : b.entrySet()) {
+			if (a.containsKey(entryB.getKey()))
+				continue;
+			sumSquares += Math.pow(entryB.getValue(), 2);
+		}
+		return sumSquares;
+	}
+
+	/**
+	 * Euclidean distance between two dense vectors.
+	 * 
+	 * @param u
+	 *            first vector; its length decides how many components are
+	 *            compared
+	 * @param v
+	 *            second vector; must have at least as many components as u
+	 * @return norm(u-v)
+	 */
+	private double difference(double[] u, double[] v) {
+		double sumSquares = 0d;
+		int k = 0;
+		while (k < u.length) {
+			double delta = u[k] - v[k];
+			sumSquares += delta * delta;
+			k++;
+		}
+		return Math.sqrt(sumSquares);
+	}
+
+	/**
+	 * Cosine similarity of two dense vectors.
+	 * 
+	 * @param u
+	 *            first vector; its length decides how many components are
+	 *            used
+	 * @param v
+	 *            second vector; must have at least as many components as u
+	 * @return u.v / (|u| * |v|), or 0 when either vector has zero magnitude
+	 */
+	private double cosine(double[] u, double[] v) {
+		double uu = 0;
+		double vv = 0;
+		double uv = 0;
+		for (int i = 0; i < u.length; i++) {
+			uu += u[i] * u[i];
+			vv += v[i] * v[i];
+			uv += u[i] * v[i];
+		}
+		double magnitude = Math.sqrt(uu * vv);
+		// an empty or all-zero vector has no direction: report no similarity
+		// rather than dividing by zero, which would produce NaN
+		if (magnitude == 0d)
+			return 0d;
+		return uv / magnitude;
+	}
+
+	/**
+	 * One iteration of pagerank with a non-uniform damping vector (topic
+	 * vector), visiting only the 'active' nodes. Every active node hands its
+	 * score to its out-links (children) in equal shares; each child reached
+	 * this way is added to activeNodes. Afterwards every active node's sum
+	 * is multiplied by the damping factor and the node's damping vector
+	 * entry, if any, is added as-is.
+	 * 
+	 * @param currentScores
+	 *            scores from the previous iteration, indexed by node index
+	 * @param dampingVector
+	 *            jump term per node index; added without further scaling
+	 * @param cg
+	 *            concept graph
+	 * @param dampingFactor
+	 *            multiplier applied to the propagated scores
+	 * @param N
+	 *            number of nodes; length of the returned array
+	 * @param activeNodes
+	 *            node indices to propagate from; extended in place with the
+	 *            nodes reached during this iteration
+	 * @return new scores indexed by node index
+	 */
+	public double[] pagerankIter(double[] currentScores,
+			Map<Integer, Double> dampingVector, ConceptGraph cg,
+			double dampingFactor, double N, Set<Integer> activeNodes) {
+		double newScores[] = new double[(int) N];
+		Arrays.fill(newScores, 0d);
+		// work on a snapshot: activeNodes grows while we propagate
+		Integer[] sources = new Integer[activeNodes.size()];
+		activeNodes.toArray(sources);
+		for (int source : sources) {
+			double sourceScore = currentScores[source];
+			ConcRel sourceNode = cg.getConceptList().get(source);
+			double nOutlinks = (double) sourceNode.getChildren().size();
+			if (!(nOutlinks > 0)) {
+				// nothing to propagate to
+				continue;
+			}
+			// every child receives an equal share of this node's score
+			for (ConcRel child : sourceNode.getChildren()) {
+				int target = child.getNodeIndex();
+				newScores[target] = newScores[target]
+						+ (sourceScore / nOutlinks);
+				activeNodes.add(target);
+			}
+		}
+		// newScores now holds the plain sum of contributions; damp it and
+		// add the jump term of the node where there is one
+		for (int node : activeNodes) {
+			double damped = (newScores[node] * dampingFactor);
+			Double v_i = dampingVector.get(node);
+			if (v_i != null) {
+				damped += v_i;
+			}
+			newScores[node] = damped;
+		}
+		return newScores;
+	}
+
+	/**
+	 * One iteration of pagerank over the dense (array) representation. For
+	 * every node the scores of the nodes pointing at it (its parents) are
+	 * collected, each divided by the number of out-links of that parent.
+	 * 
+	 * @param currentScores
+	 *            scores from the previous iteration, indexed by node index
+	 * @param dampingVector
+	 *            null for uniform damping; otherwise the jump term per node
+	 *            index, added without further scaling
+	 * @param cg
+	 *            concept graph
+	 * @param dampingFactor
+	 *            multiplier applied to the collected scores
+	 * @param N
+	 *            number of nodes; length of the returned array
+	 * @return new scores indexed by node index
+	 */
+	public double[] pagerankIter(double[] currentScores,
+			Map<Integer, Double> dampingVector, ConceptGraph cg,
+			double dampingFactor, double N) {
+		final boolean uniform = (dampingVector == null);
+		double newScores[] = new double[(int) N];
+		// constant jump term, only used for uniform damping
+		final double jump = ((1 - dampingFactor) / N);
+		for (int node = 0; node < currentScores.length; node++) {
+			ConcRel target = cg.getConceptList().get(node);
+			double inbound = 0d;
+			// collect the share each parent passes on to this node
+			for (int parentIndex : target.getParentsArray()) {
+				ConcRel parent = cg.getConceptList().get(parentIndex);
+				inbound += (currentScores[parentIndex] / (double) parent
+						.getChildrenArray().length);
+			}
+			double damped = (inbound * dampingFactor);
+			if (uniform) {
+				newScores[node] = damped + jump;
+			} else {
+				// personalized pagerank: add this node's own jump term
+				Double v_i = dampingVector.get(node);
+				if (v_i != null) {
+					damped += v_i;
+				}
+				newScores[node] = damped;
+			}
+		}
+		return newScores;
+	}
+
+	/**
+	 * Compute pagerank scores over the dense (array) representation.
+	 * <p>
+	 * Without a damping vector every node starts at 1/N (static pagerank).
+	 * With a damping vector each listed node starts at its weight, all other
+	 * nodes start at zero, and the jump term of a listed node is its weight
+	 * multiplied by (1 - dampingFactor).
+	 * 
+	 * @param dampingVector
+	 *            weights keyed by node index, or null for static pagerank
+	 * @param cg
+	 *            concept graph
+	 * @param iter
+	 *            maximum number of iterations
+	 * @param threshold
+	 *            stop once successive score vectors differ by no more than
+	 *            this
+	 * @param dampingFactor
+	 *            multiplier applied to the propagated scores
+	 * @return scores indexed by node index
+	 */
+	@Override
+	public double[] rank2(Map<Integer, Double> dampingVector, ConceptGraph cg,
+			int iter, double threshold, double dampingFactor) {
+		double N = (double) cg.getConceptMap().size();
+		double scoresCurrent[] = new double[cg.getConceptMap().size()];
+		Map<Integer, Double> dampingVectorAdj = null;
+		if (dampingVector == null) {
+			// static pagerank: all nodes have the same weight initially
+			Arrays.fill(scoresCurrent, 1d / N);
+		} else {
+			dampingVectorAdj = new HashMap<Integer, Double>(
+					dampingVector.size());
+			Arrays.fill(scoresCurrent, 0d);
+			for (Map.Entry<Integer, Double> topic : dampingVector.entrySet()) {
+				Integer node = topic.getKey();
+				// random jump into the node, scaled once up front
+				dampingVectorAdj.put(node, topic.getValue()
+						* (1 - dampingFactor));
+				// initial weight of the node
+				scoresCurrent[node] = topic.getValue();
+			}
+		}
+		double diff = 1d;
+		for (int round = 0; round < iter; round++) {
+			final double previous[] = scoresCurrent;
+			long started = 0;
+			if (log.isDebugEnabled()) {
+				started = System.currentTimeMillis();
+			}
+			scoresCurrent = pagerankIter(previous, dampingVectorAdj, cg,
+					dampingFactor, N);
+			if (log.isDebugEnabled()) {
+				log.debug("iter " + round + " time(ms) "
+						+ Long.toString(System.currentTimeMillis() - started));
+			}
+			diff = difference(scoresCurrent, previous);
+			if (diff <= threshold) {
+				break;
+			}
+		}
+		if (log.isDebugEnabled() && diff > threshold) {
+			log.debug("did not converge, diff = " + diff + ", dampingVector = "
+					+ dampingVector);
+		}
+		return scoresCurrent;
+	}
+
+	/**
+	 * Perform one iteration of pagerank over the sparse (map) score
+	 * representation.
+	 * <p>
+	 * With a null damping vector this is the basic pagerank iteration with
+	 * uniform damping over every node of the graph. With a damping vector
+	 * (topic vector) only nodes that currently have a score propagate to
+	 * their out-links (children), and every resulting score is
+	 * d * sum(inbound) + (1 - d) * v_i.
+	 * 
+	 * @param currentScores
+	 *            scores of the previous iteration keyed by node index; may be
+	 *            null only for uniform damping, in which case every node is
+	 *            taken to have the initial score 1/N
+	 * @param dampingVector
+	 *            unscaled jump weights keyed by node index, or null for
+	 *            uniform damping
+	 * @param cg
+	 *            concept graph
+	 * @param dampingFactor
+	 *            d, the weight of the propagated scores
+	 * @param N
+	 *            number of nodes in the graph
+	 * @return new scores keyed by node index
+	 */
+	public Map<Integer, Double> pagerankIter(
+			Map<Integer, Double> currentScores,
+			Map<Integer, Double> dampingVector, ConceptGraph cg,
+			double dampingFactor, double N) {
+		Map<Integer, Double> newScores = new HashMap<Integer, Double>();
+		if (dampingVector == null) {
+			// the constant probability of randomly surfing into this node,
+			// adjusted by damping factor
+			double jump = ((1 - dampingFactor) / N);
+			double initialValue = 1 / N;
+			// the basic pagerank iteration with uniform damping vector
+			// iterate over all nodes
+			for (ConcRel c : cg.getConceptList()) {
+				double score = 0d;
+				// get nodes pointing at node c
+				for (ConcRel in : c.getParents()) {
+					// get the pagerank for node p which is pointing at c
+					// if this is the first iteration, currentScores is null so
+					// use the initial pagerank
+					double prIn = currentScores == null ? initialValue
+							: currentScores.get(in.getNodeIndex());
+					// add the pagerank divided by the number of nodes p is
+					// pointing at
+					score += (prIn / (double) in.getChildren().size());
+				}
+				// adjust for uniform damping
+				double adjusted = (score * dampingFactor) + jump;
+				newScores.put(c.getNodeIndex(), adjusted);
+			}
+		} else {
+			// pagerank with non-uniform damping vector (topic vector).
+			// because of the non-uniform damping vector, few nodes will have a
+			// non-zero pagerank, so only nodes that have a score are visited
+			// and their score is pushed to their out-links.
+			// currentScores must be non-null here.
+			for (Map.Entry<Integer, Double> scoreEntry : currentScores
+					.entrySet()) {
+				// node index
+				Integer index = scoreEntry.getKey();
+				// pagerank
+				double score = scoreEntry.getValue();
+				ConcRel cr = cg.getConceptList().get(index);
+				// get number of out-links
+				double nOutlinks = (double) cr.getChildren().size();
+				if (nOutlinks > 0) {
+					// propagate pagerank to out-links (children)
+					for (ConcRel crOut : cr.getChildren()) {
+						// get current pagerank value for target page
+						double childScore = 0d;
+						Double childScoreD = newScores
+								.get(crOut.getNodeIndex());
+						if (childScoreD != null)
+							childScore = childScoreD.doubleValue();
+						// add the pagerank/|links|
+						childScore += (score / nOutlinks);
+						newScores.put(crOut.getNodeIndex(), childScore);
+					}
+				}
+			}
+			// we just added the contribution of pages to newScores sum(score).
+			// adjust: convert to (d)*sum(score) + (1-d)*v_i
+			for (Map.Entry<Integer, Double> scoreEntry : newScores.entrySet()) {
+				// v_i
+				Double v_i = dampingVector.get(scoreEntry.getKey());
+				// (1-d) * v_i
+				double v_i_adj = v_i != null ? v_i * (1 - dampingFactor) : 0d;
+				double adjusted = (scoreEntry.getValue() * dampingFactor)
+						+ v_i_adj;
+				scoreEntry.setValue(adjusted);
+			}
+			// a node of the topic vector that received nothing from its
+			// in-links (sum(score) == 0) is not in newScores yet, but it
+			// still gets the jump term (1-d)*v_i; without this such a node -
+			// e.g. a topic node without parents - would drop out of the
+			// ranking after the first iteration
+			for (Map.Entry<Integer, Double> dvEntry : dampingVector.entrySet()) {
+				Double v_i = dvEntry.getValue();
+				if (v_i != null && !newScores.containsKey(dvEntry.getKey())) {
+					newScores.put(dvEntry.getKey(), v_i * (1 - dampingFactor));
+				}
+			}
+		}
+		return newScores;
+	}
+
+	/**
+	 * Similarity of two concepts based on personalized page rank, as
+	 * documented in <a href=
+	 * "http://ixa.si.ehu.es/Ixa/Argitalpenak/Artikuluak/1274099085/publikoak/main.pdf"
+	 * >Exploring Knowledge Bases for Similarity</a>: a personalized pagerank
+	 * vector is computed for each concept (the concept being the only entry
+	 * of the topic vector, with weight 1) and the cosine of the two vectors
+	 * is returned.
+	 * 
+	 * @param concept1
+	 *            id of the first concept
+	 * @param concept2
+	 *            id of the second concept
+	 * @param cg
+	 *            concept graph containing the concepts
+	 * @param iter
+	 *            maximum number of pagerank iterations
+	 * @param threshold
+	 *            convergence threshold for the pagerank iterations
+	 * @param dampingFactor
+	 *            pagerank damping factor
+	 * @return cosine of the two pagerank vectors; 0 if either concept is not
+	 *         in the graph
+	 */
+	@Override
+	public double sim(String concept1, String concept2, ConceptGraph cg,
+			int iter, double threshold, double dampingFactor) {
+		ConcRel first = cg.getConceptMap().get(concept1);
+		ConcRel second = cg.getConceptMap().get(concept2);
+		if (first == null || second == null) {
+			return 0d;
+		}
+		// topic vector holding just the first concept
+		Map<Integer, Double> firstTopic = new HashMap<Integer, Double>(1);
+		firstTopic.put(first.getNodeIndex(), 1d);
+		double[] firstRank = this.rank2(firstTopic, cg, iter, threshold,
+				dampingFactor);
+		// topic vector holding just the second concept
+		Map<Integer, Double> secondTopic = new HashMap<Integer, Double>(1);
+		secondTopic.put(second.getNodeIndex(), 1d);
+		double[] secondRank = this.rank2(secondTopic, cg, iter, threshold,
+				dampingFactor);
+		return cosine(firstRank, secondRank);
+	}
+
+	/**
+	 * Cosine of two sparse vectors. A key missing from a map counts as a
+	 * zero component.
+	 * 
+	 * @param u
+	 *            first vector
+	 * @param v
+	 *            second vector
+	 * @return u.v / sqrt((v.v) * (u.u)); 0 if either vector is empty or the
+	 *         dot product is 0
+	 */
+	private <T> double cosine(Map<T, Double> u, Map<T, Double> v) {
+		if (u.isEmpty() || v.isEmpty()) {
+			return 0d;
+		}
+		double dot = 0d;
+		double uNormSq = 0d;
+		// one pass over u yields both u.u and u.v
+		for (Map.Entry<T, Double> component : u.entrySet()) {
+			double ui = component.getValue();
+			uNormSq += ui * ui;
+			Double vi = v.get(component.getKey());
+			if (vi != null) {
+				dot += ui * vi.doubleValue();
+			}
+		}
+		if (dot == 0) {
+			// no need to compute v.v
+			return 0d;
+		}
+		double vNormSq = 0d;
+		for (double vi : v.values()) {
+			vNormSq += vi * vi;
+		}
+		return dot / Math.sqrt(vNormSq * uNormSq);
+	}
+
+	public static void main(String args[]) {
+		Options options = new Options();
+		OptionGroup og = new OptionGroup();
+		og.addOption(OptionBuilder
+				.withArgName("concept1,concept2")
+				.hasArg()
+				.withDescription(
+						"compute similarity for specified concept pair")
+				.create("sim"));
+		og.addOption(OptionBuilder
+				.withArgName("concept1,concept2,...")
+				.hasArg()
+				.withDescription(
+						"personalized pagerank vector for specified concepts ")
+				.create("ppr"));
+		og.setRequired(true);
+		options.addOptionGroup(og);
+		try {
+			CommandLineParser parser = new GnuParser();
+			CommandLine line = parser.parse(options, args);
+			Properties ytexProps = new Properties();
+			ytexProps.putAll((Properties) KernelContextHolder
+					.getApplicationContext().getBean("ytexProperties"));
+			ytexProps.putAll(System.getProperties());
+			ConceptDao conceptDao = KernelContextHolder.getApplicationContext()
+					.getBean(ConceptDao.class);
+			PageRankService pageRankService = KernelContextHolder
+					.getApplicationContext().getBean(PageRankService.class);
+			ConceptGraph cg = conceptDao.getConceptGraph(ytexProps
+					.getProperty("org.apache.ctakes.ytex.conceptGraphName"));
+			if (line.hasOption("sim")) {
+				String cs = line.getOptionValue("sim");
+				String concept[] = cs.split(",");
+				System.out.println(pageRankService.sim(concept[0], concept[1],
+						cg, 30, 1e-4, 0.85));
+			} else if (line.hasOption("ppr")) {
+				String cs = line.getOptionValue("ppr");
+				String concept[] = cs.split(",");
+				double weight = 1 / (double) concept.length;
+				Map<String, Double> ppv = new HashMap<String, Double>();
+				for (String c : concept) {
+					ppv.put(c, weight);
+				}
+				System.out.println(pageRankService.rank(ppv, cg));
+			}
+		} catch (ParseException pe) {
+			HelpFormatter formatter = new HelpFormatter();
+			formatter
+					.printHelp(
+							"java "
+									+ PageRankServiceImpl.class.getName()
+									+ " compute personalized page rank or similarity.  used for testing purposes",
+							options);
+		}
+
+	}
+
+	/**
+	 * Pagerank for a damping vector keyed by concept id. The concept ids are
+	 * translated to node indices of the graph and the computation is
+	 * delegated to rank2.
+	 * 
+	 * @param dampingVector
+	 *            weights keyed by concept id, or null for static pagerank;
+	 *            concepts that are not in the graph and entries without a
+	 *            weight are ignored
+	 * @param cg
+	 *            concept graph
+	 * @param iter
+	 *            maximum number of iterations
+	 * @param threshold
+	 *            convergence threshold
+	 * @param dampingFactor
+	 *            pagerank damping factor
+	 * @return scores indexed by node index
+	 */
+	@Override
+	public double[] rank(Map<String, Double> dampingVector, ConceptGraph cg,
+			int iter, double threshold, double dampingFactor) {
+		if (dampingVector == null) {
+			// no topic vector: static pagerank
+			return rank2(null, cg, iter, threshold, dampingFactor);
+		}
+		Map<Integer, Double> indexedVector = new HashMap<Integer, Double>(
+				dampingVector.size());
+		for (Map.Entry<String, Double> entry : dampingVector.entrySet()) {
+			ConcRel cr = cg.getConceptMap().get(entry.getKey());
+			// a concept that is not part of the graph cannot be ranked
+			if (cr != null && entry.getValue() != null) {
+				indexedVector.put(cr.getNodeIndex(), entry.getValue());
+			}
+		}
+		return rank2(indexedVector, cg, iter, threshold, dampingFactor);
+	}
+
+	/**
+	 * Pagerank for a damping vector keyed by concept id, using the same
+	 * settings that main uses for similarity: at most 30 iterations,
+	 * convergence threshold 1e-4 and damping factor 0.85.
+	 * 
+	 * @param dampingVector
+	 *            weights keyed by concept id
+	 * @param cg
+	 *            concept graph
+	 * @return whatever the five-argument rank returns for these settings
+	 */
+	@Override
+	public double[] rank(Map<String, Double> dampingVector, ConceptGraph cg) {
+		return rank(dampingVector, cg, 30, 1e-4, 0.85);
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,20 @@
+package org.apache.ctakes.ytex.kernel.tree;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * Builds instance trees from query results and stores them in / restores
+ * them from a file.
+ */
+public interface InstanceTreeBuilder {
+
+	/**
+	 * Generate trees from the results of a sorted query.
+	 * 
+	 * @param mappingInfo
+	 *            describes the queries to run and how their rows map to nodes
+	 * @return the generated trees; the key is a Long per tree
+	 */
+	Map<Long, Node> loadInstanceTrees(TreeMappingInfo mappingInfo);
+
+	/**
+	 * Generate the trees for the given mapping and write them to a file.
+	 * 
+	 * @param mappingInfo
+	 *            describes the queries to run and how their rows map to nodes
+	 * @param filename
+	 *            file to write to
+	 * @throws IOException
+	 *             if writing fails
+	 */
+	void serializeInstanceTrees(TreeMappingInfo mappingInfo, String filename)
+			throws IOException;
+
+	/**
+	 * Load trees from a file.
+	 * 
+	 * @param filename
+	 *            file to read from
+	 * @return the trees stored in the file
+	 * @throws IOException
+	 *             if reading fails
+	 * @throws ClassNotFoundException
+	 *             if a class of the stored objects cannot be found
+	 */
+	Map<Long, Node> loadInstanceTrees(String filename) throws IOException,
+			ClassNotFoundException;
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,210 @@
+package org.apache.ctakes.ytex.kernel.tree;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.simple.SimpleJdbcTemplate;
+
+public class InstanceTreeBuilderImpl implements InstanceTreeBuilder {
+	static final Log log = LogFactory.getLog(InstanceTreeBuilderImpl.class);
+	SimpleJdbcTemplate simpleJdbcTemplate;
+	private DataSource dataSource;
+
+	/**
+	 * @return the data source most recently passed to setDataSource, or null
+	 *         if none has been set
+	 */
+	public DataSource getDataSource() {
+		return dataSource;
+	}
+
+	/**
+	 * Store the data source and create the SimpleJdbcTemplate that the query
+	 * methods of this class run their queries with.
+	 * 
+	 * @param dataSource
+	 *            data source to run queries against
+	 */
+	public void setDataSource(DataSource dataSource) {
+		this.dataSource = dataSource;
+		this.simpleJdbcTemplate = new SimpleJdbcTemplate(dataSource);
+	}
+
+	/**
+	 * Create a node of the given type from one row of query results.
+	 * 
+	 * @param nodeInfo
+	 *            node type and the names of the row values that belong to
+	 *            the node
+	 * @param nodeValues
+	 *            the row, keyed by value name
+	 * @return a node holding the non-null values listed in nodeInfo, or null
+	 *         if the row has no non-null value for any of them
+	 */
+	Node nodeFromRow(NodeMappingInfo nodeInfo, Map<String, Object> nodeValues) {
+		Map<String, Serializable> collected = new HashMap<String, Serializable>(
+				nodeInfo.getValues().size());
+		for (String valueName : nodeInfo.getValues()) {
+			Object raw = nodeValues.get(valueName);
+			if (raw == null) {
+				// absent or null: not part of the node
+				continue;
+			}
+			collected.put(valueName, (Serializable) raw);
+		}
+		if (collected.isEmpty()) {
+			// nothing to put into a node
+			return null;
+		}
+		Node created = new Node();
+		created.setType(nodeInfo.getNodeType());
+		created.setValue(collected);
+		return created;
+	}
+
+	/**
+	 * Read a map of instance trees from a file using Java serialization.
+	 * <p>
+	 * NOTE(review): this deserializes whatever the file contains; only use
+	 * it on files from a trusted source.
+	 * 
+	 * @param filename
+	 *            file to read
+	 * @return the map stored in the file
+	 * @throws IOException
+	 *             if the file cannot be opened or read
+	 * @throws ClassNotFoundException
+	 *             if a class of a stored object cannot be found
+	 */
+	@SuppressWarnings("unchecked")
+	@Override
+	public Map<Long, Node> loadInstanceTrees(String filename)
+			throws IOException, ClassNotFoundException {
+		// open the file on its own so that it is closed even if the
+		// ObjectInputStream constructor fails while reading the stream header
+		FileInputStream fis = new FileInputStream(filename);
+		try {
+			ObjectInputStream os = new ObjectInputStream(
+					new BufferedInputStream(fis));
+			return (Map<Long, Node>) os.readObject();
+		} finally {
+			// closing the file releases everything the wrappers hold
+			fis.close();
+		}
+	}
+
+	/**
+	 * Build the instance trees for the given mapping and write them to a
+	 * file using Java serialization.
+	 * 
+	 * @param mappingInfo
+	 *            describes the queries to run and how their rows map to nodes
+	 * @param filename
+	 *            file to write; an existing file is overwritten
+	 * @throws IOException
+	 *             if the file cannot be opened or written
+	 */
+	@Override
+	public void serializeInstanceTrees(TreeMappingInfo mappingInfo,
+			String filename) throws IOException {
+		// build the trees before touching the file: if a query fails, an
+		// existing file is left alone instead of being truncated
+		Map<Long, Node> instanceTrees = loadInstanceTrees(mappingInfo);
+		FileOutputStream fos = new FileOutputStream(filename);
+		ObjectOutputStream os = null;
+		try {
+			os = new ObjectOutputStream(new BufferedOutputStream(fos));
+			os.writeObject(instanceTrees);
+		} finally {
+			if (os != null) {
+				// flushes the buffered data and closes the file
+				os.close();
+			} else {
+				// the object stream could not be created; release the file
+				fos.close();
+			}
+		}
+	}
+
+	/**
+	 * Build the instance trees described by the mapping: run the prepare
+	 * script, load the trees from the instance query, then attach the
+	 * results of every node query to the nodes loaded so far.
+	 * 
+	 * @param mappingInfo
+	 *            prepare script, instance query and optional node queries
+	 * @return root node of every instance, keyed by the value of the
+	 *         instance id field
+	 */
+	@Override
+	public Map<Long, Node> loadInstanceTrees(TreeMappingInfo mappingInfo) {
+		// every node created, so that the node queries can find the parents
+		// to attach their subtrees to
+		Map<NodeKey, Node> nodeKeyMap = new HashMap<NodeKey, Node>();
+		this.prepare(mappingInfo.getPrepareScript(), mappingInfo.getPrepareScriptStatementDelimiter());
+		Map<Long, Node> instanceMap = loadInstanceTrees(
+				mappingInfo.getInstanceIDField(),
+				mappingInfo.getInstanceQueryMappingInfo(), nodeKeyMap);
+		if (mappingInfo.getNodeQueryMappingInfos() != null) {
+			for (QueryMappingInfo qInfo : mappingInfo
+					.getNodeQueryMappingInfos()) {
+				this.addChildrenToNodes(nodeKeyMap, qInfo);
+			}
+		}
+		return instanceMap;
+	}
+	
+
+	/**
+	 * Run 'preparation' statements. These may e.g. create temporary tables
+	 * in the database. The script is split into statements at the
+	 * delimiter, blank statements are thrown out, and the rest is executed
+	 * as one batch. Nothing is executed if there is no script or it only
+	 * contains blank statements.
+	 * 
+	 * @param prepareScript
+	 *            the statements to run; may be null or empty
+	 * @param prepareScriptDelimiter
+	 *            separates the statements; interpreted as a regular
+	 *            expression (see String.split). If null or empty the whole
+	 *            script is run as a single statement
+	 */
+	protected void prepare(String prepareScript, String prepareScriptDelimiter) {
+		if (prepareScript == null || prepareScript.length() == 0) {
+			return;
+		}
+		String[] statements;
+		if (prepareScriptDelimiter == null
+				|| prepareScriptDelimiter.length() == 0) {
+			// split(null) would throw and split("") would cut the script
+			// into single characters
+			statements = new String[] { prepareScript };
+		} else {
+			statements = prepareScript.split(prepareScriptDelimiter);
+		}
+		List<String> listStatements = new ArrayList<String>(statements.length);
+		// throw out empty lines
+		for (String sql : statements) {
+			if (sql != null && sql.trim().length() > 0)
+				listStatements.add(sql);
+		}
+		if (listStatements.isEmpty()) {
+			// batchUpdate rejects an empty statement array
+			return;
+		}
+		JdbcTemplate jt = new JdbcTemplate(this.getDataSource());
+		jt.batchUpdate(listStatements.toArray(new String[listStatements.size()]));
+	}
+
+	/**
+	 * Build the instance trees from the rows of the instance query. Every
+	 * row describes one path through a tree, one node per node type; a node
+	 * is only created when it differs from the node at the same level of
+	 * the previous path, so the rows are expected to be sorted.
+	 * 
+	 * @param instanceIDField
+	 *            name of the row value holding the (numeric) instance id
+	 * @param qInfo
+	 *            query, query arguments and node types, root type first
+	 * @param nodeKeyMap
+	 *            if not null, every node created is registered here
+	 * @return root node of every instance, keyed by instance id
+	 */
+	protected Map<Long, Node> loadInstanceTrees(String instanceIDField,
+			QueryMappingInfo qInfo, Map<NodeKey, Node> nodeKeyMap) {
+		// the nodes of the path currently being extended, one per level
+		Node[] currentPath = new Node[qInfo.getNodeTypes().size()];
+		Map<Long, Node> instanceMap = new HashMap<Long, Node>();
+		List<Map<String, Object>> rowData = simpleJdbcTemplate.queryForList(
+				qInfo.getQuery(), qInfo.getQueryArgs());
+		for (Map<String, Object> row : rowData) {
+			for (int i = 0; i < qInfo.getNodeTypes().size(); i++) {
+				Node newNode = this.nodeFromRow(qInfo.getNodeTypes().get(i),
+						row);
+				if (newNode == null || newNode.equals(currentPath[i])) {
+					// no node at this level, or still the same node
+					continue;
+				}
+				if (i > 0) {
+					if (currentPath[i - 1] == null) {
+						// the row has values for this level but there is no
+						// node one level up to attach them to
+						if (log.isWarnEnabled()) {
+							log.warn("no parent in current path for node: "
+									+ newNode);
+						}
+						continue;
+					}
+					// add the node to the parent
+					currentPath[i - 1].getChildren().add(newNode);
+				} else {
+					// this is a new root, i.e. a new instance
+					// add it to the instance map
+					instanceMap.put(
+							((Number) row.get(instanceIDField)).longValue(),
+							newNode);
+				}
+				// put the new node in the path
+				currentPath[i] = newNode;
+				// the deeper levels belonged to the previous node at this
+				// level; forget them, otherwise a child that equals a child
+				// of the previous node would be taken for 'unchanged' and
+				// never be added to the new node
+				Arrays.fill(currentPath, i + 1, currentPath.length, null);
+				if (nodeKeyMap != null)
+					nodeKeyMap.put(new NodeKey(newNode), newNode);
+			}
+		}
+		return instanceMap;
+	}
+
+	/**
+	 * Run a node query and attach the subtree described by every row to a
+	 * node that already exists. The first node type of the query identifies
+	 * the existing node; the remaining node types describe the path to add
+	 * below it. Rows whose first node is not known are logged and skipped.
+	 * 
+	 * @param nodeKeyMap
+	 *            the nodes created so far; nodes added here are registered
+	 *            in it as well
+	 * @param qInfo
+	 *            query, query arguments and node types, parent type first
+	 */
+	public void addChildrenToNodes(Map<NodeKey, Node> nodeKeyMap,
+			QueryMappingInfo qInfo) {
+		// run query
+		List<Map<String, Object>> rowData = simpleJdbcTemplate.queryForList(
+				qInfo.getQuery(), qInfo.getQueryArgs());
+		// iterate through rows, adding nodes as children of existing nodes
+		for (Map<String, Object> row : rowData) {
+			// allocate array for holding node path corresponding to row
+			Node[] currentPath = new Node[qInfo.getNodeTypes().size()];
+			// get the root of this subtree - temporary node contains values
+			Node parentTmp = nodeFromRow(qInfo.getNodeTypes().get(0), row);
+			if (parentTmp == null) {
+				continue;
+			}
+			// get the node from the tree that corresponds to this node
+			Node parent = nodeKeyMap.get(new NodeKey(parentTmp));
+			if (parent == null) {
+				if (log.isWarnEnabled()) {
+					log.warn("couldn't find node for key: " + parentTmp);
+				}
+				continue;
+			}
+			// found the parent - add the subtree
+			currentPath[0] = parent;
+			for (int i = 1; i < qInfo.getNodeTypes().size(); i++) {
+				Node newNode = this.nodeFromRow(qInfo.getNodeTypes().get(i),
+						row);
+				if (newNode == null || newNode.equals(currentPath[i])) {
+					continue;
+				}
+				if (currentPath[i - 1] == null) {
+					// the row has no node one level up to attach this to
+					if (log.isWarnEnabled()) {
+						log.warn("no parent in current path for node: "
+								+ newNode);
+					}
+					continue;
+				}
+				// null out everything from this index on in the path; the
+				// end index of Arrays.fill is exclusive
+				Arrays.fill(currentPath, i, currentPath.length, null);
+				// add the node to the parent
+				currentPath[i - 1].getChildren().add(newNode);
+				// put the new node in the path
+				currentPath[i] = newNode;
+				nodeKeyMap.put(new NodeKey(newNode), newNode);
+			}
+		}
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,94 @@
+package org.apache.ctakes.ytex.kernel.tree;
+
+import java.io.Serializable;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A node of an instance tree: a type, a map of values and a list of child
+ * nodes. Two nodes are equal if their type and values are equal; the
+ * children and the cached norm play no part in equals and hashCode.
+ */
+public class Node implements Serializable {
+	private static final long serialVersionUID = 1L;
+
+	private String type;
+	private Map<String, Serializable> value;
+	private List<Node> children = new LinkedList<Node>();
+
+	/**
+	 * Caching the norm externally, e.g. in EHCache involves too much additional
+	 * overhead. Therefore, save the norm in this object. This shouldn't cause
+	 * problems in a multi-threaded environment, as long as the kernel is the
+	 * same - the value of the norm will be the same across evaluations.
+	 */
+	private transient Double norm;
+
+	/**
+	 * @return the node type
+	 */
+	public String getType() {
+		return type;
+	}
+
+	public void setType(String type) {
+		this.type = type;
+	}
+
+	/**
+	 * @return the values of this node, keyed by value name
+	 */
+	public Map<String, Serializable> getValue() {
+		return value;
+	}
+
+	public void setValue(Map<String, Serializable> value) {
+		this.value = value;
+	}
+
+	/**
+	 * @return the child list itself, not a copy
+	 */
+	public List<Node> getChildren() {
+		return children;
+	}
+
+	public void setChildren(List<Node> children) {
+		this.children = children;
+	}
+
+	/**
+	 * @return the cached norm, or null if none has been stored
+	 */
+	public Double getNorm() {
+		return norm;
+	}
+
+	public void setNorm(Double norm) {
+		this.norm = norm;
+	}
+
+	/**
+	 * Hash of type and value, consistent with equals.
+	 */
+	@Override
+	public int hashCode() {
+		int typeHash = (type == null) ? 0 : type.hashCode();
+		int valueHash = (value == null) ? 0 : value.hashCode();
+		return 31 * (31 + typeHash) + valueHash;
+	}
+
+	/**
+	 * Nodes are equal if they are of the same class and have equal type and
+	 * value.
+	 */
+	@Override
+	public boolean equals(Object obj) {
+		if (this == obj) {
+			return true;
+		}
+		if (obj == null || getClass() != obj.getClass()) {
+			return false;
+		}
+		Node other = (Node) obj;
+		boolean sameType = (type == null) ? other.type == null : type
+				.equals(other.type);
+		if (!sameType) {
+			return false;
+		}
+		return (value == null) ? other.value == null : value
+				.equals(other.value);
+	}
+
+	@Override
+	public String toString() {
+		return "Node [type=" + type + ", value=" + value + "]";
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,45 @@
+package org.apache.ctakes.ytex.kernel.tree;
+
+import java.io.Serializable;
+import java.util.Map;
+
+/**
+ * Key built from a {@link Node}'s type and value. Equality and hashing use
+ * exactly these two fields, so the key can stand in for the node in hash-based
+ * collections without regard to the node's other state.
+ */
+public class NodeKey {
+	/** type copied from the node */
+	public String type;
+	/** value map reference copied from the node (not a deep copy) */
+	public Map<String, Serializable> value;
+
+	/**
+	 * @param node
+	 *            node whose type and value make up this key; must not be null
+	 */
+	public NodeKey(Node node) {
+		this.type = node.getType();
+		this.value = node.getValue();
+	}
+
+	@Override
+	public int hashCode() {
+		final int typeHash = (type != null) ? type.hashCode() : 0;
+		final int valueHash = (value != null) ? value.hashCode() : 0;
+		// same arithmetic as the usual 31-multiplier accumulation seeded with 1
+		int hash = 31 + typeHash;
+		hash = 31 * hash + valueHash;
+		return hash;
+	}
+
+	@Override
+	public boolean equals(Object obj) {
+		if (obj == this) {
+			return true;
+		}
+		if (obj == null || obj.getClass() != getClass()) {
+			return false;
+		}
+		final NodeKey that = (NodeKey) obj;
+		final boolean sameType = (type == null) ? that.type == null : type
+				.equals(that.type);
+		if (!sameType) {
+			return false;
+		}
+		return (value == null) ? that.value == null : value
+				.equals(that.value);
+	}
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,34 @@
+package org.apache.ctakes.ytex.kernel.tree;
+
+import java.util.Set;
+
+/**
+ * Simple bean pairing a node type name with a set of string values.
+ */
+public class NodeMappingInfo {
+	private String nodeType;
+	private Set<String> values;
+
+	/** Create an instance with both properties unset (null). */
+	public NodeMappingInfo() {
+	}
+
+	/**
+	 * @param nodeType
+	 *            node type name
+	 * @param values
+	 *            values associated with the node type; stored by reference
+	 */
+	public NodeMappingInfo(String nodeType, Set<String> values) {
+		this.nodeType = nodeType;
+		this.values = values;
+	}
+
+	/** @return the node type name */
+	public String getNodeType() {
+		return this.nodeType;
+	}
+
+	/** @param nodeType the node type name */
+	public void setNodeType(String nodeType) {
+		this.nodeType = nodeType;
+	}
+
+	/** @return the values associated with the node type */
+	public Set<String> getValues() {
+		return this.values;
+	}
+
+	/** @param values the values associated with the node type */
+	public void setValues(Set<String> values) {
+		this.values = values;
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,29 @@
+package org.apache.ctakes.ytex.kernel.tree;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Simple bean holding a query string, its named arguments, and the list of
+ * {@link NodeMappingInfo} entries associated with it.
+ */
+public class QueryMappingInfo {
+	String query;
+	Map<String, Object> queryArgs;
+	List<NodeMappingInfo> nodeTypes;
+
+	/** @return the query string */
+	public String getQuery() {
+		return this.query;
+	}
+
+	/** @param query the query string */
+	public void setQuery(String query) {
+		this.query = query;
+	}
+
+	/** @return the query arguments, keyed by name */
+	public Map<String, Object> getQueryArgs() {
+		return this.queryArgs;
+	}
+
+	/** @param queryArgs the query arguments, keyed by name */
+	public void setQueryArgs(Map<String, Object> queryArgs) {
+		this.queryArgs = queryArgs;
+	}
+
+	/** @return the node mapping entries */
+	public List<NodeMappingInfo> getNodeTypes() {
+		return this.nodeTypes;
+	}
+
+	/** @param nodeTypes the node mapping entries */
+	public void setNodeTypes(List<NodeMappingInfo> nodeTypes) {
+		this.nodeTypes = nodeTypes;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,56 @@
+package org.apache.ctakes.ytex.kernel.tree;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Simple bean describing a tree mapping: an instance id field, an instance
+ * query mapping, a list of node query mappings, and an optional prepare
+ * script together with its statement delimiter.
+ */
+public class TreeMappingInfo {
+	String instanceIDField;
+	QueryMappingInfo instanceQueryMappingInfo;
+	/** starts out as an empty list rather than null */
+	List<QueryMappingInfo> nodeQueryMappingInfos = new ArrayList<QueryMappingInfo>();
+	String prepareScript;
+	/** defaults to a semicolon */
+	String prepareScriptStatementDelimiter = ";";
+
+	/** @return name of the instance id field */
+	public String getInstanceIDField() {
+		return this.instanceIDField;
+	}
+
+	/** @param instanceIDField name of the instance id field */
+	public void setInstanceIDField(String instanceIDField) {
+		this.instanceIDField = instanceIDField;
+	}
+
+	/** @return the instance query mapping */
+	public QueryMappingInfo getInstanceQueryMappingInfo() {
+		return this.instanceQueryMappingInfo;
+	}
+
+	/** @param instanceQueryMappingInfo the instance query mapping */
+	public void setInstanceQueryMappingInfo(
+			QueryMappingInfo instanceQueryMappingInfo) {
+		this.instanceQueryMappingInfo = instanceQueryMappingInfo;
+	}
+
+	/** @return the node query mappings */
+	public List<QueryMappingInfo> getNodeQueryMappingInfos() {
+		return this.nodeQueryMappingInfos;
+	}
+
+	/** @param nodeQueryMappingInfos the node query mappings */
+	public void setNodeQueryMappingInfos(
+			List<QueryMappingInfo> nodeQueryMappingInfos) {
+		this.nodeQueryMappingInfos = nodeQueryMappingInfos;
+	}
+
+	/** @return the prepare script, or null if none was set */
+	public String getPrepareScript() {
+		return this.prepareScript;
+	}
+
+	/** @param prepareScript the prepare script */
+	public void setPrepareScript(String prepareScript) {
+		this.prepareScript = prepareScript;
+	}
+
+	/** @return delimiter between statements of the prepare script */
+	public String getPrepareScriptStatementDelimiter() {
+		return this.prepareScriptStatementDelimiter;
+	}
+
+	/**
+	 * @param prepareScriptStatementDelimiter
+	 *            delimiter between statements of the prepare script
+	 */
+	public void setPrepareScriptStatementDelimiter(
+			String prepareScriptStatementDelimiter) {
+		this.prepareScriptStatementDelimiter = prepareScriptStatementDelimiter;
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,44 @@
+package org.apache.ctakes.ytex.kernel.wsd;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityService.SimilarityMetricEnum;
+
+
+/**
+ * Picks one concept out of the candidate concepts of a named entity, using
+ * the concepts that surround it.
+ */
+public interface WordSenseDisambiguator {
+
+	/**
+	 * Disambiguate a named entity without specifying whether context concepts
+	 * are weighted by frequency. The parameters have the same meaning as in
+	 * {@link #disambiguate(List, int, Set, int, SimilarityMetricEnum, Map, boolean)}.
+	 * 
+	 * @param sentenceConcepts
+	 *            named entities from the document, represented as list of
+	 *            sets of concept ids
+	 * @param index
+	 *            index of target named entity to disambiguate
+	 * @param contextConcepts
+	 *            context concepts, e.g. from title
+	 * @param windowSize
+	 *            number of named entities on either side of target to use for
+	 *            disambiguation
+	 * @param metric
+	 *            metric to use
+	 * @param scoreMap
+	 *            optional to get the scores assigned to each concept
+	 * @return the selected concept, or null if no single concept could be
+	 *         selected
+	 */
+	String disambiguate(List<Set<String>> sentenceConcepts, int index,
+			Set<String> contextConcepts, int windowSize,
+			SimilarityMetricEnum metric, Map<String, Double> scoreMap);
+
+	/**
+	 * Disambiguate a named entity.
+	 * 
+	 * @param sentenceConcepts
+	 *            named entities from the document, represented as list of
+	 *            sets of concept ids
+	 * @param index
+	 *            index of target named entity to disambiguate
+	 * @param contextConcepts
+	 *            context concepts, e.g. from title
+	 * @param windowSize
+	 *            number of named entities on either side of target to use for
+	 *            disambiguation
+	 * @param metric
+	 *            metric to use
+	 * @param scoreMap
+	 *            optional to get the scores assigned to each concept
+	 * @param weighted
+	 *            to weight context concepts by frequency
+	 * @return highest scoring concept, or null if none of the target concepts
+	 *         are in the concept graph, or if all the target concepts have the
+	 *         same score
+	 */
+	String disambiguate(List<Set<String>> sentenceConcepts, int index,
+			Set<String> contextConcepts, int windowSize,
+			SimilarityMetricEnum metric, Map<String, Double> scoreMap,
+			boolean weighted);
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,171 @@
+package org.apache.ctakes.ytex.kernel.wsd;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+
+import org.apache.ctakes.ytex.kernel.metric.ConceptPairSimilarity;
+import org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityService;
+import org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityService.SimilarityMetricEnum;
+import org.apache.ctakes.ytex.kernel.model.ConcRel;
+
+import com.google.common.collect.SetMultimap;
+import com.google.common.collect.TreeMultimap;
+
+public class WordSenseDisambiguatorImpl implements WordSenseDisambiguator {
+	ConceptSimilarityService conceptSimilarityService;
+
+	public ConceptSimilarityService getConceptSimilarityService() {
+		return conceptSimilarityService;
+	}
+
+	public void setConceptSimilarityService(
+			ConceptSimilarityService conceptSimilarityService) {
+		this.conceptSimilarityService = conceptSimilarityService;
+	}
+
+	@Override
+	public String disambiguate(List<Set<String>> sentenceConcepts, int index,
+			Set<String> contextConcepts, int windowSize,
+			SimilarityMetricEnum metric, Map<String, Double> scoreMap) {
+		return disambiguate(sentenceConcepts, index, contextConcepts,
+				windowSize, metric, scoreMap, true);
+	}
+
+	/*
+	 * (non-Javadoc)
+	 * 
+	 * @see
+	 * org.apache.ctakes.ytex.kernel.wsd.WordSenseDisambiguator#disambiguate
+	 * (java.util.List, int, java.util.Set, int,
+	 * org.apache.ctakes.ytex.kernel.ConceptSimilarityService
+	 * .SimilarityMetricEnum, java.util.Map)
+	 */
+	@Override
+	public String disambiguate(List<Set<String>> sentenceConcepts, int index,
+			Set<String> contextConcepts, int windowSize,
+			SimilarityMetricEnum metric, Map<String, Double> scoreMap,
+			boolean weighted) {
+		// get the candidate concepts that we want to disambiguate
+		Set<String> candidateConcepts = sentenceConcepts.get(index);
+		if (candidateConcepts.size() == 1)
+			return candidateConcepts.iterator().next();
+		// allocate set to hold all the concepts to compare to
+		Map<String, Integer> windowContextConcepts = new HashMap<String, Integer>();
+		// add context concepts (e.g. title concepts)
+		if (contextConcepts != null) {
+			addConcepts(windowContextConcepts, contextConcepts);
+		}
+		// add windowSize concepts from the sentence
+		// get left, then right concepts
+		// case 1 - enough tokens on both sides
+		int indexLeftStart = index - windowSize - 1;
+		int indexRightStart = index + windowSize + 1;
+		if (indexLeftStart < 0) {
+			// case 2 - not enough tokens on left
+			indexRightStart += (-1 * indexLeftStart);
+			indexLeftStart = 0;
+		} else if (indexRightStart >= sentenceConcepts.size()) {
+			// case 3 - not enough tokens on right
+			indexLeftStart -= indexRightStart - sentenceConcepts.size() - 1;
+			indexRightStart = sentenceConcepts.size() - 1;
+		}
+		// make sure the range is in bounds
+		if (indexLeftStart < 0)
+			indexLeftStart = 0;
+		if (indexRightStart >= sentenceConcepts.size())
+			indexRightStart = sentenceConcepts.size() - 1;
+		// add the concepts in the ranges
+		if (indexLeftStart < index) {
+			for (Set<String> cs : sentenceConcepts.subList(indexLeftStart,
+					index)) {
+				addConcepts(windowContextConcepts, cs);
+			}
+		}
+		if (indexRightStart > index) {
+			for (Set<String> cs : sentenceConcepts.subList(index + 1,
+					indexRightStart + 1)) {
+				addConcepts(windowContextConcepts, cs);
+			}
+		}
+		// allocate map to hold scores
+		TreeMultimap<Double, String> scoreConceptMap = TreeMultimap.create();
+		for (String c : candidateConcepts) {
+			scoreConceptMap
+					.put(scoreConcept(c, windowContextConcepts, metric,
+							weighted), c);
+		}
+		// if scoreMap is not null, fill it in with the concept scores - invert
+		// scoreConceptMap
+		boolean bNonZero = false;
+		if (scoreMap != null) {
+			for (Map.Entry<Double, String> scoreConcept : scoreConceptMap
+					.entries()) {
+				scoreMap.put(scoreConcept.getValue(), scoreConcept.getKey());
+			}
+		}
+		SortedSet<String> bestConcepts = scoreConceptMap.get(scoreConceptMap
+				.keySet().last());
+		String bestConcept = null;
+		if (bestConcepts.size() == 1) {
+			// only 1 concept with high score
+			bestConcept = bestConcepts.iterator().next();
+		} else if (bestConcepts.size() == candidateConcepts.size()) {
+			// all concepts have same score
+			bestConcept = null;
+		} else {
+			// multiple best candidates - pick concept with lowest ic - most
+			// general concept
+			double ic = 1e6;
+			Map<String, ConcRel> conceptMap = this
+					.getConceptSimilarityService().getConceptGraph()
+					.getConceptMap();
+			for (String c : bestConcepts) {
+				ConcRel cr = conceptMap.get(c);
+				if (cr != null && cr.getIntrinsicInfoContent() < ic) {
+					ic = cr.getIntrinsicInfoContent();
+					bestConcept = c;
+				}
+			}
+		}
+		// get the best scoring concept
+		return bestConcept;
+	}
+
+	private void addConcepts(Map<String, Integer> windowContextConcepts,
+			Set<String> contextConcepts) {
+		for (String c : contextConcepts) {
+			Integer cn = windowContextConcepts.get(c);
+			if (cn != null) {
+				windowContextConcepts.put(c, cn + 1);
+			} else {
+				windowContextConcepts.put(c, 1);
+			}
+		}
+	}
+
+	private double scoreConcept(String concept,
+			Map<String, Integer> windowContextConcepts,
+			SimilarityMetricEnum metric, boolean weighted) {
+		List<SimilarityMetricEnum> metrics = Arrays.asList(metric);
+		double score = 0d;
+		for (Map.Entry<String, Integer> windowConcept : windowContextConcepts
+				.entrySet()) {
+			ConceptPairSimilarity csim = conceptSimilarityService.similarity(
+					metrics, concept, windowConcept.getKey(), null, false);
+			if (weighted)
+				score += csim.getSimilarities().get(0)
+						* windowConcept.getValue().doubleValue();
+			else
+				score += csim.getSimilarities().get(0);
+		}
+		return score;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,193 @@
+package org.apache.ctakes.ytex.libsvm;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedMap;
+
+import org.apache.ctakes.ytex.kernel.BaseSparseDataFormatter;
+import org.apache.ctakes.ytex.kernel.FileUtil;
+import org.apache.ctakes.ytex.kernel.InstanceData;
+import org.apache.ctakes.ytex.kernel.KernelUtil;
+import org.apache.ctakes.ytex.kernel.SparseData;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatter;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatterFactory;
+
+import com.google.common.collect.BiMap;
+
+
+/**
+ * Factory for {@link LibSVMFormatter}, which writes sparse data in libsvm
+ * format together with a parallel file of instance ids.
+ */
+public class LibSVMFormatterFactory implements SparseDataFormatterFactory {
+	KernelUtil kernelUtil;
+
+	public KernelUtil getKernelUtil() {
+		return kernelUtil;
+	}
+
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * Create a new {@link LibSVMFormatter} that uses this factory's
+	 * {@link KernelUtil}.
+	 * 
+	 * @see org.apache.ctakes.ytex.kernel.SparseDataFormatterFactory#getFormatter()
+	 */
+	@Override
+	public SparseDataFormatter getFormatter() {
+		return new LibSVMFormatter(getKernelUtil());
+	}
+
+	public static class LibSVMFormatter extends BaseSparseDataFormatter {
+		/** Delegates to the base class without additional work. */
+		@Override
+		public void initializeExport(InstanceData instanceLabel,
+				Properties properties, SparseData sparseData)
+				throws IOException {
+			super.initializeExport(instanceLabel, properties, sparseData);
+		}
+
+		public LibSVMFormatter(KernelUtil kernelUtil) {
+			super(kernelUtil);
+		}
+
+		/**
+		 * export the class ids of the given label to the output directory
+		 */
+		@Override
+		public void initializeLabel(
+				String label,
+				SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>> labelInstances,
+				Properties properties, SparseData sparseData)
+				throws IOException {
+			kernelUtil.exportClassIds(this.outdir,
+					this.labelToClassIndexMap.get(label), label);
+		}
+
+		/**
+		 * write a file with the attribute names corresponding to the indices in
+		 * the libsvm data file
+		 */
+		@Override
+		public void initializeFold(SparseData sparseData, String label,
+				Integer run, Integer fold,
+				SortedMap<Boolean, SortedMap<Long, String>> foldInstanceLabelMap)
+				throws IOException {
+			exportAttributeNames(sparseData, label, run, fold);
+		}
+
+		/**
+		 * export the given train/test set
+		 */
+		@Override
+		public void exportFold(SparseData sparseData,
+				SortedMap<Long, String> instanceClassMap, boolean train,
+				String label, Integer run, Integer fold) throws IOException {
+			String prefix = FileUtil.getDataFilePrefix(outdir, label, run,
+					fold, train);
+			String filename = prefix + "_data.txt";
+			String idFilename = FileUtil.getDataFilePrefix(outdir, label, run,
+					fold, train) + "_id.txt";
+			exportDataForLabel(filename, idFilename, sparseData,
+					instanceClassMap, this.labelToClassIndexMap.get(label));
+		}
+
+		/**
+		 * Export data file and id file. Both writers are closed even if
+		 * closing one of them fails.
+		 * 
+		 * @param filename
+		 *            path of the libsvm data file to write
+		 * @param idFilename
+		 *            path of the instance id file to write
+		 * @param bagOfWordsData
+		 *            data to be exported
+		 * @param instanceClassMap
+		 *            instance ids - class name map
+		 * @param classToIndexMap
+		 *            class name - class id map
+		 * @throws IOException
+		 */
+		protected void exportDataForLabel(String filename, String idFilename,
+				SparseData bagOfWordsData,
+				SortedMap<Long, String> instanceClassMap,
+				BiMap<String, Integer> classToIndexMap) throws IOException {
+			BufferedWriter wData = new BufferedWriter(new FileWriter(filename));
+			try {
+				BufferedWriter wId = new BufferedWriter(new FileWriter(
+						idFilename));
+				try {
+					exportDataForInstances(bagOfWordsData, instanceClassMap,
+							classToIndexMap, wData, wId);
+				} finally {
+					wId.close();
+				}
+			} finally {
+				// runs even when wId.close() threw, so wData cannot leak
+				wData.close();
+			}
+		}
+
+		/**
+		 * Write one data line and one id line per instance.
+		 * 
+		 * @param bagOfWordsData
+		 *            data to be exported
+		 * @param instanceClassMap
+		 *            instance ids - class name map
+		 * @param classToIndexMap
+		 *            class name - class id map
+		 * @param wData
+		 *            file to write data to
+		 * @param wId
+		 *            file to write ids to
+		 * @return list of instance ids corresponding to order with which they
+		 *         were exported
+		 * @throws IOException
+		 */
+		protected List<Long> exportDataForInstances(SparseData bagOfWordsData,
+				SortedMap<Long, String> instanceClassMap,
+				BiMap<String, Integer> classToIndexMap, BufferedWriter wData,
+				BufferedWriter wId) throws IOException {
+			List<Long> instanceIds = new ArrayList<Long>();
+			for (Map.Entry<Long, String> instanceClass : instanceClassMap
+					.entrySet()) {
+				long instanceId = instanceClass.getKey();
+				instanceIds.add(instanceId);
+				// allocate line with sparse attribute indices and values
+				SortedMap<Integer, Double> instanceValues = getSparseLineValues(
+						bagOfWordsData, numericAttributeMap,
+						nominalAttributeMap, instanceId);
+				// look up the class id for this instance's class name
+				int classId = classToIndexMap.get(instanceClass.getValue());
+				// write id to id file
+				wId.write(Long.toString(instanceId));
+				wId.newLine();
+				// data file: class id first, then the attributes
+				wData.write(Integer.toString(classId));
+				writeLibsvmLine(wData, instanceValues);
+			}
+			return instanceIds;
+		}
+
+		/**
+		 * Append tab-separated {@code index:value} pairs for the given
+		 * attributes and terminate the line.
+		 */
+		protected void writeLibsvmLine(BufferedWriter wData,
+				SortedMap<Integer, Double> instanceValues) throws IOException {
+			for (SortedMap.Entry<Integer, Double> instanceValue : instanceValues
+					.entrySet()) {
+				wData.write("\t");
+				wData.write(Integer.toString(instanceValue.getKey()));
+				wData.write(":");
+				wData.write(Double.toString(instanceValue.getValue()));
+			}
+			wData.newLine();
+		}
+
+		/**
+		 * clean up fold specific state
+		 */
+		@Override
+		public void clearFold() {
+			this.numericAttributeMap.clear();
+			this.nominalAttributeMap.clear();
+		}
+
+		/** no label specific state to clean up */
+		@Override
+		public void clearLabel() {
+		}
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,10 @@
+package org.apache.ctakes.ytex.libsvm;
+
+import java.io.IOException;
+import java.util.Properties;
+
+/**
+ * Exports a gram matrix for use with libsvm.
+ */
+public interface LibSVMGramMatrixExporter {
+
+	/**
+	 * Export the gram matrix described by the given properties.
+	 * 
+	 * @param props
+	 *            export configuration
+	 * @throws IOException
+	 *             if the export fails with an I/O error
+	 */
+	void exportGramMatrix(Properties props) throws IOException;
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,510 @@
+package org.apache.ctakes.ytex.libsvm;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.ctakes.ytex.kernel.FileUtil;
+import org.apache.ctakes.ytex.kernel.InstanceData;
+import org.apache.ctakes.ytex.kernel.KernelContextHolder;
+import org.apache.ctakes.ytex.kernel.KernelUtil;
+import org.apache.ctakes.ytex.kernel.dao.KernelEvaluationDao;
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.transaction.PlatformTransactionManager;
+
+import com.google.common.collect.BiMap;
+
+
+/**
+ * export gram matrix for libsvm. input properties file with following keys:
+ * <p/>
+ * <li>kernel.name name of kernel evaluation (corresponds to name column in
+ * kernel_eval table) - required
+ * <li>outdir directory where files will be placed - optional, defaults to
+ * current directory
+ * <p/>
+ * Output to outdir following files:
+ * <li>train_data.txt - for each class label, a symmetric gram matrix for
+ * training instances
+ * <li>train_id.txt - instance ids corresponding to rows of training gram matrix
+ * <li>test_data.txt - for each class label, a rectangular matrix of the test
+ * instances kernel evaluations wrt training instances
+ * <li>test_id.txt - instance ids corresponding to rows of test gram matrix
+ * 
+ * @author vijay
+ */
+public class LibSVMGramMatrixExporterImpl implements LibSVMGramMatrixExporter {
+	/**
+	 * Command line entry point. Requires the option {@code -prop} naming a
+	 * property file, looks up the bean {@code libSVMGramMatrixExporter} and
+	 * runs the export with the loaded properties. Prints usage help if the
+	 * command line cannot be parsed.
+	 */
+	@SuppressWarnings("static-access")
+	public static void main(String args[]) throws IOException {
+		final Options cliOptions = new Options();
+		cliOptions.addOption(OptionBuilder
+				.withArgName("prop")
+				.hasArg()
+				.isRequired()
+				.withDescription(
+						"property file with queries and other kernel parameters")
+				.create("prop"));
+		try {
+			final CommandLine commandLine = new GnuParser().parse(cliOptions,
+					args);
+			final Object bean = KernelContextHolder.getApplicationContext()
+					.getBean("libSVMGramMatrixExporter");
+			final LibSVMGramMatrixExporter gramExporter = (LibSVMGramMatrixExporter) bean;
+			// properties are loaded only after the bean lookup succeeded
+			final Properties exportProps = FileUtil.loadProperties(
+					commandLine.getOptionValue("prop"), true);
+			gramExporter.exportGramMatrix(exportProps);
+		} catch (ParseException pe) {
+			final String usage = "java "
+					+ LibSVMGramMatrixExporterImpl.class.getName()
+					+ " export gram matrix in libsvm format";
+			new HelpFormatter().printHelp(usage, cliOptions);
+		}
+	}
+
+	private JdbcTemplate jdbcTemplate = null; // source of the DataSource returned by getDataSource()
+	private KernelEvaluationDao kernelEvaluationDao = null; // exposed via getKernelEvaluationDao()
+	private KernelUtil kernelUtil; // loads instances and exports class ids (see exportGramMatrix / exportGramMatrices)
+	private LibSVMUtil libsvmUtil; // exposed via getLibsvmUtil()
+
+	private PlatformTransactionManager transactionManager; // exposed via getTransactionManager()
+
+	/**
+	 * export the train or test gram matrix. the train gram matrix is square and
+	 * symmetric. the test gram matrix is rectangular - each column corresponds
+	 * to a training instance each row corresponds to a test instance.
+	 * <p/>
+	 * Writes {@code <filePrefix>_data.txt} (one row per instance: class index,
+	 * {@code 0:<row number>}, then {@code <column number>:<value>} pairs) and
+	 * {@code <filePrefix>_id.txt} (one instance id per row, same order). Both
+	 * writers are closed even if closing one of them fails.
+	 * 
+	 * @param gramMatrix
+	 *            square symmetric matrix with all available instance data
+	 * @param instanceIdToClassMap
+	 *            folds: key true holds the training instances, key false the
+	 *            test instances, each mapping instance id to class
+	 * @param train
+	 *            true - export train set, false - export test set
+	 * @param mapInstanceIdToIndex
+	 *            map of instance id to index in gramMatrix
+	 * @param filePrefix
+	 *            - prefix to which we add _data.txt and _id.txt
+	 * @param mapClassToIndex
+	 *            map of class to the class index written to the data file
+	 * @throws IOException
+	 */
+	private void exportFold(double[][] gramMatrix,
+			Map<Boolean, SortedMap<Long, String>> instanceIdToClassMap,
+			boolean train, Map<Long, Integer> mapInstanceIdToIndex,
+			String filePrefix, Map<String, Integer> mapClassToIndex)
+			throws IOException {
+		String fileName = new StringBuilder(filePrefix).append("_data.txt")
+				.toString();
+		String idFileName = new StringBuilder(filePrefix).append("_id.txt")
+				.toString();
+		// for both training and test sets, the column instance ids
+		// are the training instance ids. This is already sorted,
+		// but we stuff it in a list, so make sure it is sorted
+		// the order has to be the same in both the train and test files
+		List<Long> colInstanceIds = new ArrayList<Long>(instanceIdToClassMap
+				.get(true).keySet());
+		Collections.sort(colInstanceIds);
+		// the rows - train or test instance ids and their class labels
+		SortedMap<Long, String> rowInstanceToClassMap = instanceIdToClassMap
+				.get(train);
+		BufferedWriter w = new BufferedWriter(new FileWriter(fileName));
+		try {
+			BufferedWriter wId = new BufferedWriter(new FileWriter(idFileName));
+			try {
+				int rowIndex = 0;
+				for (Map.Entry<Long, String> instanceClass : rowInstanceToClassMap
+						.entrySet()) {
+					// classId - we assume that this value is valid for libsvm
+					// this can be a real number (for regression)
+					String classId = instanceClass.getValue();
+					// the instance id of this row
+					long rowInstanceId = instanceClass.getKey();
+					// the index to gramMatrix corresponding to this instance
+					int rowInstanceIndex = mapInstanceIdToIndex
+							.get(rowInstanceId);
+					// write class Id
+					w.write(mapClassToIndex.get(classId).toString());
+					w.write("\t");
+					// write row number - libsvm uses 1-based indexing
+					w.write("0:");
+					w.write(Integer.toString(rowIndex + 1));
+					// write column entries
+					for (int columnIndex = 0; columnIndex < colInstanceIds
+							.size(); columnIndex++) {
+						// column instance id
+						long colInstanceId = colInstanceIds.get(columnIndex);
+						// index into gram matrix for this instance id
+						int colInstanceIndex = mapInstanceIdToIndex
+								.get(colInstanceId);
+						w.write("\t");
+						// write column number
+						w.write(Integer.toString(columnIndex + 1));
+						w.write(":");
+						// write value - gramMatrix is symmetric, so this will
+						// work both ways
+						w.write(Double
+								.toString(gramMatrix[rowInstanceIndex][colInstanceIndex]));
+					}
+					// don't want carriage return, even on windows
+					w.write("\n");
+					// increment the row number
+					rowIndex++;
+					// write id to file
+					wId.write(Long.toString(rowInstanceId));
+					wId.write("\n");
+				}
+			} finally {
+				wId.close();
+			}
+		} finally {
+			// runs even when wId.close() threw, so w cannot leak
+			w.close();
+		}
+
+	}
+
+	/**
+	 * Load the gram matrix based on scope. Write the gram matrix for each fold.
+	 * Generate 4 files per fold: train_data.txt, train_id.txt, test_data.txt,
+	 * test_id.txt.
+	 * 
+	 */
+	private void exportGramMatrices(String name, String experiment,
+			double param1, String param2, String scope, String splitName,
+			String outdir, InstanceData instanceData,
+			Map<String, BiMap<String, Integer>> labelToClassIndexMap)
+			throws IOException {
+		// the full, symmetric gram matrix
+		double[][] gramMatrix = null;
+		// the set of all instance ids
+		SortedSet<Long> instanceIds = new TreeSet<Long>();
+		// map of instance id to index in gramMatrix
+		Map<Long, Integer> mapInstanceIdToIndex = new HashMap<Long, Integer>();
+		if (scope == null || scope.length() == 0) {
+			// empty scope - load gram matrix
+			gramMatrix = loadGramMatrix(name, experiment, param1, param2,
+					splitName, null, 0, 0, instanceData, instanceIds,
+					mapInstanceIdToIndex);
+			if (gramMatrix == null)
+				return;
+		}
+		for (String label : instanceData.getLabelToInstanceMap().keySet()) {
+			if ("label".equals(scope)) {
+				// label scope - load gram matrix
+				gramMatrix = loadGramMatrix(name, experiment, param1, param2,
+						splitName, label, 0, 0, instanceData, instanceIds,
+						mapInstanceIdToIndex);
+				if (gramMatrix == null)
+					return;
+			}
+			// write the properties file with the class id to class name map
+			kernelUtil.exportClassIds(outdir, labelToClassIndexMap.get(label),
+					label);
+			for (int run : instanceData.getLabelToInstanceMap().get(label)
+					.keySet()) {
+				for (int fold : instanceData.getLabelToInstanceMap().get(label)
+						.get(run).keySet()) {
+					if ("fold".equals(scope)) {
+						// fold scope - load gram matrix
+						gramMatrix = loadGramMatrix(name, experiment, param1,
+								param2, splitName, label, run, fold,
+								instanceData, instanceIds, mapInstanceIdToIndex);
+					}
+					if (gramMatrix != null) {
+						// get folds
+						Map<Boolean, SortedMap<Long, String>> foldMap = instanceData
+								.getLabelToInstanceMap().get(label).get(run)
+								.get(fold);
+						// export training fold
+						exportFold(gramMatrix, foldMap, true,
+								mapInstanceIdToIndex,
+								FileUtil.getDataFilePrefix(outdir, label, run,
+										fold, true),
+								labelToClassIndexMap.get(label));
+						// export test fold
+						exportFold(gramMatrix, foldMap, false,
+								mapInstanceIdToIndex,
+								FileUtil.getDataFilePrefix(outdir, label, run,
+										fold, false),
+								labelToClassIndexMap.get(label));
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Export gram matrices as configured by the given properties.
+	 * <p>
+	 * Keys read from <code>props</code>:
+	 * <ul>
+	 * <li>org.apache.ctakes.ytex.corpusName
+	 * <li>org.apache.ctakes.ytex.experiment
+	 * <li>org.apache.ctakes.ytex.param1 - parsed as a double, "0" if absent
+	 * <li>org.apache.ctakes.ytex.param2
+	 * <li>org.apache.ctakes.ytex.splitName
+	 * <li>scope
+	 * <li>instanceClassQuery - handed to KernelUtil.loadInstances
+	 * <li>outdir
+	 * </ul>
+	 * 
+	 * @param props
+	 *            export settings
+	 * @throws IOException
+	 *             propagated from exportGramMatrices
+	 * @see org.apache.ctakes.ytex.libsvm.LibSVMGramMatrixExporter#exportGramMatrix(java.util.Properties)
+	 */
+	public void exportGramMatrix(Properties props) throws IOException {
+		// identification of the kernel evaluation
+		final String corpusName = props
+				.getProperty("org.apache.ctakes.ytex.corpusName");
+		final String experimentName = props
+				.getProperty("org.apache.ctakes.ytex.experiment");
+		final String split = props
+				.getProperty("org.apache.ctakes.ytex.splitName");
+		// kernel parameters; param1 is parsed before any data is loaded
+		final String kernelParam2 = props
+				.getProperty("org.apache.ctakes.ytex.param2");
+		final double kernelParam1 = Double.parseDouble(props.getProperty(
+				"org.apache.ctakes.ytex.param1", "0"));
+		// export options
+		final String matrixScope = props.getProperty("scope");
+		final String targetDir = props.getProperty("outdir");
+		// load the instances, then derive the per-label class index maps
+		final InstanceData instances = this.getKernelUtil().loadInstances(
+				props.getProperty("instanceClassQuery"));
+		final Map<String, BiMap<String, Integer>> classIndexByLabel = new HashMap<String, BiMap<String, Integer>>();
+		kernelUtil.fillLabelToClassToIndexMap(instances.getLabelToClassMap(),
+				classIndexByLabel);
+		exportGramMatrices(corpusName, experimentName, kernelParam1,
+				kernelParam2, matrixScope, split, targetDir, instances,
+				classIndexByLabel);
+	}
+
+	/**
+	 * @return the data source of the JdbcTemplate held by this object (the
+	 *         template is created in setDataSource)
+	 */
+	public DataSource getDataSource() {
+		return jdbcTemplate.getDataSource();
+	}
+
+	/**
+	 * @return the kernelEvaluationDao assigned via setKernelEvaluationDao
+	 */
+	public KernelEvaluationDao getKernelEvaluationDao() {
+		return kernelEvaluationDao;
+	}
+
+	/**
+	 * @return the kernelUtil assigned via setKernelUtil; used to load
+	 *         instances and gram matrices
+	 */
+	public KernelUtil getKernelUtil() {
+		return kernelUtil;
+	}
+
+	/**
+	 * @return the libsvmUtil assigned via setLibsvmUtil
+	 */
+	public LibSVMUtil getLibsvmUtil() {
+		return libsvmUtil;
+	}
+
+	/**
+	 * @return the transactionManager assigned via setTransactionManager
+	 */
+	public PlatformTransactionManager getTransactionManager() {
+		return transactionManager;
+	}
+
+	/**
+	 * Reset and refill the caller-supplied id collections for the given
+	 * label/run/fold, then load the corresponding gram matrix.
+	 * <p>
+	 * After the call, <code>instanceIds</code> holds the ids returned by
+	 * instanceData.getAllInstanceIds(label, run, fold) and
+	 * <code>mapInstanceIdToIndex</code> maps each of them to its 0-based
+	 * position in the iteration order of <code>instanceIds</code>.
+	 * 
+	 * @return whatever kernelUtil.loadGramMatrix returns for these parameters
+	 *         (callers in this class treat null as "no matrix available")
+	 */
+	private double[][] loadGramMatrix(String name, String experiment,
+			double param1, String param2, String splitName, String label,
+			int run, int fold, InstanceData instanceData,
+			SortedSet<Long> instanceIds, Map<Long, Integer> mapInstanceIdToIndex) {
+		// discard whatever a previous load left behind
+		instanceIds.clear();
+		mapInstanceIdToIndex.clear();
+		// collect the ids relevant for this label/run/fold
+		instanceIds.addAll(instanceData.getAllInstanceIds(label, run, fold));
+		// number the ids in iteration order
+		int nextIndex = 0;
+		for (final long id : instanceIds) {
+			mapInstanceIdToIndex.put(id, nextIndex);
+			nextIndex++;
+		}
+		return this.kernelUtil.loadGramMatrix(instanceIds, name, splitName,
+				experiment, label, run, fold, param1, param2);
+	}
+
+	/**
+	 * Wrap the given data source in a new JdbcTemplate, replacing any template
+	 * held so far.
+	 * 
+	 * @param dataSource
+	 *            data source to use for database access
+	 */
+	public void setDataSource(DataSource dataSource) {
+		this.jdbcTemplate = new JdbcTemplate(dataSource);
+	}
+
+	/**
+	 * @param kernelEvaluationDao
+	 *            dao to be returned by getKernelEvaluationDao
+	 */
+	public void setKernelEvaluationDao(KernelEvaluationDao kernelEvaluationDao) {
+		this.kernelEvaluationDao = kernelEvaluationDao;
+	}
+
+	// private void exportFold(String name, String experiment, String outdir,
+	// InstanceData instanceData, String label, int run, int fold,
+	// double param1, String param2) throws IOException {
+	// SortedMap<Integer, String> trainInstanceLabelMap = instanceData
+	// .getLabelToInstanceMap().get(label).get(run).get(fold)
+	// .get(true);
+	// SortedMap<Integer, String> testInstanceLabelMap = instanceData
+	// .getLabelToInstanceMap().get(label).get(run).get(fold)
+	// .get(false);
+	// double[][] trainGramMatrix = new
+	// double[trainInstanceLabelMap.size()][trainInstanceLabelMap
+	// .size()];
+	// double[][] testGramMatrix = null;
+	// if (testInstanceLabelMap != null) {
+	// testGramMatrix = new
+	// double[testInstanceLabelMap.size()][trainInstanceLabelMap
+	// .size()];
+	// }
+	// KernelEvaluation kernelEval = this.kernelEvaluationDao.getKernelEval(
+	// name, experiment, label, 0, param1, param2);
+	// kernelUtil.fillGramMatrix(kernelEval, new TreeSet<Integer>(
+	// trainInstanceLabelMap.keySet()), trainGramMatrix,
+	// testInstanceLabelMap != null ? new TreeSet<Integer>(
+	// testInstanceLabelMap.keySet()) : null, testGramMatrix);
+	// outputGramMatrix(kernelEval, trainInstanceLabelMap, trainGramMatrix,
+	// FileUtil.getDataFilePrefix(outdir, label, run, fold,
+	// testInstanceLabelMap != null ? true : null));
+	// if (testGramMatrix != null) {
+	// outputGramMatrix(kernelEval, testInstanceLabelMap, testGramMatrix,
+	// FileUtil.getDataFilePrefix(outdir, label, run, fold, false));
+	// }
+	// }
+	//
+	// private void outputGramMatrix(KernelEvaluation kernelEval,
+	// SortedMap<Integer, String> instanceLabelMap, double[][] gramMatrix,
+	// String dataFilePrefix) throws IOException {
+	// StringBuilder bFileName = new StringBuilder(dataFilePrefix)
+	// .append("_data.txt");
+	// StringBuilder bIdFileName = new StringBuilder(dataFilePrefix)
+	// .append("_id.txt");
+	// BufferedWriter w = null;
+	// BufferedWriter wId = null;
+	// try {
+	// w = new BufferedWriter(new FileWriter(bFileName.toString()));
+	// wId = new BufferedWriter(new FileWriter(bIdFileName.toString()));
+	// int rowIndex = 0;
+	// // the rows in the gramMatrix correspond to the entries in the
+	// // instanceLabelMap
+	// // both are in the same order
+	// for (Map.Entry<Integer, String> instanceClass : instanceLabelMap
+	// .entrySet()) {
+	// // default the class Id to 0
+	// String classId = instanceClass.getValue();
+	// int instanceId = instanceClass.getKey();
+	// // write class Id
+	// w.write(classId);
+	// w.write("\t");
+	// // write row number - libsvm uses 1-based indexing
+	// w.write("0:");
+	// w.write(Integer.toString(rowIndex + 1));
+	// // write column entries
+	// for (int columnIndex = 0; columnIndex < gramMatrix[rowIndex].length;
+	// columnIndex++) {
+	// w.write("\t");
+	// // write column number
+	// w.write(Integer.toString(columnIndex + 1));
+	// w.write(":");
+	// // write value
+	// w.write(Double.toString(gramMatrix[rowIndex][columnIndex]));
+	// }
+	// w.newLine();
+	// // increment the row number
+	// rowIndex++;
+	// // write id file
+	// wId.write(Integer.toString(instanceId));
+	// wId.newLine();
+	// }
+	// } finally {
+	// if (w != null)
+	// w.close();
+	// if (wId != null)
+	// wId.close();
+	// }
+	// }
+
+	// /**
+	// * instantiate gram matrices, generate output files
+	// *
+	// * @param name
+	// * @param testInstanceQuery
+	// * @param trainInstanceQuery
+	// * @param outdir
+	// * @throws IOException
+	// */
+	// private void exportGramMatrices(String name, String testInstanceQuery,
+	// String trainInstanceQuery, String outdir) throws IOException {
+	// Set<String> labels = new HashSet<String>();
+	// SortedMap<Integer, Map<String, Integer>> trainInstanceLabelMap =
+	// libsvmUtil
+	// .loadClassLabels(trainInstanceQuery, labels);
+	// double[][] trainGramMatrix = new
+	// double[trainInstanceLabelMap.size()][trainInstanceLabelMap
+	// .size()];
+	// SortedMap<Integer, Map<String, Integer>> testInstanceLabelMap = null;
+	// double[][] testGramMatrix = null;
+	// if (testInstanceQuery != null) {
+	// testInstanceLabelMap = libsvmUtil.loadClassLabels(
+	// testInstanceQuery, labels);
+	// testGramMatrix = new
+	// double[testInstanceLabelMap.size()][trainInstanceLabelMap
+	// .size()];
+	// }
+	// // fillGramMatrix(name, trainInstanceLabelMap, trainGramMatrix,
+	// // testInstanceLabelMap, testGramMatrix);
+	// for (String label : labels) {
+	// outputGramMatrix(name, outdir, label, trainInstanceLabelMap,
+	// trainGramMatrix, "training");
+	// if (testGramMatrix != null) {
+	// outputGramMatrix(name, outdir, label, testInstanceLabelMap,
+	// testGramMatrix, "test");
+	// }
+	// }
+	// libsvmUtil.outputInstanceIds(outdir, trainInstanceLabelMap, "training");
+	// if (testInstanceLabelMap != null)
+	// libsvmUtil.outputInstanceIds(outdir, testInstanceLabelMap, "test");
+	// }
+
+	// private void outputGramMatrix(String name, String outdir, String label,
+	// SortedMap<Integer, Map<String, Integer>> instanceLabelMap,
+	// double[][] gramMatrix, String type) throws IOException {
+	// StringBuilder bFileName = new StringBuilder(outdir)
+	// .append(File.separator).append(type).append("_data_")
+	// .append(label).append(".txt");
+	// BufferedWriter w = null;
+	// try {
+	// w = new BufferedWriter(new FileWriter(bFileName.toString()));
+	// int rowIndex = 0;
+	// // the rows in the gramMatrix correspond to the entries in the
+	// // instanceLabelMap
+	// // both are in the same order
+	// for (Map.Entry<Integer, Map<String, Integer>> instanceLabels :
+	// instanceLabelMap
+	// .entrySet()) {
+	// // default the class Id to 0
+	// int classId = 0;
+	// if (instanceLabels.getValue() != null
+	// && instanceLabels.getValue().containsKey(label)) {
+	// classId = instanceLabels.getValue().get(label);
+	// }
+	// // write class Id
+	// w.write(Integer.toString(classId));
+	// w.write("\t");
+	// // write row number - libsvm uses 1-based indexing
+	// w.write("0:");
+	// w.write(Integer.toString(rowIndex + 1));
+	// // write column entries
+	// for (int columnIndex = 0; columnIndex < gramMatrix[rowIndex].length;
+	// columnIndex++) {
+	// w.write("\t");
+	// // write column number
+	// w.write(Integer.toString(columnIndex + 1));
+	// w.write(":");
+	// // write value
+	// w.write(Double.toString(gramMatrix[rowIndex][columnIndex]));
+	// }
+	// w.newLine();
+	// // increment the row number
+	// rowIndex++;
+	// }
+	// } finally {
+	// if (w != null)
+	// w.close();
+	// }
+	// }
+
+	/**
+	 * @param kernelUtil
+	 *            helper used to load instances and gram matrices
+	 */
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * @param libsvmUtil
+	 *            helper to be returned by getLibsvmUtil
+	 */
+	public void setLibsvmUtil(LibSVMUtil libsvmUtil) {
+		this.libsvmUtil = libsvmUtil;
+	}
+
+	/**
+	 * @param transactionManager
+	 *            transaction manager to be returned by getTransactionManager
+	 */
+	public void setTransactionManager(
+			PlatformTransactionManager transactionManager) {
+		this.transactionManager = transactionManager;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,376 @@
+package org.apache.ctakes.ytex.libsvm;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.ytex.kernel.BaseClassifierEvaluationParser;
+import org.apache.ctakes.ytex.kernel.model.ClassifierEvaluation;
+import org.apache.ctakes.ytex.kernel.model.ClassifierInstanceEvaluation;
+import org.apache.ctakes.ytex.kernel.model.SVMClassifierEvaluation;
+
+
+public class LibSVMParser extends BaseClassifierEvaluationParser {
+	/**
+	 * matches "labels" followed by whitespace; group 1 is the remainder of the
+	 * line. Applied by parseClassIds to the first line of the prediction file.
+	 */
+	public static Pattern labelsPattern = Pattern.compile("labels\\s+(.*)");
+	/**
+	 * matches "total_sv &lt;digits&gt;"; group 1 is the number. Applied by
+	 * parseModel to the lines of the model file.
+	 */
+	public static Pattern totalSVPattern = Pattern.compile("total_sv (\\d+)");
+	/** "-t &lt;digit&gt;" option; group 1 is stored as the kernel by parseOptions */
+	public static Pattern pKernel = Pattern.compile("-t\\s+(\\d)");
+	/** "-g &lt;number&gt;" option; group 1 is stored as gamma by parseOptions */
+	public static Pattern pGamma = Pattern.compile("-g\\s+([\\d\\.eE-]+)");
+	/** "-c &lt;number&gt;" option; group 1 is stored as cost by parseOptions */
+	public static Pattern pCost = Pattern.compile("-c\\s+([\\d\\.eE-]+)");
+	/**
+	 * "-w", an optional "-", one digit, whitespace, then a run of digits/dots.
+	 * There is no capture group: parseWeight uses the whole match.
+	 */
+	public static Pattern pWeight = Pattern
+			.compile("-w-{0,1}\\d\\s+[\\d\\.]+\\b");
+	/** "-d &lt;digits&gt;" option; group 1 is stored as degree by parseOptions */
+	public static Pattern pDegree = Pattern.compile("-d\\s+(\\d+)");
+
+	/**
+	 * parse svm-train model file to get the number of support vectors. Needed
+	 * for model selection
+	 * 
+	 * @param modelFile
+	 *            path of the libsvm model file
+	 * @return the number from the first line containing
+	 *         "total_sv &lt;n&gt;", or null if no line matches
+	 * @throws IOException
+	 *             if the model file cannot be opened or read
+	 */
+	public Integer parseModel(String modelFile) throws IOException {
+		BufferedReader r = null;
+		try {
+			r = new BufferedReader(new FileReader(modelFile));
+			String line = null;
+			while ((line = r.readLine()) != null) {
+				Matcher m = totalSVPattern.matcher(line);
+				if (m.find()) {
+					// valueOf replaces the deprecated Integer(String)
+					// constructor; parsing rules are the same
+					return Integer.valueOf(m.group(1));
+				}
+			}
+		} finally {
+			// a failure to close is reported but must not mask the result
+			try {
+				if (r != null)
+					r.close();
+			} catch (Exception e) {
+				System.err.println("reading model file");
+				e.printStackTrace(System.err);
+			}
+		}
+		return null;
+	}
+
+	// /**
+	// * Parse svm-predict input (instance file) and predictions (prediction
+	// file)
+	// *
+	// * @param predictionFile
+	// * @param instanceFile
+	// * @return
+	// * @throws Exception
+	// * @throws IOException
+	// */
+	// public ClassifierEvaluationResults parse(String predictionFile,
+	// String instanceFile, Properties props) throws IOException {
+	// ClassifierEvaluationResults results = new ClassifierEvaluationResults();
+	// List<ClassifierEvaluationResult> listResults = new
+	// ArrayList<ClassifierEvaluationResult>();
+	// results.setResults(listResults);
+	// BufferedReader instanceReader = null;
+	// BufferedReader predictionReader = null;
+	// try {
+	// instanceReader = new BufferedReader(new FileReader(instanceFile));
+	// predictionReader = new BufferedReader(
+	// new FileReader(predictionFile));
+	// String instanceLine = null;
+	// String predictionLine = null;
+	// int nLine = 0;
+	// // 1st line in libSVMOutputReader lists labels
+	//
+	// results.setClassIds(parseClassIds(predictionReader));
+	// // when working with high cutoffs resulting in mainly zero vectors
+	// // we sometimes have a trivial classification problem (1 class)
+	// // if (results.getClassIds().size() < 2)
+	// // throw new Exception("error parsing class ids");
+	// while (((instanceLine = instanceReader.readLine()) != null)
+	// && ((predictionLine = predictionReader.readLine()) != null)) {
+	// nLine++;
+	// ClassifierEvaluationResult result = new ClassifierEvaluationResult();
+	// listResults.add(result);
+	// String predictTokens[] = wsPattern.split(predictionLine);
+	// String classIdPredicted = predictTokens[0];
+	// String classIdTarget = extractFirstToken(instanceLine,
+	// wsPattern);
+	// result.setTargetClassId(Integer.parseInt(classIdTarget));
+	// result.setPredictedClassId(Integer.parseInt(classIdPredicted));
+	// if (predictTokens.length > 1) {
+	// double probabilities[] = new double[results.getClassIds()
+	// .size()];
+	// for (int i = 1; i < predictTokens.length; i++) {
+	// probabilities[i - 1] = Double
+	// .parseDouble(predictTokens[i]);
+	// }
+	// result.setProbabilities(probabilities);
+	// }
+	// }
+	// } finally {
+	// if (instanceReader != null) {
+	// try {
+	// instanceReader.close();
+	// } catch (Exception e) {
+	// System.err.println("testGramReader");
+	// e.printStackTrace(System.err);
+	// }
+	// }
+	// if (predictionReader != null) {
+	// try {
+	// predictionReader.close();
+	// } catch (Exception e) {
+	// e.printStackTrace(System.err);
+	// }
+	// }
+	// }
+	// return results;
+	// }
+
+	/**
+	 * parse class ids from first line in prediction file. this correspond to
+	 * probabilities
+	 * <p>
+	 * Exactly one line is consumed from the reader, whether or not it turns
+	 * out to be a "labels" line.
+	 * 
+	 * @param predictionReader
+	 *            reader positioned at the start of the prediction file
+	 * @return the class ids in the order listed on the labels line, or null if
+	 *         the file is empty or the first line is not a labels line
+	 * @throws IOException
+	 *             if reading the line fails
+	 */
+	protected List<Integer> parseClassIds(BufferedReader predictionReader)
+			throws IOException {
+		List<Integer> labels = null;
+		String labelLine = predictionReader.readLine();
+		if (labelLine == null) {
+			// empty prediction file: readLine() signals end of stream with
+			// null, which the matcher below cannot handle
+			return null;
+		}
+		Matcher labelMatcher = labelsPattern.matcher(labelLine);
+		if (labelMatcher.find()) {
+			String labelsA[] = wsPattern.split(labelMatcher.group(1));
+			if (labelsA != null && labelsA.length > 0) {
+				labels = new ArrayList<Integer>(labelsA.length);
+				for (String label : labelsA)
+					labels.add(Integer.parseInt(label));
+			}
+		}
+		return labels;
+	}
+
+	/**
+	 * Create a new SVMClassifierEvaluation and initialize it through the
+	 * private initClassifierEval overload.
+	 * 
+	 * @return the newly created, initialized evaluation
+	 */
+	protected SVMClassifierEvaluation initClassifierEval(String name,
+			String experiment, String label, String options,
+			String instanceIdFile) {
+		SVMClassifierEvaluation eval = new SVMClassifierEvaluation();
+		initClassifierEval(name, experiment, label, options, instanceIdFile,
+				eval);
+		return eval;
+	}
+
+	/**
+	 * Initialize the given evaluation: first via
+	 * initClassifierEvaluation(instanceIdFile, eval), then by setting name,
+	 * experiment and options.
+	 * <p>
+	 * NOTE(review): the label parameter is not used in this method;
+	 * presumably the label is derived inside initClassifierEvaluation -
+	 * confirm.
+	 */
+	private void initClassifierEval(String name, String experiment,
+			String label, String options, String instanceIdFile,
+			ClassifierEvaluation eval) {
+		initClassifierEvaluation(instanceIdFile, eval);
+		eval.setName(name);
+		eval.setExperiment(experiment);
+		eval.setOptions(options);
+	}
+
+	/**
+	 * parse predicted class ids, probabilities; correlate to target class ids
+	 * and instance ids.
+	 * <p>
+	 * The input data file and the prediction file are read in lock-step, one
+	 * line each per instance; reading stops as soon as either file is
+	 * exhausted. The first line of the prediction file is consumed by
+	 * parseClassIds before the per-instance lines are read.
+	 * 
+	 * @param predictionFile
+	 *            prediction (output)
+	 * @param instanceFile
+	 *            input data file; contains target class ids
+	 * @param props
+	 *            parse options; STORE_PROBABILITIES controls whether the class
+	 *            probabilities are stored
+	 * @param instanceIdFile
+	 *            instance ids corresponding to lines in input data file. May
+	 *            be null; then (and for lines beyond the end of the id list)
+	 *            the 0-based line number is used as instance id
+	 * @param eval
+	 *            evaluation that receives the parsed instance evaluations
+	 * @throws IOException
+	 *             if one of the files cannot be opened or read
+	 */
+	protected void parsePredictions(String predictionFile, String instanceFile,
+			Properties props, String instanceIdFile,
+			SVMClassifierEvaluation eval) throws IOException {
+		boolean storeProbabilities = YES.equalsIgnoreCase(props.getProperty(
+				ParseOption.STORE_PROBABILITIES.getOptionKey(),
+				ParseOption.STORE_PROBABILITIES.getDefaultValue()));
+		List<Long> instanceIds = null;
+		if (instanceIdFile != null)
+			instanceIds = parseInstanceIds(instanceIdFile);
+		BufferedReader instanceReader = null;
+		BufferedReader predictionReader = null;
+		try {
+			instanceReader = new BufferedReader(new FileReader(instanceFile));
+			predictionReader = new BufferedReader(
+					new FileReader(predictionFile));
+			String instanceLine = null;
+			String predictionLine = null;
+			int nLine = 0;
+			// 1st line in libSVMOutputReader lists class ids - parse them out
+			List<Integer> classIds = parseClassIds(predictionReader);
+			// iterate through input data file and output predictions
+			// simultaneously
+			while (((instanceLine = instanceReader.readLine()) != null)
+					&& ((predictionLine = predictionReader.readLine()) != null)) {
+				// get instance id corresponding to this line; without an id
+				// file (instanceIds == null) fall back to the line number
+				long instanceId = (instanceIds != null && instanceIds.size() > nLine) ? instanceIds
+						.get(nLine) : nLine;
+				nLine++;
+				// allocate instanceEval
+				ClassifierInstanceEvaluation instanceEval = new ClassifierInstanceEvaluation();
+				// parse out predicted class from output predictions
+				String predictTokens[] = wsPattern.split(predictionLine);
+				String classIdPredicted = predictTokens[0];
+				// parse out target class from input data file
+				String classIdTarget = extractFirstToken(instanceLine,
+						wsPattern);
+				instanceEval.setTargetClassId(Integer.parseInt(classIdTarget));
+				instanceEval.setPredictedClassId(Integer
+						.parseInt(classIdPredicted));
+				instanceEval.setInstanceId(instanceId);
+				instanceEval.setClassifierEvaluation(eval);
+				// add the instance to the map
+				eval.getClassifierInstanceEvaluations().put(instanceId,
+						instanceEval);
+				// parse class id probabilities: token i belongs to the
+				// (i-1)th class id of the labels line
+				if (storeProbabilities && predictTokens.length > 1) {
+					for (int i = 1; i < predictTokens.length; i++) {
+						instanceEval.getClassifierInstanceProbabilities().put(
+								classIds.get(i - 1),
+								Double.parseDouble(predictTokens[i]));
+					}
+				}
+			}
+		} finally {
+			// close both readers; a failure to close one must not prevent
+			// closing the other
+			if (instanceReader != null) {
+				try {
+					instanceReader.close();
+				} catch (Exception e) {
+					e.printStackTrace(System.err);
+				}
+			}
+			if (predictionReader != null) {
+				try {
+					predictionReader.close();
+				} catch (Exception e) {
+					e.printStackTrace(System.err);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Fill kernel, degree, weight, cost and gamma of the evaluation from the
+	 * libsvm command line, e.g.
+	 * 
+	 * <pre>
+	 * -q -b 1 -t 2 -w1 41 -g 1000 -c 1000 training_data_11_fold9_train.txt training_data_11_fold9_model.txt
+	 * </pre>
+	 * 
+	 * The kernel is set to 0 if it is still null after parsing the -t option.
+	 * 
+	 * @param eval
+	 *            evaluation to update
+	 * @param options
+	 *            command line options; if null the evaluation is left
+	 *            untouched
+	 */
+	protected void parseOptions(SVMClassifierEvaluation eval, String options) {
+		if (options == null) {
+			// nothing to parse
+			return;
+		}
+		eval.setKernel(parseIntOption(pKernel, options));
+		if (eval.getKernel() == null) {
+			// no kernel on the command line - fall back to 0
+			eval.setKernel(0);
+		}
+		eval.setDegree(parseIntOption(pDegree, options));
+		eval.setWeight(parseWeight(options));
+		eval.setCost(parseDoubleOption(pCost, options));
+		eval.setGamma(parseDoubleOption(pGamma, options));
+	}
+
+	/**
+	 * parse the weight options out of the libsvm command line. they are of the
+	 * form -w0 1 -w2 1.5 ...
+	 * 
+	 * @param options
+	 * @return null if no weight options, else weight options
+	 */
+	private String parseWeight(String options) {
+		StringBuilder bWeight = new StringBuilder();
+		Matcher m = pWeight.matcher(options);
+		boolean bWeightParam = false;
+		while (m.find()) {
+			bWeightParam = true;
+			bWeight.append(m.group()).append(" ");
+		}
+		if (bWeightParam)
+			return bWeight.toString();
+		else
+			return null;
+	}
+
+	/**
+	 * parse directory. The following files are expected in
+	 * <code>outputDir</code>:
+	 * <ul>
+	 * <li>model.txt - libsvm model file
+	 * <li>options.properties - properties file with needed parameter settings
+	 * (see ParseOption)
+	 * <li>predict.txt - predictions on test set
+	 * </ul>
+	 * If checkFileRead fails for any of them, nothing is parsed or stored.
+	 * 
+	 * @param dataDir
+	 *            directory with the reference data
+	 * @param outputDir
+	 *            directory with the libsvm output
+	 * @throws IOException
+	 *             propagated from parsing or storing the results
+	 */
+	@Override
+	public void parseDirectory(File dataDir, File outputDir) throws IOException {
+		final String outputPath = outputDir.getPath();
+		final String modelFile = outputPath + File.separator + "model.txt";
+		final String predictFile = outputPath + File.separator + "predict.txt";
+		final String propertiesFile = outputPath + File.separator
+				+ "options.properties";
+		final boolean allReadable = checkFileRead(modelFile)
+				&& checkFileRead(predictFile) && checkFileRead(propertiesFile);
+		if (!allReadable) {
+			return;
+		}
+		// options.properties drives both parsing and storing
+		final Properties options = this.loadProps(outputDir);
+		final SVMClassifierEvaluation evaluation = new SVMClassifierEvaluation();
+		evaluation.setAlgorithm("libsvm");
+		// parse first, then persist
+		parseResults(dataDir, outputDir, modelFile, predictFile, evaluation,
+				options);
+		storeResults(dataDir, options, evaluation);
+	}
+
+	/**
+	 * store the parsed classifier evaluation via the classifier evaluation
+	 * dao.
+	 * 
+	 * @param dataDir
+	 *            directory from which the class id map for the evaluation's
+	 *            label is loaded (loadClassIdMap)
+	 * @param props
+	 *            parse options; the STORE_INSTANCE_EVAL option (compared
+	 *            case-insensitively with YES, using the option's default if
+	 *            unset) is passed to the dao as a flag
+	 * @param eval
+	 *            evaluation to save
+	 * @throws IOException
+	 */
+	protected void storeResults(File dataDir, Properties props,
+			SVMClassifierEvaluation eval) throws IOException {
+		// store the classifier evaluation
+		getClassifierEvaluationDao().saveClassifierEvaluation(
+				eval,
+				this.loadClassIdMap(dataDir, eval.getLabel()),
+				YES.equalsIgnoreCase(props.getProperty(
+						ParseOption.STORE_INSTANCE_EVAL.getOptionKey(),
+						ParseOption.STORE_INSTANCE_EVAL.getDefaultValue())));
+	}
+
+	/**
+	 * parse the results in the specified output dir. use reference data from
+	 * dataDir.
+	 * 
+	 * @param dataDir
+	 * @param outputDir
+	 * @param model
+	 * @param predict
+	 * @param eval
+	 * @param props
+	 * @throws IOException
+	 */
+	protected void parseResults(File dataDir, File outputDir, String model,
+			String predict, SVMClassifierEvaluation eval, Properties props)
+			throws IOException {
+		// initialize common properties
+		initClassifierEvaluationFromProperties(props, eval);
+		// parse number of support vectors from model
+		eval.setSupportVectors(this.parseModel(model));
+		// parse options from command line
+		parseOptions(eval,
+				props.getProperty(ParseOption.EVAL_LINE.getOptionKey()));
+		// parse fold, run, label from file base name
+		String fileBaseName = this.getFileBaseName(props);
+		initClassifierEvaluation(fileBaseName, eval);
+		// parse predictions
+		String instanceIdFile = dataDir + File.separator + fileBaseName
+				+ "test_id.txt";
+		String instanceFile = dataDir + File.separator + fileBaseName
+				+ "test_data.txt";
+		this.parsePredictions(predict, instanceFile, props, instanceIdFile,
+				eval);
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,24 @@
+package org.apache.ctakes.ytex.libsvm;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+
+/**
+ * Helpers for loading instance class labels and for writing out instance ids.
+ */
+public interface LibSVMUtil {
+
+	/**
+	 * @param strQuery
+	 *            query to get instance id - class label
+	 * @param labels
+	 *            fill with distinct labels
+	 * @return Map[Instance ID, Map[Class Label, Class Id]]
+	 */
+	public abstract SortedMap<Integer, Map<String, Integer>> loadClassLabels(
+			String strQuery, final Set<String> labels);
+	
+	/**
+	 * Write out the instance ids of the given instance map.
+	 * 
+	 * @param outdir
+	 *            output directory
+	 * @param trainInstanceLabelMap
+	 *            Map[Instance ID, Map[Class Label, Class Id]], as returned by
+	 *            loadClassLabels
+	 * @param string
+	 *            NOTE(review): meaning not documented; presumably a file type
+	 *            tag such as "training" or "test" - confirm against the
+	 *            implementation
+	 * @throws IOException
+	 */
+	public void outputInstanceIds(String outdir,
+			SortedMap<Integer, Map<String, Integer>> trainInstanceLabelMap,
+			String string) throws IOException;
+
+}
\ No newline at end of file



Mime
View raw message