Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 76ECA10EF5 for ; Mon, 16 Dec 2013 16:34:01 +0000 (UTC) Received: (qmail 82159 invoked by uid 500); 16 Dec 2013 16:33:20 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 81921 invoked by uid 500); 16 Dec 2013 16:33:06 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 81565 invoked by uid 99); 16 Dec 2013 16:32:39 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 16 Dec 2013 16:32:39 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 16 Dec 2013 16:32:17 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 89A962388C7D; Mon, 16 Dec 2013 16:30:59 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1551254 [23/26] - in /ctakes/branches/ytex: ctakes-ytex-res/ ctakes-ytex-res/.settings/ ctakes-ytex-res/src/ ctakes-ytex-res/src/main/ ctakes-ytex-res/src/main/resources/ ctakes-ytex-res/src/main/resources/org/ ctakes-ytex-res/src/main/res... 
Date: Mon, 16 Dec 2013 16:30:40 -0000 To: commits@ctakes.apache.org From: vjapache@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20131216163059.89A962388C7D@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/pagerank/PageRankServiceImpl.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,524 @@ +package org.apache.ctakes.ytex.kernel.pagerank; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.OptionGroup; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.ctakes.ytex.kernel.KernelContextHolder; +import org.apache.ctakes.ytex.kernel.dao.ConceptDao; +import org.apache.ctakes.ytex.kernel.model.ConcRel; +import org.apache.ctakes.ytex.kernel.model.ConceptGraph; + + +public class PageRankServiceImpl implements PageRankService { + private static final Log log = LogFactory.getLog(PageRankServiceImpl.class); + + private double[] rankInternal(Map dampingVector, + ConceptGraph cg, int iter, double threshold, 
double dampingFactor) { + Map scoreMapCurrent = dampingVector; + double N = (double) cg.getConceptList().size(); + double scoresCurrent[] = new double[cg.getConceptList().size()]; + double diff = 1d; + for (int i = 0; i < iter; i++) { + double[] scoresOld = scoresCurrent; + long timeBegin = 0; + if (log.isDebugEnabled()) { + timeBegin = System.currentTimeMillis(); + } + scoresCurrent = pagerankIter(scoresOld, dampingVector, cg, + dampingFactor, N); + if (log.isDebugEnabled()) { + log.debug("iter " + i + " " + + Long.toString(System.currentTimeMillis() - timeBegin)); + } + if ((diff = difference(scoresOld, scoresCurrent)) <= threshold) + break; + } + if (log.isDebugEnabled() && diff > threshold) { + log.debug("did not converge, diff = " + diff + ", dampingVector = " + + dampingVector); + } + return scoresCurrent; + } + + /** + * difference between 2 vectors + * + * @param a + * @param b + * @return a-b + */ + private double difference(Map a, Map b) { + double diff = 0d; + for (Map.Entry aiEntry : a.entrySet()) { + Double bi = b.get(aiEntry.getKey()); + diff += Math.pow( + aiEntry.getValue() - (bi != null ? 
bi.doubleValue() : 0d), + 2); + } + for (Map.Entry biEntry : b.entrySet()) { + if (!a.containsKey(biEntry.getKey())) { + diff += Math.pow(biEntry.getValue(), 2); + } + } + return diff; + } + + /** + * + * @param u + * @param v + * @return norm(u-v) + */ + private double difference(double[] u, double[] v) { + double diff = 0d; + for (int i = 0; i < u.length; i++) { + double d = (u[i] - v[i]); + diff += d * d; + } + return Math.sqrt(diff); + } + + private double cosine(double[] u, double[] v) { + double uu = 0; + double vv = 0; + double uv = 0; + for (int i = 0; i < u.length; i++) { + uu += u[i] * u[i]; + vv += v[i] * v[i]; + uv += u[i] * v[i]; + } + return uv / Math.sqrt(uu * vv); + } + + public double[] pagerankIter(double[] currentScores, + Map dampingVector, ConceptGraph cg, + double dampingFactor, double N, Set activeNodes) { + double newScores[] = new double[(int) N]; + Arrays.fill(newScores, 0d); + Integer[] activeNodeArr = new Integer[activeNodes.size()]; + activeNodes.toArray(activeNodeArr); + for (int index : activeNodeArr) { + // pagerank with non-uniform damping vector (topic vector). + // because of the non-uniform damping vector, few nodes will have a + // non-zero pagerank. + // optimized so that we only iterate over nodes with non-zero + // pagerank. + // propagate from non-zero nodes to linked nodes + // we assume currentScores is non-null - it is initialized to the + // damping vector. + // iterate over nodes that have a pagerank, and propagate the + // pagerank to out-links. 
+ // pagerank + double score = currentScores[index]; + // get concept id + ConcRel cr = cg.getConceptList().get(index); + // get number of out-links + double nOutlinks = (double) cr.getChildren().size(); + if (nOutlinks > 0) { + // propagate pagerank to out-links (children) + for (ConcRel crOut : cr.getChildren()) { + int targetIndex = crOut.getNodeIndex(); + // get current pagerank value for target page + double childScore = newScores[targetIndex]; + // add the pagerank/|links| + childScore += (score / nOutlinks); + newScores[targetIndex] = childScore; + activeNodes.add(targetIndex); + } + } + } + // we just added the contribution of pages to newScores sum(score). + // adjust: convert to (d)*sum(score) + (1-d)*v_i + for (int index : activeNodes) { + // personalized pagerank + double adjusted = (newScores[index] * dampingFactor); + // v_i + Double v_i = dampingVector.get(index); + // 1-c * v_i + if (v_i != null) + adjusted += v_i; + newScores[index] = adjusted; + } + return newScores; + } + + public double[] pagerankIter(double[] currentScores, + Map dampingVector, ConceptGraph cg, + double dampingFactor, double N) { + double newScores[] = new double[(int) N]; + double jump = ((1 - dampingFactor) / N); + for (int i = 0; i < currentScores.length; i++) { + double score = 0d; + ConcRel c = cg.getConceptList().get(i); + // get nodes pointing at node c + for (int parentIndex : c.getParentsArray()) { + ConcRel p = cg.getConceptList().get(parentIndex); + // get the pagerank for node p which is pointing at c + // if this is the first iteration, currentScores is null so + // use the initial pagerank + double prIn = currentScores[parentIndex]; + // add the pagerank divided by the number of nodes p is + // pointing at + score += (prIn / (double) p.getChildrenArray().length); + } + if (dampingVector == null) { + // uniform damping + newScores[i] = (score * dampingFactor) + jump; + } else { + // personalized pagerank + double adjusted = (score * dampingFactor); + // get the 
random jump for this node + Double v_i = dampingVector.get(i); + // if not null, add it + if (v_i != null) + adjusted += v_i; + newScores[i] = adjusted; + } + } + return newScores; + } + + @Override + public double[] rank2(Map dampingVector, ConceptGraph cg, + int iter, double threshold, double dampingFactor) { + double N = (double) cg.getConceptMap().size(); + double scoresCurrent[] = new double[cg.getConceptMap().size()]; + Map dampingVectorAdj = null; + // Set activeNodes = null; + if (dampingVector != null) { + // for personalized page rank, put together a map of possibilities + // of randomly jumping to a specific node + dampingVectorAdj = new HashMap( + dampingVector.size()); + // // initialize set of active nodes + // activeNodes = new HashSet(dampingVector.keySet()); + Arrays.fill(scoresCurrent, 0d); + for (Map.Entry dvEntry : dampingVector.entrySet()) { + // set the random jump for the node + dampingVectorAdj.put(dvEntry.getKey(), dvEntry.getValue() + * (1 - dampingFactor)); + // set the initial weight for the node + scoresCurrent[dvEntry.getKey()] = dvEntry.getValue(); + } + } else { + // for static page rank, all nodes have same weight initially + Arrays.fill(scoresCurrent, 1d / N); + } + double diff = 1d; + for (int i = 0; i < iter; i++) { + double scoresOld[] = scoresCurrent; + long timeBegin = 0; + if (log.isDebugEnabled()) { + timeBegin = System.currentTimeMillis(); + } + // if (activeNodes == null) { + scoresCurrent = pagerankIter(scoresCurrent, dampingVectorAdj, cg, + dampingFactor, N); + // } else { + // scoresCurrent = pagerankIter(scoresCurrent, dampingVectorAdj, + // cg, dampingFactor, N, activeNodes); + // } + if (log.isDebugEnabled()) { + log.debug("iter " + i + " time(ms) " + + Long.toString(System.currentTimeMillis() - timeBegin)); + } + if ((diff = difference(scoresCurrent, scoresOld)) <= threshold) + break; + } + if (log.isDebugEnabled() && diff > threshold) { + log.debug("did not converge, diff = " + diff + ", dampingVector = " + + 
dampingVector); + } + return scoresCurrent; + } + + /** + * perform one iteration of pagerank + * + * @param currentScores + * @param cg + * @return + */ + public Map pagerankIter( + Map currentScores, + Map dampingVector, ConceptGraph cg, + double dampingFactor, double N) { + Map newScores = new HashMap(); + if (dampingVector == null) { + // the constant probability of randomly surfing into this node, + // adjusted by damping factor + double jump = ((1 - dampingFactor) / N); + double initialValue = 1 / N; + // the basic pagerank iteration with uniform damping vector + // iterate over all nodes + for (ConcRel c : cg.getConceptList()) { + double score = 0d; + // get nodes pointing at node c + for (ConcRel in : c.getParents()) { + // get the pagerank for node p which is pointing at c + // if this is the first iteration, currentScores is null so + // use the initial pagerank + double prIn = currentScores == null ? initialValue + : currentScores.get(in.getNodeIndex()); + // add the pagerank divided by the number of nodes p is + // pointing at + score += (prIn / (double) in.getChildren().size()); + } + // adjust for uniform damping + double adjusted = (score * dampingFactor) + jump; + newScores.put(c.getNodeIndex(), adjusted); + } + // for (ConcRel c : cg.getConceptMap().values()) { + // double score = 0d; + // // get nodes pointing at node c + // for (ConcRel in : c.getParents()) { + // // get the pagerank for node p which is pointing at c + // // if this is the first iteration, currentScores is null so + // // use the initial pagerank + // double prIn = currentScores == null ? 
initialValue + // : currentScores.get(in.getConceptID()); + // // add the pagerank divided by the number of nodes p is + // // pointing at + // score += (prIn / (double) in.getChildren().size()); + // } + // // adjust for uniform damping + // double adjusted = (score * dampingFactor) + jump; + // newScores.put(c.getConceptID(), adjusted); + // } + } else { + // pagerank with non-uniform damping vector (topic vector). + // because of the non-uniform damping vector, few nodes will have a + // non-zero pagerank. + // optimized so that we only iterate over nodes with non-zero + // pagerank. + // propagate from non-zero nodes to linked nodes + // we assume currentScores is non-null - it is initialized to the + // damping vector. + // iterate over nodes that have a pagerank, and propagate the + // pagerank to out-links. + for (Map.Entry scoreEntry : currentScores + .entrySet()) { + // page (concept id) + Integer index = scoreEntry.getKey(); + // pagerank + double score = scoreEntry.getValue(); + // get concept id + ConcRel cr = cg.getConceptList().get(index); + // get number of out-links + double nOutlinks = (double) cr.getChildren().size(); + if (nOutlinks > 0) { + // propagate pagerank to out-links (children) + for (ConcRel crOut : cr.getChildren()) { + // get current pagerank value for target page + double childScore = 0d; + Double childScoreD = newScores + .get(crOut.getNodeIndex()); + if (childScoreD != null) + childScore = childScoreD.doubleValue(); + // add the pagerank/|links| + childScore += (score / nOutlinks); + newScores.put(crOut.getNodeIndex(), childScore); + } + } + } + // we just added the contribution of pages to newScores sum(score). + // adjust: convert to (d)*sum(score) + (1-d)*v_i + for (Map.Entry scoreEntry : newScores.entrySet()) { + // v_i + Double v_i = dampingVector.get(scoreEntry.getKey()); + // 1-c * v_i + double v_i_adj = v_i != null ? 
v_i * (1 - dampingFactor) : 0d; + double adjusted = (scoreEntry.getValue() * dampingFactor) + + v_i_adj; + scoreEntry.setValue(adjusted); + } + // + // + // for (Map.Entry scoreEntry : currentScores + // .entrySet()) { + // // page (concept id) + // String page = scoreEntry.getKey(); + // // pagerank + // double score = scoreEntry.getValue(); + // // get concept id + // ConcRel cr = cg.getConceptMap().get(page); + // // get number of out-links + // double nOutlinks = (double) cr.getChildren().size(); + // if (nOutlinks > 0) { + // // propagate pagerank to out-links (children) + // for (ConcRel crOut : cr.getChildren()) { + // // get current pagerank value for target page + // double childScore = 0d; + // Double childScoreD = newScores + // .get(crOut.getConceptID()); + // if (childScoreD != null) + // childScore = childScoreD.doubleValue(); + // // add the pagerank/|links| + // childScore += (score / nOutlinks); + // newScores.put(crOut.getConceptID(), childScore); + // } + // } + // } + // // we just added the contribution of pages to newScores + // sum(score). + // // adjust: convert to (d)*sum(score) + (1-d)*v_i + // for (Map.Entry scoreEntry : newScores.entrySet()) + // { + // // v_i + // Double v_i = dampingVector.get(scoreEntry.getKey()); + // // 1-c * v_i + // double v_i_adj = v_i != null ? 
v_i * (1 - dampingFactor) : 0d; + // double adjusted = (scoreEntry.getValue() * dampingFactor) + // + v_i_adj; + // scoreEntry.setValue(adjusted); + // } + } + return newScores; + } + + /** + * compute similarity using personalized page rank, as documented in Exploring Knowledge Bases for Similarity + * + * @param concept1 + * @param concept2 + * @param cg + * @param iter + * @param threshold + * @param dampingFactor + * @return + */ + @Override + public double sim(String concept1, String concept2, ConceptGraph cg, + int iter, double threshold, double dampingFactor) { + Map c1dv = new HashMap(1); + ConcRel c1 = cg.getConceptMap().get(concept1); + ConcRel c2 = cg.getConceptMap().get(concept2); + if (c1 == null || c2 == null) + return 0d; + c1dv.put(c1.getNodeIndex(), 1d); + double[] c1pr = this.rank2(c1dv, cg, iter, threshold, dampingFactor); + Map c2dv = new HashMap(1); + c2dv.put(c2.getNodeIndex(), 1d); + double[] c2pr = this.rank2(c2dv, cg, iter, threshold, dampingFactor); + return cosine(c1pr, c2pr); + } + + /** + * cosine of two vectors + * + * @param u + * @param v + * @return + */ + private double cosine(Map u, Map v) { + double uu = 0d; + double uv = 0d; + double vv = 0d; + if (u.isEmpty() || v.isEmpty()) + return 0d; + // in this loop compute u*u, and u*v + for (Map.Entry uEntry : u.entrySet()) { + double ui = uEntry.getValue(); + T uC = uEntry.getKey(); + uu += ui * ui; + Double vi = v.get(uC); + if (vi != null) + uv += ui * vi.doubleValue(); + } + if (uv == 0) + return 0d; + // in this loop, compute v*v + for (double vi : v.values()) { + vv += vi * vi; + } + // u*v/sqrt(v*v)*sqrt(u*u) + return uv / Math.sqrt(vv * uu); + } + + public static void main(String args[]) { + Options options = new Options(); + OptionGroup og = new OptionGroup(); + og.addOption(OptionBuilder + .withArgName("concept1,concept2") + .hasArg() + .withDescription( + "compute similarity for specified concept pair") + .create("sim")); + og.addOption(OptionBuilder + 
.withArgName("concept1,concept2,...") + .hasArg() + .withDescription( + "personalized pagerank vector for specified concepts ") + .create("ppr")); + og.setRequired(true); + options.addOptionGroup(og); + try { + CommandLineParser parser = new GnuParser(); + CommandLine line = parser.parse(options, args); + Properties ytexProps = new Properties(); + ytexProps.putAll((Properties) KernelContextHolder + .getApplicationContext().getBean("ytexProperties")); + ytexProps.putAll(System.getProperties()); + ConceptDao conceptDao = KernelContextHolder.getApplicationContext() + .getBean(ConceptDao.class); + PageRankService pageRankService = KernelContextHolder + .getApplicationContext().getBean(PageRankService.class); + ConceptGraph cg = conceptDao.getConceptGraph(ytexProps + .getProperty("org.apache.ctakes.ytex.conceptGraphName")); + if (line.hasOption("sim")) { + String cs = line.getOptionValue("sim"); + String concept[] = cs.split(","); + System.out.println(pageRankService.sim(concept[0], concept[1], + cg, 30, 1e-4, 0.85)); + } else if (line.hasOption("ppr")) { + String cs = line.getOptionValue("ppr"); + String concept[] = cs.split(","); + double weight = 1 / (double) concept.length; + Map ppv = new HashMap(); + for (String c : concept) { + ppv.put(c, weight); + } + System.out.println(pageRankService.rank(ppv, cg)); + } + } catch (ParseException pe) { + HelpFormatter formatter = new HelpFormatter(); + formatter + .printHelp( + "java " + + PageRankServiceImpl.class.getName() + + " compute personalized page rank or similarity. 
used for testing purposes", + options); + } + + } + + @Override + public double[] rank(Map dampingVector, ConceptGraph cg, + int iter, double threshold, double dampingFactor) { + // TODO Auto-generated method stub + return null; + } + + @Override + public double[] rank(Map dampingVector, ConceptGraph cg) { + // TODO Auto-generated method stub + return null; + } +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilder.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,20 @@ +package org.apache.ctakes.ytex.kernel.tree; + +import java.io.IOException; +import java.util.Map; + +public interface InstanceTreeBuilder { + + /** + * Generate trees from the results of a sorted query + * + */ + public Map loadInstanceTrees(TreeMappingInfo mappingInfo); + + public abstract void serializeInstanceTrees(TreeMappingInfo mappingInfo, String filename) + throws IOException; + + public abstract Map loadInstanceTrees(String filename) throws IOException, + ClassNotFoundException; + +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java?rev=1551254&view=auto ============================================================================== --- 
ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/InstanceTreeBuilderImpl.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,210 @@ +package org.apache.ctakes.ytex.kernel.tree; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import javax.sql.DataSource; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.jdbc.core.simple.SimpleJdbcTemplate; + +public class InstanceTreeBuilderImpl implements InstanceTreeBuilder { + static final Log log = LogFactory.getLog(InstanceTreeBuilderImpl.class); + SimpleJdbcTemplate simpleJdbcTemplate; + private DataSource dataSource; + + public DataSource getDataSource() { + return dataSource; + } + + public void setDataSource(DataSource dataSource) { + this.dataSource = dataSource; + this.simpleJdbcTemplate = new SimpleJdbcTemplate(dataSource); + } + + Node nodeFromRow(NodeMappingInfo nodeInfo, Map nodeValues) { + Node n = null; + Map values = new HashMap( + nodeInfo.getValues().size()); + for (String valueName : nodeInfo.getValues()) { + if (nodeValues.containsKey(valueName) + && nodeValues.get(valueName) != null) { + values.put(valueName, (Serializable) nodeValues.get(valueName)); + } + } + // make sure there is something to put in + if (!values.isEmpty()) { + n = new Node(); + n.setType(nodeInfo.getNodeType()); + n.setValue(values); + } + return n; + } + + @SuppressWarnings("unchecked") + @Override + public Map 
loadInstanceTrees(String filename) + throws IOException, ClassNotFoundException { + ObjectInputStream os = null; + try { + os = new ObjectInputStream(new BufferedInputStream( + new FileInputStream(filename))); + return (Map) os.readObject(); + } finally { + if (os != null) + os.close(); + } + } + + @Override + public void serializeInstanceTrees(TreeMappingInfo mappingInfo, + String filename) throws IOException { + ObjectOutputStream os = null; + try { + os = new ObjectOutputStream(new BufferedOutputStream( + new FileOutputStream(filename))); + os.writeObject(loadInstanceTrees(mappingInfo)); + } finally { + if (os != null) + os.close(); + } + } + + public Map loadInstanceTrees(TreeMappingInfo mappingInfo) { + Map nodeKeyMap = new HashMap(); + this.prepare(mappingInfo.getPrepareScript(), mappingInfo.getPrepareScriptStatementDelimiter()); + Map instanceMap = loadInstanceTrees( + mappingInfo.getInstanceIDField(), + mappingInfo.getInstanceQueryMappingInfo(), nodeKeyMap); + if (mappingInfo.getNodeQueryMappingInfos() != null) { + for (QueryMappingInfo qInfo : mappingInfo + .getNodeQueryMappingInfos()) { + this.addChildrenToNodes(nodeKeyMap, qInfo); + } + } + return instanceMap; + } + + + /** + * run 'preparation' statements. These may e.g. create temporary tables in the database. 
+ * @param prepareStatementList + */ + protected void prepare(String prepareScript, String prepareScriptDelimiter) { + if(prepareScript != null && prepareScript.length() > 0) { + String[] statements = prepareScript.split(prepareScriptDelimiter); + List listStatements = new ArrayList(statements.length); + // throw out empty lines + for(String sql : statements) { + if(sql != null && sql.trim().length() > 0) + listStatements.add(sql); + } + JdbcTemplate jt = new JdbcTemplate(this.getDataSource()); + jt.batchUpdate(listStatements.toArray(new String[]{})); + } + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.ctakes.ytex.kernel.tree.InstanceTreeBuilder#loadInstanceTrees(java.util.List, + * java.lang.String, java.lang.String, java.util.Map) + */ + protected Map loadInstanceTrees(String instanceIDField, + QueryMappingInfo qInfo, Map nodeKeyMap) { + Node[] currentPath = new Node[qInfo.getNodeTypes().size()]; + Map instanceMap = new HashMap(); + List> rowData = simpleJdbcTemplate.queryForList( + qInfo.getQuery(), qInfo.getQueryArgs()); + for (Map row : rowData) { + for (int i = 0; i < qInfo.getNodeTypes().size(); i++) { + Node newNode = this.nodeFromRow(qInfo.getNodeTypes().get(i), + row); + if (newNode != null) { + if (!newNode.equals(currentPath[i])) { + if (i > 0) { + // add the node to the parent + currentPath[i - 1].getChildren().add(newNode); + } else { + // this is a new root, i.e. 
a new instance + // add it to the instance map + instanceMap.put(((Number) row.get(instanceIDField)).longValue(), + newNode); + } + // put the new node in the path + // we don't really care about nodes 'after' this one in + // the path list + // because we only add to parents, not to children + currentPath[i] = newNode; + if (nodeKeyMap != null) + nodeKeyMap.put(new NodeKey(newNode), newNode); + } + } + } + } + return instanceMap; + } + + public void addChildrenToNodes(Map nodeKeyMap, + QueryMappingInfo qInfo) { + // run query + List> rowData = simpleJdbcTemplate.queryForList( + qInfo.getQuery(), qInfo.getQueryArgs()); + // iterate through rows, adding nodes as children of existing nodes + for (Map row : rowData) { + // allocate array for holding node path corresponding to row + Node[] currentPath = new Node[qInfo.getNodeTypes().size()]; + // get the root of this subtree - temporary node contains values + Node parentTmp = nodeFromRow(qInfo.getNodeTypes().get(0), row); + if (parentTmp != null) { + // get the node from the tree that correponds to this node + Node parent = nodeKeyMap.get(new NodeKey(parentTmp)); + if (parent == null) { + if (log.isWarnEnabled()) { + log.warn("couldn't find node for key: " + parentTmp); + } + } else { + // found the parent - add the subtree + currentPath[0] = parent; + for (int i = 1; i < qInfo.getNodeTypes().size(); i++) { + Node newNode = this.nodeFromRow(qInfo.getNodeTypes() + .get(i), row); + if (newNode != null) { + if (!newNode.equals(currentPath[i])) { + // null out everything after this index in the path + Arrays.fill(currentPath, i, + currentPath.length - 1, null); + // add the node to the parent + currentPath[i - 1].getChildren().add(newNode); + // put the new node in the path + // we don't really care about nodes 'after' this + // one in the path list + // because we only add to parents, not to + // children + currentPath[i] = newNode; + if (nodeKeyMap != null) + nodeKeyMap.put(new NodeKey(newNode), + newNode); + } + } + } + 
} + } + } + } + +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/Node.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,94 @@ +package org.apache.ctakes.ytex.kernel.tree; + +import java.io.Serializable; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +public class Node implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + + @Override + public String toString() { + return "Node [type=" + type + ", value=" + value + "]"; + } + + private String type; + private Map value; + private List children = new LinkedList(); + + /** + * Caching the norm externally, e.g. in EHCache involves too much additional + * overhead. Therefore, save the norm in this object. This shouldn't cause + * problems in a multi-threaded environment, as long as the kernel is the + * same - the value of the norm will be the same across evaluations. + */ + private transient Double norm; + + public Double getNorm() { + return norm; + } + + public void setNorm(Double norm) { + this.norm = norm; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((type == null) ? 0 : type.hashCode()); + result = prime * result + ((value == null) ? 
0 : value.hashCode()); + return result; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public Map getValue() { + return value; + } + + public void setValue(Map value) { + this.value = value; + } + + public List getChildren() { + return children; + } + + public void setChildren(List children) { + this.children = children; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + Node other = (Node) obj; + if (type == null) { + if (other.type != null) + return false; + } else if (!type.equals(other.type)) + return false; + if (value == null) { + if (other.value != null) + return false; + } else if (!value.equals(other.value)) + return false; + return true; + } + +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeKey.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,45 @@ +package org.apache.ctakes.ytex.kernel.tree; + +import java.io.Serializable; +import java.util.Map; + +public class NodeKey { + public String type; + public Map value; + + public NodeKey(Node node) { + type = node.getType(); + value = node.getValue(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((type == null) ? 0 : type.hashCode()); + result = prime * result + ((value == null) ? 
0 : value.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + NodeKey other = (NodeKey) obj; + if (type == null) { + if (other.type != null) + return false; + } else if (!type.equals(other.type)) + return false; + if (value == null) { + if (other.value != null) + return false; + } else if (!value.equals(other.value)) + return false; + return true; + } +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/NodeMappingInfo.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,34 @@ +package org.apache.ctakes.ytex.kernel.tree; + +import java.util.Set; + +public class NodeMappingInfo { + private String nodeType; + + private Set values; + + public NodeMappingInfo() { + super(); + } + + public NodeMappingInfo(String nodeType, Set values) { + super(); + this.nodeType = nodeType; + this.values = values; + } + + public String getNodeType() { + return nodeType; + } + + public Set getValues() { + return values; + } + public void setNodeType(String nodeType) { + this.nodeType = nodeType; + } + + public void setValues(Set values) { + this.values = values; + } +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java URL: 
http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/QueryMappingInfo.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,29 @@ +package org.apache.ctakes.ytex.kernel.tree; + +import java.util.List; +import java.util.Map; + +public class QueryMappingInfo { + String query; + Map queryArgs; + List nodeTypes; + public String getQuery() { + return query; + } + public void setQuery(String query) { + this.query = query; + } + public Map getQueryArgs() { + return queryArgs; + } + public void setQueryArgs(Map queryArgs) { + this.queryArgs = queryArgs; + } + public List getNodeTypes() { + return nodeTypes; + } + public void setNodeTypes(List nodeTypes) { + this.nodeTypes = nodeTypes; + } + +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/tree/TreeMappingInfo.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,56 @@ +package org.apache.ctakes.ytex.kernel.tree; + +import java.util.ArrayList; +import java.util.List; + +public class TreeMappingInfo { + String instanceIDField; + QueryMappingInfo instanceQueryMappingInfo; + List nodeQueryMappingInfos = new ArrayList(); + String prepareScript; + String prepareScriptStatementDelimiter = 
";"; + + + public String getPrepareScript() { + return prepareScript; + } + + public void setPrepareScript(String prepareScript) { + this.prepareScript = prepareScript; + } + + public String getPrepareScriptStatementDelimiter() { + return prepareScriptStatementDelimiter; + } + + public void setPrepareScriptStatementDelimiter( + String prepareScriptStatementDelimiter) { + this.prepareScriptStatementDelimiter = prepareScriptStatementDelimiter; + } + + public String getInstanceIDField() { + return instanceIDField; + } + + public void setInstanceIDField(String instanceIDField) { + this.instanceIDField = instanceIDField; + } + + public QueryMappingInfo getInstanceQueryMappingInfo() { + return instanceQueryMappingInfo; + } + + public void setInstanceQueryMappingInfo( + QueryMappingInfo instanceQueryMappingInfo) { + this.instanceQueryMappingInfo = instanceQueryMappingInfo; + } + + public List getNodeQueryMappingInfos() { + return nodeQueryMappingInfos; + } + + public void setNodeQueryMappingInfos( + List nodeQueryMappingInfos) { + this.nodeQueryMappingInfos = nodeQueryMappingInfos; + } +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguator.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,44 @@ +package org.apache.ctakes.ytex.kernel.wsd; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityService.SimilarityMetricEnum; + + +public interface WordSenseDisambiguator { + + 
public abstract String disambiguate(List> sentenceConcepts, + int index, Set contextConcepts, int windowSize, + SimilarityMetricEnum metric, Map scoreMap); + + /** + * Disambiguate a named entity. + * + * @param sentenceConcepts + * named entities from the document, represented as list of + * sets of concept ids + * @param index + * index of target named entity to disambiguate + * @param contextConcepts + * context concepts, e.g. from title + * @param windowSize + * number of named entities on either side of target to use for + * disambiguation + * @param metric + * metric to use + * @param scoreMap + * optional to get the scores assigned to each concept + * @param weighted + * to weight context concepts by frequency + * @return highest scoring concept, or null if none of the target concepts + * are in the concept graph, or if all the target concepts have the + * same score + */ + String disambiguate(List> sentenceConcepts, int index, + Set contextConcepts, int windowSize, + SimilarityMetricEnum metric, Map scoreMap, + boolean weighted); + +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/wsd/WordSenseDisambiguatorImpl.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,171 @@ +package org.apache.ctakes.ytex.kernel.wsd; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; 
+import java.util.TreeMap; + +import org.apache.ctakes.ytex.kernel.metric.ConceptPairSimilarity; +import org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityService; +import org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityService.SimilarityMetricEnum; +import org.apache.ctakes.ytex.kernel.model.ConcRel; + +import com.google.common.collect.SetMultimap; +import com.google.common.collect.TreeMultimap; + +public class WordSenseDisambiguatorImpl implements WordSenseDisambiguator { + ConceptSimilarityService conceptSimilarityService; + + public ConceptSimilarityService getConceptSimilarityService() { + return conceptSimilarityService; + } + + public void setConceptSimilarityService( + ConceptSimilarityService conceptSimilarityService) { + this.conceptSimilarityService = conceptSimilarityService; + } + + @Override + public String disambiguate(List> sentenceConcepts, int index, + Set contextConcepts, int windowSize, + SimilarityMetricEnum metric, Map scoreMap) { + return disambiguate(sentenceConcepts, index, contextConcepts, + windowSize, metric, scoreMap, true); + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.ctakes.ytex.kernel.wsd.WordSenseDisambiguator#disambiguate + * (java.util.List, int, java.util.Set, int, + * org.apache.ctakes.ytex.kernel.ConceptSimilarityService + * .SimilarityMetricEnum, java.util.Map) + */ + @Override + public String disambiguate(List> sentenceConcepts, int index, + Set contextConcepts, int windowSize, + SimilarityMetricEnum metric, Map scoreMap, + boolean weighted) { + // get the candidate concepts that we want to disambiguate + Set candidateConcepts = sentenceConcepts.get(index); + if (candidateConcepts.size() == 1) + return candidateConcepts.iterator().next(); + // allocate set to hold all the concepts to compare to + Map windowContextConcepts = new HashMap(); + // add context concepts (e.g. 
title concepts) + if (contextConcepts != null) { + addConcepts(windowContextConcepts, contextConcepts); + } + // add windowSize concepts from the sentence + // get left, then right concepts + // case 1 - enough tokens on both sides + int indexLeftStart = index - windowSize - 1; + int indexRightStart = index + windowSize + 1; + if (indexLeftStart < 0) { + // case 2 - not enough tokens on left + indexRightStart += (-1 * indexLeftStart); + indexLeftStart = 0; + } else if (indexRightStart >= sentenceConcepts.size()) { + // case 3 - not enough tokens on right + indexLeftStart -= indexRightStart - sentenceConcepts.size() - 1; + indexRightStart = sentenceConcepts.size() - 1; + } + // make sure the range is in bounds + if (indexLeftStart < 0) + indexLeftStart = 0; + if (indexRightStart >= sentenceConcepts.size()) + indexRightStart = sentenceConcepts.size() - 1; + // add the concepts in the ranges + if (indexLeftStart < index) { + for (Set cs : sentenceConcepts.subList(indexLeftStart, + index)) { + addConcepts(windowContextConcepts, cs); + } + } + if (indexRightStart > index) { + for (Set cs : sentenceConcepts.subList(index + 1, + indexRightStart + 1)) { + addConcepts(windowContextConcepts, cs); + } + } + // allocate map to hold scores + TreeMultimap scoreConceptMap = TreeMultimap.create(); + for (String c : candidateConcepts) { + scoreConceptMap + .put(scoreConcept(c, windowContextConcepts, metric, + weighted), c); + } + // if scoreMap is not null, fill it in with the concept scores - invert + // scoreConceptMap + boolean bNonZero = false; + if (scoreMap != null) { + for (Map.Entry scoreConcept : scoreConceptMap + .entries()) { + scoreMap.put(scoreConcept.getValue(), scoreConcept.getKey()); + } + } + SortedSet bestConcepts = scoreConceptMap.get(scoreConceptMap + .keySet().last()); + String bestConcept = null; + if (bestConcepts.size() == 1) { + // only 1 concept with high score + bestConcept = bestConcepts.iterator().next(); + } else if (bestConcepts.size() == 
candidateConcepts.size()) { + // all concepts have same score + bestConcept = null; + } else { + // multiple best candidates - pick concept with lowest ic - most + // general concept + double ic = 1e6; + Map conceptMap = this + .getConceptSimilarityService().getConceptGraph() + .getConceptMap(); + for (String c : bestConcepts) { + ConcRel cr = conceptMap.get(c); + if (cr != null && cr.getIntrinsicInfoContent() < ic) { + ic = cr.getIntrinsicInfoContent(); + bestConcept = c; + } + } + } + // get the best scoring concept + return bestConcept; + } + + private void addConcepts(Map windowContextConcepts, + Set contextConcepts) { + for (String c : contextConcepts) { + Integer cn = windowContextConcepts.get(c); + if (cn != null) { + windowContextConcepts.put(c, cn + 1); + } else { + windowContextConcepts.put(c, 1); + } + } + } + + private double scoreConcept(String concept, + Map windowContextConcepts, + SimilarityMetricEnum metric, boolean weighted) { + List metrics = Arrays.asList(metric); + double score = 0d; + for (Map.Entry windowConcept : windowContextConcepts + .entrySet()) { + ConceptPairSimilarity csim = conceptSimilarityService.similarity( + metrics, concept, windowConcept.getKey(), null, false); + if (weighted) + score += csim.getSimilarities().get(0) + * windowConcept.getValue().doubleValue(); + else + score += csim.getSimilarities().get(0); + } + return score; + } + +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMFormatterFactory.java Mon Dec 16 16:30:30 2013 
@@ -0,0 +1,193 @@ +package org.apache.ctakes.ytex.libsvm; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.SortedMap; + +import org.apache.ctakes.ytex.kernel.BaseSparseDataFormatter; +import org.apache.ctakes.ytex.kernel.FileUtil; +import org.apache.ctakes.ytex.kernel.InstanceData; +import org.apache.ctakes.ytex.kernel.KernelUtil; +import org.apache.ctakes.ytex.kernel.SparseData; +import org.apache.ctakes.ytex.kernel.SparseDataFormatter; +import org.apache.ctakes.ytex.kernel.SparseDataFormatterFactory; + +import com.google.common.collect.BiMap; + + +public class LibSVMFormatterFactory implements SparseDataFormatterFactory { + KernelUtil kernelUtil; + + public KernelUtil getKernelUtil() { + return kernelUtil; + } + + public void setKernelUtil(KernelUtil kernelUtil) { + this.kernelUtil = kernelUtil; + } + + /* + * (non-Javadoc) + * + * @see org.apache.ctakes.ytex.libsvm.SparseDataFormatterFactory#getFormatter() + */ + @Override + public SparseDataFormatter getFormatter() { + return new LibSVMFormatter(getKernelUtil()); + } + + public static class LibSVMFormatter extends BaseSparseDataFormatter { + @Override + public void initializeExport(InstanceData instanceLabel, + Properties properties, SparseData sparseData) + throws IOException { + super.initializeExport(instanceLabel, properties, sparseData); + } + + public LibSVMFormatter(KernelUtil kernelUtil) { + super(kernelUtil); + } + + @Override + public void initializeLabel( + String label, + SortedMap>>> labelInstances, + Properties properties, SparseData sparseData) + throws IOException { + kernelUtil.exportClassIds(this.outdir, + this.labelToClassIndexMap.get(label), label); + } + + /** + * write a file with the attribute names corresponding to the indices in + * the libsvm data file + */ + @Override + public void initializeFold(SparseData sparseData, 
String label, + Integer run, Integer fold, + SortedMap> foldInstanceLabelMap) + throws IOException { + exportAttributeNames(sparseData, label, run, fold); + } + + /** + * export the given train/test set + */ + @Override + public void exportFold(SparseData sparseData, + SortedMap instanceClassMap, boolean train, + String label, Integer run, Integer fold) throws IOException { + String filename = FileUtil.getDataFilePrefix(outdir, label, run, + fold, train) + "_data.txt"; + String idFilename = FileUtil.getDataFilePrefix(outdir, label, run, + fold, train) + "_id.txt"; + exportDataForLabel(filename, idFilename, sparseData, + instanceClassMap, this.labelToClassIndexMap.get(label)); + } + + /** + * Export data file and id file + * + * @param filename + * @param idFilename + * @param bagOfWordsData + * @param instanceClassMap + * @param numericAttributeMap + * @param nominalAttributeMap + * @param label + * @throws IOException + */ + protected void exportDataForLabel(String filename, String idFilename, + SparseData bagOfWordsData, + SortedMap instanceClassMap, + BiMap classToIndexMap) throws IOException { + BufferedWriter wData = null; + BufferedWriter wId = null; + try { + wData = new BufferedWriter(new FileWriter(filename)); + wId = new BufferedWriter(new FileWriter(idFilename)); + exportDataForInstances(bagOfWordsData, instanceClassMap, + classToIndexMap, wData, wId); + } finally { + if (wData != null) + wData.close(); + if (wId != null) + wId.close(); + } + } + + /** + * + * @param bagOfWordsData + * data to be exported + * @param instanceClassMap + * instance ids - class name map + * @param classToIndexMap + * class name - class id map + * @param wData + * file to write data to + * @param wId + * file to write ids to + * @return list of instance ids corresponding to order with which they + * were exported + * @throws IOException + */ + protected List exportDataForInstances(SparseData bagOfWordsData, + SortedMap instanceClassMap, + BiMap classToIndexMap, BufferedWriter 
wData, + BufferedWriter wId) throws IOException { + List instanceIds = new ArrayList(); + for (Map.Entry instanceClass : instanceClassMap + .entrySet()) { + long instanceId = instanceClass.getKey(); + instanceIds.add(instanceId); + // allocate line with sparse attribute indices and values + SortedMap instanceValues = getSparseLineValues( + bagOfWordsData, numericAttributeMap, + nominalAttributeMap, instanceId); + // data file + // write class id + int classId = classToIndexMap.get(instanceClass.getValue()); + // write id to id file + wId.write(Long.toString(instanceId)); + wId.newLine(); + wData.write(Integer.toString(classId)); + // write attributes + // add the attributes + writeLibsvmLine(wData, instanceValues); + } + return instanceIds; + } + + protected void writeLibsvmLine(BufferedWriter wData, + SortedMap instanceValues) throws IOException { + for (SortedMap.Entry instanceValue : instanceValues + .entrySet()) { + wData.write("\t"); + wData.write(Integer.toString(instanceValue.getKey())); + wData.write(":"); + wData.write(Double.toString(instanceValue.getValue())); + } + wData.newLine(); + } + + /** + * clean up fold specific state + */ + @Override + public void clearFold() { + this.numericAttributeMap.clear(); + this.nominalAttributeMap.clear(); + } + + @Override + public void clearLabel() { + } + } + +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporter.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,10 @@ +package 
org.apache.ctakes.ytex.libsvm; + +import java.io.IOException; +import java.util.Properties; + +public interface LibSVMGramMatrixExporter { + + public abstract void exportGramMatrix(Properties props) throws IOException; + +} \ No newline at end of file Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMGramMatrixExporterImpl.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,510 @@ +package org.apache.ctakes.ytex.libsvm; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeSet; + +import javax.sql.DataSource; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.ctakes.ytex.kernel.FileUtil; +import org.apache.ctakes.ytex.kernel.InstanceData; +import org.apache.ctakes.ytex.kernel.KernelContextHolder; +import org.apache.ctakes.ytex.kernel.KernelUtil; +import org.apache.ctakes.ytex.kernel.dao.KernelEvaluationDao; +import org.springframework.jdbc.core.JdbcTemplate; +import 
org.springframework.transaction.PlatformTransactionManager; + +import com.google.common.collect.BiMap; + + +/** + * export gram matrix for libsvm. input properties file with following keys: + *

+ *

  • kernel.name name of kernel evaluation (corresponds to name column in + * kernel_eval table) - required + *
  • outdir directory where files will be placed - optional, defaults to current + * directory + *

    + * Output to outdir the following files: + *

  • train_data.txt - for each class label, a symmetric gram matrix for + * training instances + *
  • train_id.txt - instance ids corresponding to rows of training gram matrix + *
  • test_data.txt - for each class label, a rectangular matrix of the test + * instances kernel evaluations wrt training instances + *
  • test_id.txt - instance ids corresponding to rows of test gram matrix + * + * @author vijay + */ +public class LibSVMGramMatrixExporterImpl implements LibSVMGramMatrixExporter { + @SuppressWarnings("static-access") + public static void main(String args[]) throws IOException { + Options options = new Options(); + options.addOption(OptionBuilder + .withArgName("prop") + .hasArg() + .isRequired() + .withDescription( + "property file with queries and other kernel parameters") + .create("prop")); + try { + CommandLineParser parser = new GnuParser(); + CommandLine line = parser.parse(options, args); + LibSVMGramMatrixExporter exporter = (LibSVMGramMatrixExporter) KernelContextHolder + .getApplicationContext() + .getBean("libSVMGramMatrixExporter"); + exporter.exportGramMatrix(FileUtil.loadProperties( + line.getOptionValue("prop"), true)); + } catch (ParseException pe) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp( + "java " + LibSVMGramMatrixExporterImpl.class.getName() + + " export gram matrix in libsvm format", options); + } + } + + private JdbcTemplate jdbcTemplate = null; + private KernelEvaluationDao kernelEvaluationDao = null; + private KernelUtil kernelUtil; + private LibSVMUtil libsvmUtil; + + private PlatformTransactionManager transactionManager; + + /** + * export the train or test gram matrix. the train gram matrix is square and + * symmetric. the test gram matrix is rectangular - each column corresponds + * to a training instance each row corresponds to a test instance. 
+ * + * @param gramMatrix + * square symmetric matrix with all available instance data + * @param instanceIdToClassMap + * folds + * @param train + * true - export train set, false - export test set + * @param mapInstanceIdToIndex + * map of instance id to index in gramMatrix + * @param filePrefix + * - prefix to which we add train_data.txt + * @param mapClassToIndex + * @throws IOException + */ + private void exportFold(double[][] gramMatrix, + Map> instanceIdToClassMap, + boolean train, Map mapInstanceIdToIndex, + String filePrefix, Map mapClassToIndex) + throws IOException { + String fileName = new StringBuilder(filePrefix).append("_data.txt") + .toString(); + String idFileName = new StringBuilder(filePrefix).append("_id.txt") + .toString(); + BufferedWriter w = null; + BufferedWriter wId = null; + // for both training and test sets, the column instance ids + // are the training instance ids. This is already sorted, + // but we stuff it in a list, so make sure it is sorted + // the order has to be the same in both the train and test files + List colInstanceIds = new ArrayList(instanceIdToClassMap + .get(true).keySet()); + Collections.sort(colInstanceIds); + // the rows - train or test instance ids and their class labels + SortedMap rowInstanceToClassMap = instanceIdToClassMap + .get(train); + try { + w = new BufferedWriter(new FileWriter(fileName)); + wId = new BufferedWriter(new FileWriter(idFileName)); + int rowIndex = 0; + // the rows in the gramMatrix correspond to the entries in the + // instanceLabelMap + // both are in the same order + for (Map.Entry instanceClass : rowInstanceToClassMap + .entrySet()) { + // classId - we assume that this is value is valid for libsvm + // this can be a real number (for regression) + String classId = instanceClass.getValue(); + // the instance id of this row + long rowInstanceId = instanceClass.getKey(); + // the index to gramMatrix corresponding to this instance + int rowInstanceIndex = 
mapInstanceIdToIndex.get(rowInstanceId); + // write class Id + w.write(mapClassToIndex.get(classId).toString()); + w.write("\t"); + // write row number - libsvm uses 1-based indexing + w.write("0:"); + w.write(Integer.toString(rowIndex + 1)); + // write column entries + for (int columnIndex = 0; columnIndex < colInstanceIds.size(); columnIndex++) { + // column instance id + long colInstanceId = colInstanceIds.get(columnIndex); + // index into gram matrix for this instance id + int colInstanceIndex = mapInstanceIdToIndex + .get(colInstanceId); + w.write("\t"); + // write column number + w.write(Integer.toString(columnIndex + 1)); + w.write(":"); + // write value - gramMatrix is symmetric, so this will work + // both ways + w.write(Double + .toString(gramMatrix[rowInstanceIndex][colInstanceIndex])); + } + // don't want carriage return, even on windows + w.write("\n"); + // increment the row number + rowIndex++; + // write id to file + wId.write(Long.toString(rowInstanceId)); + wId.write("\n"); + } + } finally { + if (w != null) + w.close(); + if (wId != null) + wId.close(); + } + + } + + /** + * Load the gram matrix based on scope. Write the gram matrix for each fold. + * Generate 4 files per fold: train_data.txt, train_id.txt, test_data.txt, + * test_id.txt. 
+ * + */ + private void exportGramMatrices(String name, String experiment, + double param1, String param2, String scope, String splitName, + String outdir, InstanceData instanceData, + Map> labelToClassIndexMap) + throws IOException { + // the full, symmetric gram matrix + double[][] gramMatrix = null; + // the set of all instance ids + SortedSet instanceIds = new TreeSet(); + // map of instance id to index in gramMatrix + Map mapInstanceIdToIndex = new HashMap(); + if (scope == null || scope.length() == 0) { + // empty scope - load gram matrix + gramMatrix = loadGramMatrix(name, experiment, param1, param2, + splitName, null, 0, 0, instanceData, instanceIds, + mapInstanceIdToIndex); + if (gramMatrix == null) + return; + } + for (String label : instanceData.getLabelToInstanceMap().keySet()) { + if ("label".equals(scope)) { + // label scope - load gram matrix + gramMatrix = loadGramMatrix(name, experiment, param1, param2, + splitName, label, 0, 0, instanceData, instanceIds, + mapInstanceIdToIndex); + if (gramMatrix == null) + return; + } + // write the properties file with the class id to class name map + kernelUtil.exportClassIds(outdir, labelToClassIndexMap.get(label), + label); + for (int run : instanceData.getLabelToInstanceMap().get(label) + .keySet()) { + for (int fold : instanceData.getLabelToInstanceMap().get(label) + .get(run).keySet()) { + if ("fold".equals(scope)) { + // fold scope - load gram matrix + gramMatrix = loadGramMatrix(name, experiment, param1, + param2, splitName, label, run, fold, + instanceData, instanceIds, mapInstanceIdToIndex); + } + if (gramMatrix != null) { + // get folds + Map> foldMap = instanceData + .getLabelToInstanceMap().get(label).get(run) + .get(fold); + // export training fold + exportFold(gramMatrix, foldMap, true, + mapInstanceIdToIndex, + FileUtil.getDataFilePrefix(outdir, label, run, + fold, true), + labelToClassIndexMap.get(label)); + // export test fold + exportFold(gramMatrix, foldMap, false, + mapInstanceIdToIndex, + 
FileUtil.getDataFilePrefix(outdir, label, run, + fold, false), + labelToClassIndexMap.get(label)); + } + } + } + } + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.ctakes.ytex.libsvm.LibSVMGramMatrixExporter#exportGramMatrix(java.util.Properties + * ) + */ + public void exportGramMatrix(Properties props) throws IOException { + String name = props.getProperty("org.apache.ctakes.ytex.corpusName"); + String experiment = props.getProperty("org.apache.ctakes.ytex.experiment"); + String param2 = props.getProperty("org.apache.ctakes.ytex.param2"); + double param1 = Double.parseDouble(props + .getProperty("org.apache.ctakes.ytex.param1", "0")); + String scope = props.getProperty("scope"); + InstanceData instanceData = this.getKernelUtil().loadInstances( + props.getProperty("instanceClassQuery")); + String splitName = props.getProperty("org.apache.ctakes.ytex.splitName"); + String outdir = props.getProperty("outdir"); + Map> labelToClassIndexMap = new HashMap>(); + kernelUtil.fillLabelToClassToIndexMap( + instanceData.getLabelToClassMap(), labelToClassIndexMap); + exportGramMatrices(name, experiment, param1, param2, scope, splitName, + outdir, instanceData, labelToClassIndexMap); + } + + public DataSource getDataSource() { + return jdbcTemplate.getDataSource(); + } + + public KernelEvaluationDao getKernelEvaluationDao() { + return kernelEvaluationDao; + } + + public KernelUtil getKernelUtil() { + return kernelUtil; + } + + public LibSVMUtil getLibsvmUtil() { + return libsvmUtil; + } + + public PlatformTransactionManager getTransactionManager() { + return transactionManager; + } + + private double[][] loadGramMatrix(String name, String experiment, + double param1, String param2, String splitName, String label, + int run, int fold, InstanceData instanceData, + SortedSet instanceIds, Map mapInstanceIdToIndex) { + double[][] gramMatrix; + instanceIds.clear(); + mapInstanceIdToIndex.clear(); + instanceIds.addAll(instanceData.getAllInstanceIds(label, run, fold)); + int 
index = 0; + for (long instanceId : instanceIds) { + mapInstanceIdToIndex.put(instanceId, index++); + } + gramMatrix = this.kernelUtil.loadGramMatrix(instanceIds, name, + splitName, experiment, label, run, fold, param1, param2); + return gramMatrix; + } + + public void setDataSource(DataSource dataSource) { + this.jdbcTemplate = new JdbcTemplate(dataSource); + } + + public void setKernelEvaluationDao(KernelEvaluationDao kernelEvaluationDao) { + this.kernelEvaluationDao = kernelEvaluationDao; + } + + // private void exportFold(String name, String experiment, String outdir, + // InstanceData instanceData, String label, int run, int fold, + // double param1, String param2) throws IOException { + // SortedMap trainInstanceLabelMap = instanceData + // .getLabelToInstanceMap().get(label).get(run).get(fold) + // .get(true); + // SortedMap testInstanceLabelMap = instanceData + // .getLabelToInstanceMap().get(label).get(run).get(fold) + // .get(false); + // double[][] trainGramMatrix = new + // double[trainInstanceLabelMap.size()][trainInstanceLabelMap + // .size()]; + // double[][] testGramMatrix = null; + // if (testInstanceLabelMap != null) { + // testGramMatrix = new + // double[testInstanceLabelMap.size()][trainInstanceLabelMap + // .size()]; + // } + // KernelEvaluation kernelEval = this.kernelEvaluationDao.getKernelEval( + // name, experiment, label, 0, param1, param2); + // kernelUtil.fillGramMatrix(kernelEval, new TreeSet( + // trainInstanceLabelMap.keySet()), trainGramMatrix, + // testInstanceLabelMap != null ? new TreeSet( + // testInstanceLabelMap.keySet()) : null, testGramMatrix); + // outputGramMatrix(kernelEval, trainInstanceLabelMap, trainGramMatrix, + // FileUtil.getDataFilePrefix(outdir, label, run, fold, + // testInstanceLabelMap != null ? 
true : null)); + // if (testGramMatrix != null) { + // outputGramMatrix(kernelEval, testInstanceLabelMap, testGramMatrix, + // FileUtil.getDataFilePrefix(outdir, label, run, fold, false)); + // } + // } + // + // private void outputGramMatrix(KernelEvaluation kernelEval, + // SortedMap instanceLabelMap, double[][] gramMatrix, + // String dataFilePrefix) throws IOException { + // StringBuilder bFileName = new StringBuilder(dataFilePrefix) + // .append("_data.txt"); + // StringBuilder bIdFileName = new StringBuilder(dataFilePrefix) + // .append("_id.txt"); + // BufferedWriter w = null; + // BufferedWriter wId = null; + // try { + // w = new BufferedWriter(new FileWriter(bFileName.toString())); + // wId = new BufferedWriter(new FileWriter(bIdFileName.toString())); + // int rowIndex = 0; + // // the rows in the gramMatrix correspond to the entries in the + // // instanceLabelMap + // // both are in the same order + // for (Map.Entry instanceClass : instanceLabelMap + // .entrySet()) { + // // default the class Id to 0 + // String classId = instanceClass.getValue(); + // int instanceId = instanceClass.getKey(); + // // write class Id + // w.write(classId); + // w.write("\t"); + // // write row number - libsvm uses 1-based indexing + // w.write("0:"); + // w.write(Integer.toString(rowIndex + 1)); + // // write column entries + // for (int columnIndex = 0; columnIndex < gramMatrix[rowIndex].length; + // columnIndex++) { + // w.write("\t"); + // // write column number + // w.write(Integer.toString(columnIndex + 1)); + // w.write(":"); + // // write value + // w.write(Double.toString(gramMatrix[rowIndex][columnIndex])); + // } + // w.newLine(); + // // increment the row number + // rowIndex++; + // // write id file + // wId.write(Integer.toString(instanceId)); + // wId.newLine(); + // } + // } finally { + // if (w != null) + // w.close(); + // if (wId != null) + // wId.close(); + // } + // } + + // /** + // * instantiate gram matrices, generate output files + // * + // * 
@param name + // * @param testInstanceQuery + // * @param trainInstanceQuery + // * @param outdir + // * @throws IOException + // */ + // private void exportGramMatrices(String name, String testInstanceQuery, + // String trainInstanceQuery, String outdir) throws IOException { + // Set labels = new HashSet(); + // SortedMap> trainInstanceLabelMap = + // libsvmUtil + // .loadClassLabels(trainInstanceQuery, labels); + // double[][] trainGramMatrix = new + // double[trainInstanceLabelMap.size()][trainInstanceLabelMap + // .size()]; + // SortedMap> testInstanceLabelMap = null; + // double[][] testGramMatrix = null; + // if (testInstanceQuery != null) { + // testInstanceLabelMap = libsvmUtil.loadClassLabels( + // testInstanceQuery, labels); + // testGramMatrix = new + // double[testInstanceLabelMap.size()][trainInstanceLabelMap + // .size()]; + // } + // // fillGramMatrix(name, trainInstanceLabelMap, trainGramMatrix, + // // testInstanceLabelMap, testGramMatrix); + // for (String label : labels) { + // outputGramMatrix(name, outdir, label, trainInstanceLabelMap, + // trainGramMatrix, "training"); + // if (testGramMatrix != null) { + // outputGramMatrix(name, outdir, label, testInstanceLabelMap, + // testGramMatrix, "test"); + // } + // } + // libsvmUtil.outputInstanceIds(outdir, trainInstanceLabelMap, "training"); + // if (testInstanceLabelMap != null) + // libsvmUtil.outputInstanceIds(outdir, testInstanceLabelMap, "test"); + // } + + // private void outputGramMatrix(String name, String outdir, String label, + // SortedMap> instanceLabelMap, + // double[][] gramMatrix, String type) throws IOException { + // StringBuilder bFileName = new StringBuilder(outdir) + // .append(File.separator).append(type).append("_data_") + // .append(label).append(".txt"); + // BufferedWriter w = null; + // try { + // w = new BufferedWriter(new FileWriter(bFileName.toString())); + // int rowIndex = 0; + // // the rows in the gramMatrix correspond to the entries in the + // // instanceLabelMap 
+ // // both are in the same order + // for (Map.Entry> instanceLabels : + // instanceLabelMap + // .entrySet()) { + // // default the class Id to 0 + // int classId = 0; + // if (instanceLabels.getValue() != null + // && instanceLabels.getValue().containsKey(label)) { + // classId = instanceLabels.getValue().get(label); + // } + // // write class Id + // w.write(Integer.toString(classId)); + // w.write("\t"); + // // write row number - libsvm uses 1-based indexing + // w.write("0:"); + // w.write(Integer.toString(rowIndex + 1)); + // // write column entries + // for (int columnIndex = 0; columnIndex < gramMatrix[rowIndex].length; + // columnIndex++) { + // w.write("\t"); + // // write column number + // w.write(Integer.toString(columnIndex + 1)); + // w.write(":"); + // // write value + // w.write(Double.toString(gramMatrix[rowIndex][columnIndex])); + // } + // w.newLine(); + // // increment the row number + // rowIndex++; + // } + // } finally { + // if (w != null) + // w.close(); + // } + // } + + public void setKernelUtil(KernelUtil kernelUtil) { + this.kernelUtil = kernelUtil; + } + + public void setLibsvmUtil(LibSVMUtil libsvmUtil) { + this.libsvmUtil = libsvmUtil; + } + + public void setTransactionManager( + PlatformTransactionManager transactionManager) { + this.transactionManager = transactionManager; + } + +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMParser.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,376 @@ +package org.apache.ctakes.ytex.libsvm; + +import java.io.BufferedReader; 
+import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.ctakes.ytex.kernel.BaseClassifierEvaluationParser; +import org.apache.ctakes.ytex.kernel.model.ClassifierEvaluation; +import org.apache.ctakes.ytex.kernel.model.ClassifierInstanceEvaluation; +import org.apache.ctakes.ytex.kernel.model.SVMClassifierEvaluation; + + +public class LibSVMParser extends BaseClassifierEvaluationParser { + public static Pattern labelsPattern = Pattern.compile("labels\\s+(.*)"); + public static Pattern totalSVPattern = Pattern.compile("total_sv (\\d+)"); + public static Pattern pKernel = Pattern.compile("-t\\s+(\\d)"); + public static Pattern pGamma = Pattern.compile("-g\\s+([\\d\\.eE-]+)"); + public static Pattern pCost = Pattern.compile("-c\\s+([\\d\\.eE-]+)"); + public static Pattern pWeight = Pattern + .compile("-w-{0,1}\\d\\s+[\\d\\.]+\\b"); + public static Pattern pDegree = Pattern.compile("-d\\s+(\\d+)"); + + /** + * parse svm-train model file to get the number of support vectors. 
Needed + * for model selection + * + * @param modelFile + * @return + * @throws IOException + */ + public Integer parseModel(String modelFile) throws IOException { + BufferedReader r = null; + try { + r = new BufferedReader(new FileReader(modelFile)); + String line = null; + while ((line = r.readLine()) != null) { + Matcher m = totalSVPattern.matcher(line); + if (m.find()) { + return new Integer(m.group(1)); + } + } + } finally { + try { + if (r != null) + r.close(); + } catch (Exception e) { + System.err.println("reading model file"); + e.printStackTrace(System.err); + } + } + return null; + } + + // /** + // * Parse svm-predict input (instance file) and predictions (prediction + // file) + // * + // * @param predictionFile + // * @param instanceFile + // * @return + // * @throws Exception + // * @throws IOException + // */ + // public ClassifierEvaluationResults parse(String predictionFile, + // String instanceFile, Properties props) throws IOException { + // ClassifierEvaluationResults results = new ClassifierEvaluationResults(); + // List listResults = new + // ArrayList(); + // results.setResults(listResults); + // BufferedReader instanceReader = null; + // BufferedReader predictionReader = null; + // try { + // instanceReader = new BufferedReader(new FileReader(instanceFile)); + // predictionReader = new BufferedReader( + // new FileReader(predictionFile)); + // String instanceLine = null; + // String predictionLine = null; + // int nLine = 0; + // // 1st line in libSVMOutputReader lists labels + // + // results.setClassIds(parseClassIds(predictionReader)); + // // when working with high cutoffs resulting in mainly zero vectors + // // we sometimes have a trivial classification problem (1 class) + // // if (results.getClassIds().size() < 2) + // // throw new Exception("error parsing class ids"); + // while (((instanceLine = instanceReader.readLine()) != null) + // && ((predictionLine = predictionReader.readLine()) != null)) { + // nLine++; + // 
ClassifierEvaluationResult result = new ClassifierEvaluationResult(); + // listResults.add(result); + // String predictTokens[] = wsPattern.split(predictionLine); + // String classIdPredicted = predictTokens[0]; + // String classIdTarget = extractFirstToken(instanceLine, + // wsPattern); + // result.setTargetClassId(Integer.parseInt(classIdTarget)); + // result.setPredictedClassId(Integer.parseInt(classIdPredicted)); + // if (predictTokens.length > 1) { + // double probabilities[] = new double[results.getClassIds() + // .size()]; + // for (int i = 1; i < predictTokens.length; i++) { + // probabilities[i - 1] = Double + // .parseDouble(predictTokens[i]); + // } + // result.setProbabilities(probabilities); + // } + // } + // } finally { + // if (instanceReader != null) { + // try { + // instanceReader.close(); + // } catch (Exception e) { + // System.err.println("testGramReader"); + // e.printStackTrace(System.err); + // } + // } + // if (predictionReader != null) { + // try { + // predictionReader.close(); + // } catch (Exception e) { + // e.printStackTrace(System.err); + // } + // } + // } + // return results; + // } + + /** + * parse class ids from first line in prediction file. 
this correspond to + * probabilities + * + * @param predictionReader + * @return + * @throws IOException + */ + protected List parseClassIds(BufferedReader predictionReader) + throws IOException { + List labels = null; + String labelLine = predictionReader.readLine(); + Matcher labelMatcher = labelsPattern.matcher(labelLine); + if (labelMatcher.find()) { + String labelsA[] = wsPattern.split(labelMatcher.group(1)); + if (labelsA != null && labelsA.length > 0) { + labels = new ArrayList(labelsA.length); + for (String label : labelsA) + labels.add(Integer.parseInt(label)); + } + } + return labels; + } + + protected SVMClassifierEvaluation initClassifierEval(String name, + String experiment, String label, String options, + String instanceIdFile) { + SVMClassifierEvaluation eval = new SVMClassifierEvaluation(); + initClassifierEval(name, experiment, label, options, instanceIdFile, + eval); + return eval; + } + + private void initClassifierEval(String name, String experiment, + String label, String options, String instanceIdFile, + ClassifierEvaluation eval) { + initClassifierEvaluation(instanceIdFile, eval); + eval.setName(name); + eval.setExperiment(experiment); + eval.setOptions(options); + } + + /** + * parse predicted class ids, probabilities; correlate to target class ids + * and instance ids. 
+ * + * @param predictionFile + * prediction (output) + * @param instanceFile + * input data file; contains target class ids + * @param props + * @param instanceIdFile + * instance ids corresponding to lines in input data file + * @param eval + * @throws IOException + */ + protected void parsePredictions(String predictionFile, String instanceFile, + Properties props, String instanceIdFile, + SVMClassifierEvaluation eval) throws IOException { + boolean storeProbabilities = YES.equalsIgnoreCase(props.getProperty( + ParseOption.STORE_PROBABILITIES.getOptionKey(), + ParseOption.STORE_PROBABILITIES.getDefaultValue())); + List instanceIds = null; + if (instanceIdFile != null) + instanceIds = parseInstanceIds(instanceIdFile); + BufferedReader instanceReader = null; + BufferedReader predictionReader = null; + try { + instanceReader = new BufferedReader(new FileReader(instanceFile)); + predictionReader = new BufferedReader( + new FileReader(predictionFile)); + String instanceLine = null; + String predictionLine = null; + int nLine = 0; + // 1st line in libSVMOutputReader lists class ids - parse them out + List classIds = parseClassIds(predictionReader); + // iterate through input data file and output predictions + // simultaneously + while (((instanceLine = instanceReader.readLine()) != null) + && ((predictionLine = predictionReader.readLine()) != null)) { + // get instance id corresponding to this line + long instanceId = instanceIds.size() > nLine ? 
instanceIds + .get(nLine) : nLine; + nLine++; + // allocate instanceEval + ClassifierInstanceEvaluation instanceEval = new ClassifierInstanceEvaluation(); + // parse out predicted class from output predictions + String predictTokens[] = wsPattern.split(predictionLine); + String classIdPredicted = predictTokens[0]; + String classIdTarget = extractFirstToken(instanceLine, + wsPattern); + // parse out target class from input data file + instanceEval.setTargetClassId(Integer.parseInt(classIdTarget)); + instanceEval.setPredictedClassId(Integer + .parseInt(classIdPredicted)); + instanceEval.setInstanceId(instanceId); + instanceEval.setClassifierEvaluation(eval); + // add the instance to the map + eval.getClassifierInstanceEvaluations().put(instanceId, + instanceEval); + // parse class id probabilities + if (storeProbabilities && predictTokens.length > 1) { + for (int i = 1; i < predictTokens.length; i++) { + instanceEval.getClassifierInstanceProbabilities().put( + classIds.get(i - 1), + Double.parseDouble(predictTokens[i])); + } + } + } + } finally { + if (instanceReader != null) { + try { + instanceReader.close(); + } catch (Exception e) { + e.printStackTrace(System.err); + } + } + if (predictionReader != null) { + try { + predictionReader.close(); + } catch (Exception e) { + e.printStackTrace(System.err); + } + } + } + } + + protected void parseOptions(SVMClassifierEvaluation eval, String options) { + // -q -b 1 -t 2 -w1 41 -g 1000 -c 1000 training_data_11_fold9_train.txt + // training_data_11_fold9_model.txt + if (options != null) { + eval.setKernel(parseIntOption(pKernel, options)); + if (eval.getKernel() == null) + eval.setKernel(0); + eval.setDegree(parseIntOption(pDegree, options)); + eval.setWeight(parseWeight(options)); + eval.setCost(parseDoubleOption(pCost, options)); + eval.setGamma(parseDoubleOption(pGamma, options)); + } + } + + /** + * parse the weight options out of the libsvm command line. they are of the + * form -w0 1 -w2 1.5 ... 
+     *
+     * @param options
+     *            libsvm command line
+     * @return null if no weight options, else all weight options in order of
+     *         appearance, each followed by a single space
+     */
+    private String parseWeight(String options) {
+        Matcher weightMatcher = pWeight.matcher(options);
+        // nothing to collect when the first search already fails
+        if (!weightMatcher.find())
+            return null;
+        StringBuilder weights = new StringBuilder();
+        do {
+            weights.append(weightMatcher.group()).append(" ");
+        } while (weightMatcher.find());
+        return weights.toString();
+    }
+
+    /**
+     * parse directory. Expect following files:
+     *
+     * <ul>
+     * <li>model.txt - libsvm model file</li>
+     * <li>options.properties - properties file with needed parameter settings
+     * (see ParseOption)</li>
+     * <li>predict.txt - predictions on test set</li>
+     * </ul>
    + */ + @Override + public void parseDirectory(File dataDir, File outputDir) throws IOException { + String model = outputDir.getPath() + File.separator + "model.txt"; + String predict = outputDir.getPath() + File.separator + "predict.txt"; + String optionsFile = outputDir.getPath() + File.separator + + "options.properties"; + if (checkFileRead(model) && checkFileRead(predict) + && checkFileRead(optionsFile)) { + // read options.properties + Properties props = this.loadProps(outputDir); + SVMClassifierEvaluation eval = new SVMClassifierEvaluation(); + // set algorithm + eval.setAlgorithm("libsvm"); + // parse results + parseResults(dataDir, outputDir, model, predict, eval, props); + // store results + storeResults(dataDir, props, eval); + } + } + + /** + * store the parsed classifier evaluation + * + * @param props + * @param eval + * @throws IOException + */ + protected void storeResults(File dataDir, Properties props, + SVMClassifierEvaluation eval) throws IOException { + // store the classifier evaluation + getClassifierEvaluationDao().saveClassifierEvaluation( + eval, + this.loadClassIdMap(dataDir, eval.getLabel()), + YES.equalsIgnoreCase(props.getProperty( + ParseOption.STORE_INSTANCE_EVAL.getOptionKey(), + ParseOption.STORE_INSTANCE_EVAL.getDefaultValue()))); + } + + /** + * parse the results in the specified output dir. use reference data from + * dataDir. 
+ * + * @param dataDir + * @param outputDir + * @param model + * @param predict + * @param eval + * @param props + * @throws IOException + */ + protected void parseResults(File dataDir, File outputDir, String model, + String predict, SVMClassifierEvaluation eval, Properties props) + throws IOException { + // initialize common properties + initClassifierEvaluationFromProperties(props, eval); + // parse number of support vectors from model + eval.setSupportVectors(this.parseModel(model)); + // parse options from command line + parseOptions(eval, + props.getProperty(ParseOption.EVAL_LINE.getOptionKey())); + // parse fold, run, label from file base name + String fileBaseName = this.getFileBaseName(props); + initClassifierEvaluation(fileBaseName, eval); + // parse predictions + String instanceIdFile = dataDir + File.separator + fileBaseName + + "test_id.txt"; + String instanceFile = dataDir + File.separator + fileBaseName + + "test_data.txt"; + this.parsePredictions(predict, instanceFile, props, instanceIdFile, + eval); + } + +} Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java?rev=1551254&view=auto ============================================================================== --- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java (added) +++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtil.java Mon Dec 16 16:30:30 2013 @@ -0,0 +1,24 @@ +package org.apache.ctakes.ytex.libsvm; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; + +public interface LibSVMUtil { + + /** + * @param strQuery + * query to get instance id - class label + * @param labels + * fill with distinct labels + * @return Map[Instance ID, Map[Class Label, Class Id]] + */ + public abstract 
SortedMap> loadClassLabels( + String strQuery, final Set labels); + + public void outputInstanceIds(String outdir, + SortedMap> trainInstanceLabelMap, + String string) throws IOException; + +} \ No newline at end of file