ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vjapa...@apache.org
Subject svn commit: r1551254 [24/26] - in /ctakes/branches/ytex: ctakes-ytex-res/ ctakes-ytex-res/.settings/ ctakes-ytex-res/src/ ctakes-ytex-res/src/main/ ctakes-ytex-res/src/main/resources/ ctakes-ytex-res/src/main/resources/org/ ctakes-ytex-res/src/main/res...
Date Mon, 16 Dec 2013 16:30:40 GMT
Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtilImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtilImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtilImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/libsvm/LibSVMUtilImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,74 @@
+package org.apache.ctakes.ytex.libsvm;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import javax.sql.DataSource;
+
+import org.springframework.jdbc.core.JdbcTemplate;
+import org.springframework.jdbc.core.RowCallbackHandler;
+
+public class LibSVMUtilImpl implements LibSVMUtil {
+	private JdbcTemplate jdbcTemplate = null;
+	public DataSource getDataSource() {
+		return jdbcTemplate.getDataSource();
+	}
+
+	public void setDataSource(DataSource dataSource) {
+		this.jdbcTemplate = new JdbcTemplate(dataSource);
+	}
+
+	/**
+	 * Run the query and group its rows by instance id (column 1); column 2 is
+	 * read as the label and column 3 as the class id. Each label is also added to labels.
+	 * @see org.apache.ctakes.ytex.libsvm.LibSVMUtil#loadClassLabels(java.lang.String, java.util.Set)
+	 */
+	public SortedMap<Integer, Map<String, Integer>> loadClassLabels(
+			String strQuery, final Set<String> labels) {
+		final SortedMap<Integer, Map<String, Integer>> labelsByInstance = new TreeMap<Integer, Map<String, Integer>>();
+		RowCallbackHandler rowHandler = new RowCallbackHandler() {
+			@Override
+			public void processRow(ResultSet rs) throws SQLException {
+				Integer instanceKey = rs.getInt(1);
+				String labelName = rs.getString(2);
+				int classId = rs.getInt(3);
+				if (!labelsByInstance.containsKey(instanceKey)) {
+					labelsByInstance.put(instanceKey, new HashMap<String, Integer>(1));
+				}
+				labels.add(labelName);
+				labelsByInstance.get(instanceKey).put(labelName, classId);
+			}
+		};
+		jdbcTemplate.query(strQuery, rowHandler);
+		return labelsByInstance;
+	}
+	/** Write the map's instance ids, one per line in key order, to outdir/[string]_instance_ids.txt. */
+	public void outputInstanceIds(String outdir,
+			SortedMap<Integer, Map<String, Integer>> trainInstanceLabelMap,
+			String string) throws IOException {
+		StringBuilder idFile = new StringBuilder(outdir);
+		idFile.append(File.separator).append(string);
+		idFile.append("_instance_ids").append(".txt");
+		BufferedWriter idWriter = null;
+		try {
+			idWriter = new BufferedWriter(new FileWriter(idFile.toString()));
+			for (Integer id : trainInstanceLabelMap.keySet()) {
+				idWriter.write(id.toString());
+				idWriter.newLine();
+			}
+		} finally {
+			if (idWriter != null)
+				idWriter.close();
+		}
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLEvaluationParser.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLEvaluationParser.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLEvaluationParser.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLEvaluationParser.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,267 @@
+package org.apache.ctakes.ytex.semil;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.kernel.BaseClassifierEvaluationParser;
+import org.apache.ctakes.ytex.kernel.KernelContextHolder;
+import org.apache.ctakes.ytex.kernel.model.ClassifierEvaluation;
+import org.apache.ctakes.ytex.kernel.model.SemiLClassifierEvaluation;
+
+import com.google.common.collect.BiMap;
+
+
+/**
+ * Parse semiL output, store in DB. With semiL there is no test data set - just
+ * training data & unlabelled data. Need the following files:
+ * <ul>
+ * <li>*.output - semil prediction output
+ * <li>options.properties - options passed to semil (semil.distance), other
+ * options (kernel.name, kernel.experiment)
+ * <li>_test_data.txt - test class ids
+ * <li>_train_id.txt - training label ids
+ * <li>_test_id.txt - test label ids
+ * </ul>
+ * 
+ * The semil output may contain the output of multiple runs. Will create a
+ * classifier_eval record for each run. SemiL may change the label of labeled
+ * data (soft label); currently the relabeled instances will not be stored.
+ * 
+ * @author vhacongarlav
+ * 
+ */
+public class SemiLEvaluationParser extends BaseClassifierEvaluationParser {
+	public final static Log log = LogFactory
+			.getLog(SemiLEvaluationParser.class);
+	/**
+	 * patterns that extract single option values from a semil options line:
+	 * 
+	 * <pre>
+	 *  gamma=10.000000 mu=0.500000 lambda=0.010100 hard_label=1 Laplacian=1 percentage of labeled points =0.000000
+	 * </pre>
+	 */
+	public static Pattern pGamma = Pattern.compile("gamma=([\\d\\.\\-\\+e]+)");
+	public static Pattern pMu = Pattern.compile("mu=([\\d\\.\\-\\+e]+)");
+	public static Pattern pLambda = Pattern
+			.compile("lambda=([\\d\\.\\-\\+e]+)");
+	public static Pattern pLabel = Pattern.compile("hard_label=([01])");
+	public static Pattern pLaplacian = Pattern.compile("Laplacian=([01])");
+	public static Pattern pPercent = Pattern
+			.compile("labeled points =([\\d\\.\\-\\+e]+)");
+	/**
+	 * matches <tt>dist_[metric]_[degree]</tt> in a file name such as
+	 * <tt>label1_dist_pearson_5.txt</tt>: group 1 is the metric, group 2 the degree.
+	 */
+	public static Pattern pOutput = Pattern.compile("dist_(\\w+)_(\\d+)");
+
+	/**
+	 * 
+	 * @param fileBaseName
+	 *            e.g. label1_run1_fold1
+	 * @param dataDir
+	 *            where train, test, id files are located
+	 * @param outputDir
+	 *            where classifier output is stored
+	 */
+	public void parseDirectory(File dataDir, File outputDir) throws IOException {
+		Properties kernelProps = this.loadProps(outputDir);
+		// get the name of the label file
+		String labelBase = kernelProps.getProperty("kernel.label.basename");
+		if (labelBase != null && labelBase.length() > 0) {
+			// load instance ids and their class ids
+			// construct the name of the class.txt file
+			String classFileName = dataDir + File.separator
+					+ labelBase.substring(0, labelBase.length() - "label".length())
+					+ "class.txt";			
+			List<InstanceClassInfo> listClassInfo = super.loadInstanceClassInfo(dataDir,
+					classFileName);
+			// process .output files
+			if (listClassInfo != null) {
+				for (File output : outputDir.listFiles(new FilenameFilter() {
+
+					@Override
+					public boolean accept(File dir, String name) {
+						return name.endsWith(".output");
+					}
+				})) {
+					parseSemiLOutput(dataDir, labelBase, kernelProps, output,
+							listClassInfo);
+				}
+			}
+		} else {
+			log.warn("couldn't parse directory; kernel.label.base not defined. Dir: "
+					+ outputDir);
+		}
+	}
+
+	/**
+	 * Parse one semil output file. The file is read as pairs of lines: an
+	 * options line followed by a line of predicted class ids. One classifier
+	 * evaluation is built per pair and handed to storeSemiSupervised.
+	 * 
+	 * @param dataDir
+	 *            directory passed on to loadClassIdMap
+	 * @param fileBaseName
+	 *            parse label, run and fold out of this, e.g.
+	 *            label1_run1_fold1_xxx
+	 * @param kernelProps
+	 *            from options.properties
+	 * @param output
+	 *            semil output file with predictions
+	 * @param listClassInfo
+	 *            instance and class ids
+	 * @throws IOException
+	 */
+	private void parseSemiLOutput(File dataDir, String fileBaseName, Properties kernelProps,
+			File output, List<InstanceClassInfo> listClassInfo) throws IOException {
+		BufferedReader reader = null;
+		try {
+			reader = new BufferedReader(new FileReader(output));
+			while (true) {
+				// each run is an options line followed by a predictions line
+				String optionsLine = reader.readLine();
+				String predictLine = optionsLine == null ? null : reader.readLine();
+				if (predictLine == null) {
+					break;
+				}
+				SemiLClassifierEvaluation eval = new SemiLClassifierEvaluation();
+				// label, fold etc. are set from the file base name; name and experiment from the properties
+				this.initClassifierEvaluation(fileBaseName, eval);
+				this.initClassifierEvaluationFromProperties(kernelProps, eval);
+				BiMap<Integer, String> classIdToNameMap = loadClassIdMap(dataDir, eval.getLabel());
+				parseOptions(eval, optionsLine, kernelProps, output.getName());
+				String storeUnlabeledValue = kernelProps.getProperty(
+						ParseOption.STORE_UNLABELED.getOptionKey(),
+						ParseOption.STORE_UNLABELED.getDefaultValue());
+				boolean storeUnlabeled = YES.equalsIgnoreCase(storeUnlabeledValue);
+				parsePredictedClasses(eval, predictLine, listClassInfo, storeUnlabeled, classIdToNameMap);
+				this.storeSemiSupervised(kernelProps, eval, classIdToNameMap);
+			}
+		} finally {
+			if (reader != null) {
+				try {
+					reader.close();
+				} catch (Exception ignore) {
+				}
+			}
+		}
+	}
+
+	/**
+	 * parse class predictions for test instances out of semil output.
+	 * 
+	 * @param ce evaluation to update
+	 * @param predictLine line with whitespace-separated predicted class ids
+	 * @param listClassInfo instance and class ids
+	 * @param storeUnlabeled should all predictions - not only for test
+	 *            instances - be stored?
+	 * @param classIdToNameMap class id to class name; also used inverted
+	 */
+	private void parsePredictedClasses(ClassifierEvaluation ce,
+			String predictLine, List<InstanceClassInfo> listClassInfo,
+			boolean storeUnlabeled, BiMap<Integer, String> classIdToNameMap) {
+		// trim and split on whitespace runs so that padding or repeated
+		// separators do not produce empty, unparseable tokens
+		String strClassIds[] = predictLine.trim().split("\\s+");
+		String classNames[] = new String[strClassIds.length];
+		for (int i = 0; i < strClassIds.length; i++) {
+			classNames[i] = classIdToNameMap.get(Integer.parseInt(strClassIds[i]));
+		}
+		updateSemiSupervisedPredictions(ce, listClassInfo, storeUnlabeled,
+				classNames, classIdToNameMap.inverse());
+	}
+
+	/**
+	 * Fill the evaluation from one semil options line, for example:
+	 * <pre>
+	 * gamma=10.000000  mu=0.500000  lambda=0.010100 hard_label=1 Laplacian=1 percentage of labeled points =0.000000 data_size=242
+	 * </pre>
+	 * Distance and degree are taken from the output file name when it contains
+	 * <tt>dist_[metric]_[degree]</tt>; kernelProps is not read here.
+	 * 
+	 * @param ce evaluation to update
+	 * @param optionsLine options line; stored as-is and parsed for values
+	 * @param kernelProps from options.properties
+	 * @param outputName name of the semil output file
+	 */
+	private void parseOptions(SemiLClassifierEvaluation ce, String optionsLine,
+			Properties kernelProps, String outputName) {
+		ce.setOptions(optionsLine);
+		ce.setGamma(this.parseDoubleOption(pGamma, optionsLine));
+		ce.setLambda(this.parseDoubleOption(pLambda, optionsLine));
+		ce.setMu(this.parseDoubleOption(pMu, optionsLine));
+		ce.setPercentLabeled(this.parseDoubleOption(pPercent, optionsLine));
+		boolean laplacianOn = this.parseIntOption(pLaplacian, optionsLine) == 1;
+		ce.setNormalizedLaplacian(laplacianOn);
+		// NOTE(review): soft label is set when hard_label=1 - confirm this is intended
+		boolean hardLabelOn = this.parseIntOption(pLabel, optionsLine) == 1;
+		ce.setSoftLabel(hardLabelOn);
+		// metric and degree are encoded in the output file name
+		Matcher distanceMatcher = pOutput.matcher(outputName);
+		if (distanceMatcher.find()) {
+			ce.setDistance(distanceMatcher.group(1));
+			ce.setDegree(Integer.parseInt(distanceMatcher.group(2)));
+		}
+		ce.setAlgorithm("semiL");
+	}
+
+	/**
+	 * Read a label file and pair each line with an instance id: line i of the
+	 * file supplies the class id for <code>instanceIds.get(i)</code>.
+	 * 
+	 * @param labelFile
+	 *            contains class ids for each instance. first token of each line
+	 *            is the class id.
+	 * @param instanceIds
+	 *            instance ids corresponding to lines
+	 * @return map of instance id to the class id parsed from the matching line
+	 * @throws IOException if the label file cannot be read
+	 */
+	Map<Integer, Integer> getInstanceIdClass(String labelFile,
+			List<Integer> instanceIds) throws IOException {
+		Map<Integer, Integer> classByInstanceId = new HashMap<Integer, Integer>(instanceIds.size());
+		BufferedReader labelReader = null;
+		try {
+			labelReader = new BufferedReader(new FileReader(labelFile));
+			int lineIndex = 0;
+			for (String line = labelReader.readLine(); line != null; line = labelReader.readLine()) {
+				Integer instanceId = instanceIds.get(lineIndex++);
+				int classId = Integer.parseInt(extractFirstToken(line, wsPattern));
+				classByInstanceId.put(instanceId, classId);
+			}
+		} finally {
+			if (labelReader != null) {
+				try {
+					labelReader.close();
+				} catch (Exception e) {
+					log.error(labelFile, e);
+				}
+			}
+		}
+		return classByInstanceId;
+	}
+
+	/** Command line entry point; expects the data directory and the output directory as arguments. */
+	public static void main(String args[]) throws IOException {
+		if (args.length < 2) {
+			System.out.println("Usage: java "
+					+ SemiLEvaluationParser.class.getName()
+					+ " dataDir outputDir");
+		} else {
+			BaseClassifierEvaluationParser parser = KernelContextHolder
+					.getApplicationContext().getBean(SemiLEvaluationParser.class);
+			parser.parseDirectory(new File(args[0]), new File(args[1]));
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLFormatterFactory.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLFormatterFactory.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLFormatterFactory.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/semil/SemiLFormatterFactory.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,437 @@
+package org.apache.ctakes.ytex.semil;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+
+import org.apache.ctakes.ytex.kernel.FileUtil;
+import org.apache.ctakes.ytex.kernel.InstanceData;
+import org.apache.ctakes.ytex.kernel.KernelUtil;
+import org.apache.ctakes.ytex.kernel.SparseData;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatter;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatterFactory;
+import org.apache.ctakes.ytex.svmlight.SVMLightFormatterFactory.SVMLightFormatter;
+import org.apache.ctakes.ytex.weka.WekaFormatterFactory.WekaFormatter;
+
+
+/**
+ * Export data for use with SemiL. I would have liked to have computed the
+ * distance using the COLT library; however this was far too slow.
+ * 
+ * Produce following files:
+ * <ul>
+ * <li>[scope]_data.txt - sparse data file. Can be converted into distance
+ * matrix for SemiL using R or Matlab. R script provided. If you want to use
+ * semiL to generate a distance matrix, use the libsvm formatter.
+ * <li>[fold]_label.txt - semiL label file, one for each fold. class labels
+ * corresponding to rows. test data automatically unlabeled. The same
+ * data/distance matrix can be used with different label files - the rows across
+ * folds refer to the same instance ids. What differs is the labels for the test
+ * instances (0 for test instance's fold).
+ * <li>[fold]_class.txt - contains instance ids and target class ids for each
+ * fold. matrix with 3 columns: instance id, train/test, target class id. Used
+ * by SemiLEvaluationParser to evaluate SemiL predictions.
+ * </ul>
+ * 
+ * @author vhacongarlav
+ * 
+ */
+public class SemiLFormatterFactory implements SparseDataFormatterFactory {
+	
+	public static class SemiLDataFormatter extends SVMLightFormatter {
+		protected InstanceData instanceLabel = null;
+		// formats with up to 6 fraction digits; only referenced by commented-out code in this class
+		NumberFormat semilNumberFormat = new DecimalFormat("#.######");
+		public SemiLDataFormatter(KernelUtil kernelUtil) {
+			super(kernelUtil);
+		}
+
+		// /**
+		// * cosine distance: <tt>1-aa'/sqrt(aa' * bb')</tt>
+		// */
+		// public static Statistic.VectorVectorFunction COSINE = new
+		// VectorVectorFunction() {
+		// DoubleDoubleFunction fun = new DoubleDoubleFunction() {
+		// public final double apply(double a, double b) {
+		// return Math.abs(a - b) / Math.abs(a + b);
+		// }
+		// };
+		//
+		// public final double apply(DoubleMatrix1D a, DoubleMatrix1D b) {
+		// double ab = a.zDotProduct(b);
+		// double sqrt_ab = Math.sqrt(a.zDotProduct(a) * b.zDotProduct(b));
+		// return 1 - ab / sqrt_ab;
+		// }
+		// };
+
+		/**
+		 * Export the attribute names and then the sparse matrix; the matrix is
+		 * written to the scoped <tt>data.txt</tt> file under outdir.
+		 * 
+		 * @param sparseData data to write
+		 * @param label label scope of the file name; callers in this class may pass null
+		 * @param run run scope of the file name; may be null
+		 * @param fold fold scope of the file name; may be null
+		 * @throws IOException
+		 */
+		protected void exportData(SparseData sparseData, String label,
+				Integer run, Integer fold) throws IOException {
+			exportAttributeNames(sparseData, label, run, fold);
+			// the file name depends on which of label/run/fold are given
+			exportSparseMatrix(FileUtil.getScopedFileName(outdir, label, run,
+					fold, "data.txt"), sparseData);
+		}
+
+		@Override
+		public void exportFold(SparseData sparseData,
+				SortedMap<Long, String> instanceClassMap, boolean train,
+				String label, Integer run, Integer fold) throws IOException {
+			// intentionally empty: this formatter writes its per-fold label and class files in initializeFold
+		}
+
+		/**
+		 * Write the semiL 'label' file: one class id per line, in the iteration
+		 * order of the given map. Training instances carry their class id (label
+		 * != 0); test instances and unlabeled data are written as 0. The line
+		 * order must correspond to the order of the data file.
+		 * 
+		 * @param lblFilename
+		 *            filename to write to
+		 * @param mapInstanceIdToClass
+		 *            sorted map of instance id to class. this must correspond
+		 *            to the order in the data file
+		 * @throws IOException
+		 *             if the file cannot be opened or written
+		 */
+		protected void exportLabel(String lblFilename,
+				SortedMap<Long, Integer> mapInstanceIdToClass)
+				throws IOException {
+			BufferedWriter labelWriter = null;
+			try {
+				labelWriter = new BufferedWriter(new FileWriter(lblFilename));
+				// only the class ids are written; values() follows the key order
+				for (Integer classId : mapInstanceIdToClass.values()) {
+					labelWriter.write(Integer.toString(classId));
+					labelWriter.write("\n");
+				}
+			} finally {
+				if (labelWriter != null)
+					try {
+						labelWriter.close();
+					} catch (Exception ignore) {
+					}
+			}
+		}
+
+		/**
+		 * Walk the instance ids in order and, for each one, work out the class id
+		 * semiL will be trained with and write one line of the [prefix]class.txt
+		 * file.
+		 * <ul>
+		 * <li>Training class id: the index of the instance's training label, or 0
+		 * (unlabeled) when the instance is not in the training set. Test
+		 * instances therefore stay unlabeled unless they are also in the training
+		 * set.
+		 * <li>class.txt line: instance id, train flag (1 if in the training set,
+		 * else 0) and class name, tab separated. The class name is the test set's
+		 * when the instance is in the test set, otherwise the training set's,
+		 * otherwise "0".
+		 * </ul>
+		 * 
+		 * @param idFilename
+		 *            file to write instance id\ttrain/test flag\ttarget class to
+		 * @param trainInstanceClassMap
+		 *            instances for training
+		 * @param testInstanceClassMap
+		 *            instances for testing; may be null
+		 * @param classToIndexMap
+		 *            map of class to class ids
+		 * @param instanceIds
+		 *            sorted set of instance ids; the order with which class.txt
+		 *            will be written, and the order with which instances will
+		 *            appear in the training data file.
+		 * @return map of instance id to class id for training
+		 * @throws IOException
+		 */
+		protected SortedMap<Long, Integer> getTrainingClassMap(
+				String idFilename,
+				SortedMap<Long, String> trainInstanceClassMap,
+				SortedMap<Long, String> testInstanceClassMap,
+				Map<String, Integer> classToIndexMap,
+				SortedSet<Long> instanceIds) throws IOException {
+			SortedMap<Long, Integer> trainClassIdByInstance = new TreeMap<Long, Integer>();
+			BufferedWriter classWriter = null;
+			try {
+				classWriter = new BufferedWriter(new FileWriter(idFilename));
+				for (Long instanceId : instanceIds) {
+					boolean inTrainSet = trainInstanceClassMap.containsKey(instanceId);
+					// unlabeled unless the training set says otherwise
+					int trainClassId = 0;
+					String className = "0";
+					if (inTrainSet) {
+						className = trainInstanceClassMap.get(instanceId);
+						trainClassId = classToIndexMap.get(className);
+					}
+					trainClassIdByInstance.put(instanceId, trainClassId);
+					// the gold class from the test set wins in class.txt
+					boolean inTestSet = testInstanceClassMap != null
+							&& testInstanceClassMap.containsKey(instanceId);
+					if (inTestSet) {
+						className = testInstanceClassMap.get(instanceId);
+					}
+					String trainFlag = inTrainSet ? "1" : "0";
+					classWriter.write(Long.toString(instanceId));
+					classWriter.write("\t");
+					classWriter.write(trainFlag);
+					classWriter.write("\t");
+					classWriter.write(className);
+					classWriter.write("\n");
+				}
+			} finally {
+				if (classWriter != null)
+					try {
+						classWriter.close();
+					} catch (Exception ignore) {
+					}
+			}
+			return trainClassIdByInstance;
+		}
+		/**
+		 * write distance up to 6 digit precision. only write distance if &lt;
+		 * 0.999. format: <tt>
+		 * row column dist
+		 * </tt> 1-based indices.
+		 * 
+		 * @todo - 0.999 also for euclidean distance??
+		 * 
+		 * @param data
+		 * @param wData
+		 * @throws IOException
+		 */
+		// private void writeDistanceMatrix(SparseDoubleMatrix2D data,
+		// String filename) throws IOException {
+		// String distanceFuncName = this.exportProperties.getProperty(
+		// "distance", "EUCLID");
+		// Statistic.VectorVectorFunction func = Statistic.EUCLID;
+		// if ("COSINE".equalsIgnoreCase(distanceFuncName)) {
+		// func = COSINE;
+		// }
+		// DoubleMatrix2D dist = Statistic.distance(data, func);
+		// BufferedWriter wData = null;
+		// try {
+		// wData = new BufferedWriter(new FileWriter(filename));
+		// for (int row = 1; row < dist.rows(); row++) {
+		// for (int col = row + 1; col < dist.columns(); col++) {
+		// double d = dist.get(row, col);
+		// if (d < 0.999) {
+		// wData.write(Integer.toString(row + 1));
+		// wData.write("    ");
+		// wData.write(Integer.toString(col + 1));
+		// wData.write("    ");
+		// wData.write(semilNumberFormat.format(round(d, 6)));
+		// wData.write("\n");
+		// }
+		// }
+		// }
+		// } finally {
+		// if (wData != null)
+		// try {
+		// wData.close();
+		// } catch (Exception ignore) {
+		// }
+		// }
+		// }
+
+		// private void exportDistance(SparseData sparseData, String label,
+		// Integer run, Integer fold) throws IOException {
+		// SparseDoubleMatrix2D data = new SparseDoubleMatrix2D(
+		// this.instanceIds.size(), maxAttributeIndex);
+		// int row = 0;
+		// for (Integer instanceId : this.instanceIds) {
+		// // write row to sparse data matrix
+		// // get 'vector'
+		// SortedMap<Integer, Double> instanceValues = getSparseLineValues(
+		// sparseData, numericAttributeMap, nominalAttributeMap,
+		// instanceId);
+		// // write it to the matrix
+		// for (SortedMap.Entry<Integer, Double> instanceValue : instanceValues
+		// .entrySet()) {
+		// // row = instance number
+		// // column = attribute index
+		// // value = value
+		// data.set(row, instanceValue.getKey() - 1,
+		// instanceValue.getValue());
+		// }
+		// // increment row index
+		// row++;
+		// }
+		// String filename = FileUtil.getFoldFilePrefix(outdir, label, run,
+		// fold) + "dist.txt";
+		// this.writeDistanceMatrix(data, filename);
+		// }
+
+		@Override
+		public void initializeExport(InstanceData instanceLabel,
+				Properties properties, SparseData sparseData)
+				throws IOException {
+			super.initializeExport(instanceLabel, properties, sparseData);
+			this.instanceLabel = instanceLabel;
+			if (properties.getProperty(SCOPE) == null
+					|| properties.getProperty(SCOPE).length() == 0) {
+				exportData(sparseData, null, null, null);
+			}
+		}
+
+		/**
+		 * Per-fold export: write the data file if the scope is fold, then the
+		 * class.txt and label.txt files for this fold.
+		 */
+		@Override
+		public void initializeFold(SparseData sparseData, String label,
+				Integer run, Integer fold,
+				SortedMap<Boolean, SortedMap<Long, String>> foldInstanceLabelMap)
+				throws IOException {
+			String scope = this.exportProperties.getProperty(SCOPE);
+			if (SCOPE_FOLD.equals(scope)) {
+				exportData(sparseData, label, run, fold);
+			}
+			String labelFileName = FileUtil.getScopedFileName(outdir, label, run, fold, "label.txt");
+			String classFileName = FileUtil.getScopedFileName(outdir, label, run, fold, "class.txt");
+			// key true holds the training instances, key false the test instances
+			SortedMap<Long, String> trainClasses = foldInstanceLabelMap.get(true);
+			SortedMap<Long, String> testClasses = foldInstanceLabelMap.get(false);
+			Map<String, Integer> classToIndex = this.labelToClassIndexMap.get(label);
+			// writes class.txt and returns the training class id per instance
+			SortedMap<Long, Integer> trainClassIds = getTrainingClassMap(classFileName,
+					trainClasses, testClasses, classToIndex, sparseData.getInstanceIds());
+			exportLabel(labelFileName, trainClassIds);
+		}
+
+//		/**
+//		 * export the data
+//		 * 
+//		 * @param filename
+//		 * @param idFilename
+//		 * @param lblFilename
+//		 * @param bagOfWordsData
+//		 * @param trainInstanceClassMap
+//		 * @param testInstanceClassMap
+//		 * @param classToIndexMap
+//		 * @throws IOException
+//		 */
+//		private void exportLabel(String idFilename, String lblFilename,
+//				SortedMap<Long, String> trainInstanceClassMap,
+//				SortedMap<Long, String> testInstanceClassMap,
+//				Map<String, Integer> classToIndexMap,
+//				SortedSet<Long> instanceIds) throws IOException {
+//			// BufferedWriter wId = null;
+//			BufferedWriter wLabel = null;
+//			SortedMap<Long, Integer> mapInstanceIdToClass = this
+//					.getTrainingClassMap(idFilename, trainInstanceClassMap,
+//							testInstanceClassMap, classToIndexMap, instanceIds);
+//			try {
+//				// wId = new BufferedWriter(new FileWriter(idFilename));
+//				wLabel = new BufferedWriter(new FileWriter(lblFilename));
+//				for (Long instanceId : instanceIds) {
+//					// // for training default to unlabeled
+//					// int classIdTrain = 0;
+//					// if (trainInstanceClassMap.containsKey(instanceId)) {
+//					// // if the instance is in the training set, then use that
+//					// // label
+//					// classIdTrain = classToIndexMap
+//					// .get(trainInstanceClassMap.get(instanceId));
+//					// }
+//					// // check test set for gold class
+//					// int classIdGold = 0;
+//					// if (testInstanceClassMap != null
+//					// && testInstanceClassMap.containsKey(instanceId))
+//					// classIdGold = classToIndexMap.get(testInstanceClassMap
+//					// .get(instanceId));
+//					// else
+//					// classIdGold = classIdTrain;
+//					// // write instance id, if this is in the train set, and
+//					// it's
+//					// // class
+//					// wId.write(Long.toString(instanceId));
+//					// wId.write("\t");
+//					// wId.write(trainInstanceClassMap.containsKey(instanceId) ?
+//					// "1"
+//					// : "0");
+//					// wId.write("\t");
+//					// wId.write(Integer.toString(classIdGold));
+//					// wId.write("\n");
+//					// write label file for semiL
+//					int classIdTrain = mapInstanceIdToClass.get(instanceId);
+//					wLabel.write(Integer.toString(classIdTrain));
+//					wLabel.write("\n");
+//				}
+//			} finally {
+//				// if (wId != null)
+//				// try {
+//				// wId.close();
+//				// } catch (Exception ignore) {
+//				// }
+//				if (wLabel != null)
+//					try {
+//						wLabel.close();
+//					} catch (Exception ignore) {
+//					}
+//			}
+//		}
+
+		@Override
+		public void initializeLabel(String label,
+				SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>> labelInstances,
+				Properties properties, SparseData sparseData) throws IOException {
+			super.initializeLabel(label, labelInstances, properties, sparseData);
+			// with label scope, the data file is exported once per label
+			String scope = this.exportProperties.getProperty(SCOPE);
+			if (SCOPE_LABEL.equals(scope)) {
+				exportData(sparseData, label, null, null);
+			}
+		}
+
+		@Override
+		protected void fillLabelToClassIndexMap(Map<String, SortedSet<String>> labelToClassMap) {
+			kernelUtil.fillLabelToClassToIndexMap(labelToClassMap, this.labelToClassIndexMap); // fill this.labelToClassIndexMap from the label-to-class map
+			updateLabelClassMapTransductive(); // then apply the transductive adjustment (helper is not defined in this class)
+		}		
+		// /**
+		// * round double to specified precision
+		// *
+		// * @param Rval
+		// * @param Rpl
+		// * @return
+		// */
+		// private double round(double Rval, int Rpl) {
+		// double p = (double) Math.pow(10, Rpl);
+		// Rval = Rval * p;
+		// double tmp = Math.round(Rval);
+		// return (double) tmp / p;
+		// }
+
+	}
+
+	private KernelUtil kernelUtil;
+	/** @return a new SemiLDataFormatter built with this factory's KernelUtil */
+	@Override
+	public SparseDataFormatter getFormatter() {
+		return new SemiLDataFormatter(this.getKernelUtil());
+	}	
+	/** @return the KernelUtil handed to every formatter created by this factory */
+	public KernelUtil getKernelUtil() {
+		return kernelUtil;
+	}
+	/** @param kernelUtil the KernelUtil to hand to formatters created by getFormatter */
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporter.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporter.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporter.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporter.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,22 @@
+package org.apache.ctakes.ytex.sparsematrix;
+
+import java.io.IOException;
+
+import org.apache.ctakes.ytex.kernel.InstanceData;
+
+
+public interface InstanceDataExporter {
+	/** Default delimiters, escape string and header flag; InstanceDataExporterImpl passes them from its two-argument overload. */
+	String FIELD_DELIM = "\t";
+	String RECORD_DELIM = "\n";
+	String STRING_ESCAPE = "";
+	boolean INCLUDE_HEADER = false;
+
+	/** Output the instance data to the named file. */
+	void outputInstanceData(InstanceData instanceData, String filename) throws IOException;
+
+	/** Output the instance data to the named file with explicit delimiters, escape string and header flag. */
+	void outputInstanceData(InstanceData instanceData,
+			String filename, String fieldDelim, String recordDelim,
+			String stringEscape, boolean includeHeader) throws IOException;
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporterImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporterImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporterImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/InstanceDataExporterImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,142 @@
+package org.apache.ctakes.ytex.sparsematrix;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.Map;
+
+import org.apache.ctakes.ytex.kernel.InstanceData;
+
+
+/**
+ * Writes the instance data as delimited text, one record per instance.
+ * <p>
+ * Every record starts with the instance id and the instance's class. The
+ * label, run, fold and train/test flag follow, each only when it carries
+ * information for that record: a non-empty label, a run greater than 0, a
+ * fold greater than 0, and a fold that holds more than one train/test
+ * partition.
+ * 
+ * @author vijay
+ * 
+ */
+public class InstanceDataExporterImpl implements InstanceDataExporter {
+	/**
+	 * Export with the default delimiters, escape and header setting declared
+	 * on {@link InstanceDataExporter}.
+	 * 
+	 * @see org.apache.ctakes.ytex.sparsematrix.InstanceDataExporter#outputInstanceData(org.apache.ctakes.ytex.kernel.InstanceData,
+	 *      java.lang.String)
+	 */
+	@Override
+	public void outputInstanceData(InstanceData instanceData, String filename)
+			throws IOException {
+		this.outputInstanceData(instanceData, filename,
+				InstanceDataExporter.FIELD_DELIM,
+				InstanceDataExporter.RECORD_DELIM,
+				InstanceDataExporter.STRING_ESCAPE,
+				InstanceDataExporter.INCLUDE_HEADER);
+	}
+
+	/**
+	 * Export with explicit formatting options.
+	 * <p>
+	 * All records are first rendered into memory, because the optional header
+	 * can only be written once it is known which optional columns occurred.
+	 * The output file is opened only after that pass has completed.
+	 * 
+	 * @param instanceData
+	 *            source of the label / run / fold / train-test / instance map
+	 * @param filename
+	 *            file to create
+	 * @param fieldDelim
+	 *            written between fields
+	 * @param recordDelim
+	 *            written after every record, including the header
+	 * @param stringEscape
+	 *            written before and after the class, the label and the header
+	 *            column names
+	 * @param includeHeader
+	 *            if true, a record of column names precedes the data
+	 * @throws IOException
+	 *             if writing the file fails
+	 * @see org.apache.ctakes.ytex.sparsematrix.InstanceDataExporter#outputInstanceData(org.apache.ctakes.ytex.kernel.InstanceData,
+	 *      java.lang.String, java.lang.String, java.lang.String,
+	 *      java.lang.String, boolean)
+	 */
+	@Override
+	public void outputInstanceData(InstanceData instanceData, String filename,
+			String fieldDelim, String recordDelim, String stringEscape,
+			boolean includeHeader) throws IOException {
+		// records rendered in memory; flushed to the file after the header
+		StringWriter records = new StringWriter();
+		// which optional columns showed up in at least one record
+		boolean sawLabel = false;
+		boolean sawRun = false;
+		boolean sawFold = false;
+		boolean sawTrain = false;
+		for (String label : instanceData.getLabelToInstanceMap().keySet()) {
+			for (int run : instanceData.getLabelToInstanceMap().get(label)
+					.keySet()) {
+				for (int fold : instanceData.getLabelToInstanceMap()
+						.get(label).get(run).keySet()) {
+					Map<Boolean, ? extends Map<Long, String>> partitions = instanceData
+							.getLabelToInstanceMap().get(label).get(run)
+							.get(fold);
+					// the flag is only meaningful when the fold has both a
+					// training and a test partition
+					boolean hasTrainTestSplit = partitions.size() > 1;
+					for (boolean train : partitions.keySet()) {
+						for (Map.Entry<Long, String> instance : partitions
+								.get(train).entrySet()) {
+							// mandatory columns: instance id, class
+							records.write(Long.toString(instance.getKey()));
+							writeDelimitedQuoted(instance.getValue(),
+									fieldDelim, stringEscape, records);
+							// optional column: label
+							if (label.length() > 0) {
+								sawLabel = true;
+								writeDelimitedQuoted(label, fieldDelim,
+										stringEscape, records);
+							}
+							// optional column: run
+							if (run > 0) {
+								sawRun = true;
+								records.write(fieldDelim);
+								records.write(Integer.toString(run));
+							}
+							// optional column: fold
+							if (fold > 0) {
+								sawFold = true;
+								records.write(fieldDelim);
+								records.write(Integer.toString(fold));
+							}
+							// optional column: train (1) / test (0) flag
+							if (hasTrainTestSplit) {
+								sawTrain = true;
+								records.write(fieldDelim);
+								if (train) {
+									records.write("1");
+								} else {
+									records.write("0");
+								}
+							}
+							records.write(recordDelim);
+						}
+					}
+				}
+			}
+		}
+		BufferedWriter out = null;
+		try {
+			out = new BufferedWriter(new FileWriter(filename));
+			if (includeHeader) {
+				// column names, in the same order as the record fields
+				writeQuoted("instance_id", stringEscape, out);
+				writeDelimitedQuoted("class", fieldDelim, stringEscape, out);
+				if (sawLabel) {
+					writeDelimitedQuoted("label", fieldDelim, stringEscape,
+							out);
+				}
+				if (sawRun) {
+					writeDelimitedQuoted("run", fieldDelim, stringEscape, out);
+				}
+				if (sawFold) {
+					writeDelimitedQuoted("fold", fieldDelim, stringEscape,
+							out);
+				}
+				if (sawTrain) {
+					writeDelimitedQuoted("train", fieldDelim, stringEscape,
+							out);
+				}
+				out.write(recordDelim);
+			}
+			// the records rendered above
+			out.write(records.toString());
+		} finally {
+			if (out != null) {
+				out.close();
+			}
+		}
+
+	}
+
+	/**
+	 * write the field delimiter followed by the value wrapped in the escape
+	 */
+	private void writeDelimitedQuoted(String value, String fieldDelim,
+			String stringEscape, Writer target) throws IOException {
+		target.write(fieldDelim);
+		writeQuoted(value, stringEscape, target);
+	}
+
+	/**
+	 * write the value with the escape before and after it
+	 */
+	private void writeQuoted(String value, String stringEscape, Writer target)
+			throws IOException {
+		target.write(stringEscape);
+		target.write(value);
+		target.write(stringEscape);
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/SparseMatrixFormatterFactory.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/SparseMatrixFormatterFactory.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/SparseMatrixFormatterFactory.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/sparsematrix/SparseMatrixFormatterFactory.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,200 @@
+package org.apache.ctakes.ytex.sparsematrix;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.util.Map;
+import java.util.Properties;
+import java.util.SortedMap;
+
+import org.apache.ctakes.ytex.kernel.BaseSparseDataFormatter;
+import org.apache.ctakes.ytex.kernel.FileUtil;
+import org.apache.ctakes.ytex.kernel.InstanceData;
+import org.apache.ctakes.ytex.kernel.KernelUtil;
+import org.apache.ctakes.ytex.kernel.SparseData;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatter;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatterFactory;
+import org.apache.ctakes.ytex.libsvm.LibSVMFormatterFactory.LibSVMFormatter;
+
+
+/**
+ * Factory for a formatter that exports the sparse data matrix to data.txt.
+ * The export can be scoped to label or fold. instance_id is added as the
+ * first column of the data matrix. Instance data (class, label, run, fold) is
+ * exported to instance.txt.
+ * 
+ * @todo add options to control instance.txt output format.
+ * @see BaseSparseDataFormatter#exportSparseMatrix
+ * @author vijay
+ */
+public class SparseMatrixFormatterFactory implements SparseDataFormatterFactory {
+
+	/** handed to every formatter; used there to write instance.txt */
+	InstanceDataExporter instanceDataExporter;
+	/** handed to every formatter; passed on to the LibSVMFormatter constructor */
+	KernelUtil kernelUtil;
+
+	public InstanceDataExporter getInstanceDataExporter() {
+		return this.instanceDataExporter;
+	}
+
+	public void setInstanceDataExporter(
+			InstanceDataExporter instanceDataExporter) {
+		this.instanceDataExporter = instanceDataExporter;
+	}
+
+	public KernelUtil getKernelUtil() {
+		return this.kernelUtil;
+	}
+
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * @return a new formatter wired with this factory's exporter and kernel
+	 *         util
+	 */
+	@Override
+	public SparseDataFormatter getFormatter() {
+		return new SparseMatrixDataFormatter(getInstanceDataExporter(),
+				getKernelUtil());
+	}
+
+	/**
+	 * Formatter that writes instance.txt once per export and a data.txt per
+	 * scope (whole export, label, or fold, depending on the SCOPE property).
+	 */
+	public static class SparseMatrixDataFormatter extends LibSVMFormatter {
+		InstanceData instanceLabel;
+		InstanceDataExporter instanceDataExporter;
+
+		public SparseMatrixDataFormatter(
+				InstanceDataExporter instanceDataExporter, KernelUtil kernelUtil) {
+			super(kernelUtil);
+			this.instanceDataExporter = instanceDataExporter;
+		}
+
+		/**
+		 * Run the superclass initialization, write instance.txt into the
+		 * output directory, and - when no scope is configured - export the
+		 * matrix once for the whole data set.
+		 */
+		@Override
+		public void initializeExport(InstanceData instanceLabel,
+				Properties properties, SparseData sparseData)
+				throws IOException {
+			super.initializeExport(instanceLabel, properties, sparseData);
+			this.instanceLabel = instanceLabel;
+			String instanceFile = FileUtil.addFilenameToDir(outdir,
+					"instance.txt");
+			instanceDataExporter
+					.outputInstanceData(instanceLabel, instanceFile);
+			String scope = properties.getProperty(SCOPE);
+			boolean unscoped = scope == null || scope.length() == 0;
+			if (unscoped) {
+				exportSparseMatrix(sparseData, null, null, null);
+			}
+		}
+
+		/**
+		 * Export the matrix for this label if the export is scoped by label.
+		 * The superclass implementation is deliberately not invoked.
+		 */
+		@Override
+		public void initializeLabel(
+				String label,
+				SortedMap<Integer, SortedMap<Integer, SortedMap<Boolean, SortedMap<Long, String>>>> labelInstances,
+				Properties properties, SparseData sparseData)
+				throws IOException {
+			String scope = this.exportProperties.getProperty(SCOPE);
+			if (!SCOPE_LABEL.equals(scope)) {
+				return;
+			}
+			exportSparseMatrix(sparseData, label, null, null);
+		}
+
+		/**
+		 * Export the matrix for this fold if the export is scoped by fold.
+		 */
+		@Override
+		public void initializeFold(SparseData sparseData, String label,
+				Integer run, Integer fold,
+				SortedMap<Boolean, SortedMap<Long, String>> foldInstanceLabelMap)
+				throws IOException {
+			String scope = this.exportProperties.getProperty(SCOPE);
+			if (!SCOPE_FOLD.equals(scope)) {
+				return;
+			}
+			exportSparseMatrix(sparseData, label, run, fold);
+		}
+
+		/**
+		 * Intentionally empty: the matrix is written by the initialize*
+		 * callbacks, not per train/test set.
+		 */
+		@Override
+		public void exportFold(SparseData sparseData,
+				SortedMap<Long, String> instanceClassMap, boolean train,
+				String label, Integer run, Integer fold) throws IOException {
+			// nothing to export here
+		}
+
+		/**
+		 * export the attribute names, then the matrix itself to the scoped
+		 * data.txt
+		 */
+		private void exportSparseMatrix(SparseData sparseData, String label,
+				Integer run, Integer fold) throws IOException {
+			exportAttributeNames(sparseData, label, run, fold);
+			String dataFile = FileUtil.getScopedFileName(outdir, label, run,
+					fold, "data.txt");
+			this.exportSparseMatrix(dataFile, sparseData);
+		}
+
+		/**
+		 * Register instance_id as a numeric attribute before the superclass
+		 * exports the remaining attribute names.
+		 */
+		protected int exportAttributeNames(BufferedWriter w,
+				SparseData sparseData) throws IOException {
+			super.addNumericAttribute(w, ATTR_INSTANCE_ID);
+			int exported = super.exportAttributeNames(w, sparseData);
+			return exported;
+		}
+
+		/**
+		 * Take the superclass' sparse line and add the instance id as the
+		 * value of the instance_id attribute. The column index is looked up in
+		 * this formatter's own numeric attribute map, not in the map passed
+		 * as parameter.
+		 */
+		@Override
+		protected SortedMap<Integer, Double> getSparseLineValues(
+				SparseData bagOfWordsData,
+				Map<String, Integer> numericAttributeMap,
+				Map<String, Map<String, Integer>> nominalAttributeMap,
+				long instanceId) {
+			SortedMap<Integer, Double> lineValues = super.getSparseLineValues(
+					bagOfWordsData, numericAttributeMap, nominalAttributeMap,
+					instanceId);
+			Integer instanceIdColumn = this.numericAttributeMap
+					.get(ATTR_INSTANCE_ID);
+			double instanceIdValue = (double) instanceId;
+			lineValues.put(instanceIdColumn, instanceIdValue);
+			return lineValues;
+		}
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SGTFormatterFactory.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SGTFormatterFactory.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SGTFormatterFactory.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SGTFormatterFactory.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,74 @@
+package org.apache.ctakes.ytex.svmlight;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.SortedMap;
+
+import org.apache.ctakes.ytex.kernel.FileUtil;
+import org.apache.ctakes.ytex.kernel.KernelUtil;
+import org.apache.ctakes.ytex.kernel.SparseData;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatter;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatterFactory;
+import org.apache.ctakes.ytex.semil.SemiLFormatterFactory.SemiLDataFormatter;
+
+
+/**
+ * For each scope, create a data.txt file. The SGT tools will be used to convert
+ * this into an adjacency graph. For each train/test fold, create a label and
+ * class file. The label file will be used by sgt for prediction, the class file
+ * will be used to parse the results.
+ * 
+ * @author vijay
+ * 
+ */
+public class SGTFormatterFactory implements SparseDataFormatterFactory {
+	/** passed to every formatter this factory creates */
+	private KernelUtil kernelUtil;
+
+	public KernelUtil getKernelUtil() {
+		return this.kernelUtil;
+	}
+
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * @return a new {@link SGTFormatter} using this factory's kernel util
+	 */
+	@Override
+	public SparseDataFormatter getFormatter() {
+		return new SGTFormatter(this.kernelUtil);
+	}
+
+	/**
+	 * Formatter that writes the scoped data.txt with a constant class of 0 on
+	 * every line.
+	 */
+	public static class SGTFormatter extends SemiLDataFormatter {
+		public SGTFormatter(KernelUtil kernelUtil) {
+			super(kernelUtil);
+		}
+
+		/**
+		 * Export the attribute names for the scope, then write one libsvm
+		 * line per instance of the sparse data to the scoped data.txt.
+		 */
+		@Override
+		protected void exportData(SparseData sparseData, String label,
+				Integer run, Integer fold) throws IOException {
+			exportAttributeNames(sparseData, label, run, fold);
+			String dataFile = FileUtil.getScopedFileName(outdir, label, run,
+					fold, "data.txt");
+			BufferedWriter out = null;
+			try {
+				out = new BufferedWriter(new FileWriter(dataFile));
+				for (long instanceId : sparseData.getInstanceIds()) {
+					// sparse attribute index -> value for this instance
+					SortedMap<Integer, Double> lineValues = getSparseLineValues(
+							sparseData, numericAttributeMap,
+							nominalAttributeMap, instanceId);
+					// the class column is a placeholder: sgt reads the
+					// classes from the separate label files
+					out.write("0");
+					// attributes of the instance
+					writeLibsvmLine(out, lineValues);
+				}
+			} finally {
+				if (out != null) {
+					out.close();
+				}
+			}
+		}
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightFormatterFactory.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightFormatterFactory.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightFormatterFactory.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightFormatterFactory.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,236 @@
+package org.apache.ctakes.ytex.svmlight;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+
+import org.apache.ctakes.ytex.kernel.FileUtil;
+import org.apache.ctakes.ytex.kernel.InstanceData;
+import org.apache.ctakes.ytex.kernel.KernelUtil;
+import org.apache.ctakes.ytex.kernel.SparseData;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatter;
+import org.apache.ctakes.ytex.kernel.SparseDataFormatterFactory;
+import org.apache.ctakes.ytex.libsvm.LibSVMFormatterFactory.LibSVMFormatter;
+
+import com.google.common.collect.BiMap;
+import com.google.common.collect.HashBiMap;
+
+
+/**
+ * export for svmlight. Same format as for libsvm with following changes:
+ * <ul>
+ * <li>Only binary classification - classes must be -1 or 1</li>
+ * <li>Transductive classification: test instances 'unlabelled' in training set
+ * (class 0)</li>
+ * </ul>
+ * If a test set is available, will use transductive svm format.
+ */
+public class SVMLightFormatterFactory implements SparseDataFormatterFactory {
+	KernelUtil kernelUtil;
+
+	public KernelUtil getKernelUtil() {
+		return kernelUtil;
+	}
+
+	public void setKernelUtil(KernelUtil kernelUtil) {
+		this.kernelUtil = kernelUtil;
+	}
+
+	/**
+	 * @return a new {@link SVMLightFormatter} using this factory's kernel
+	 *         util
+	 */
+	@Override
+	public SparseDataFormatter getFormatter() {
+		return new SVMLightFormatter(this.getKernelUtil());
+	}
+
+	public static class SVMLightFormatter extends LibSVMFormatter {
+		/**
+		 * train/test instances of the fold currently being exported; set by
+		 * {@link #initializeFold} and reset to null by {@link #clearFold()}
+		 */
+		protected SortedMap<Boolean, SortedMap<Long, String>> foldInstanceLabelMap;
+
+		public SVMLightFormatter(KernelUtil kernelUtil) {
+			super(kernelUtil);
+		}
+
+		/**
+		 * Export the given train/test set. A training set of a fold that has
+		 * both a training and a test partition is exported in transductive
+		 * format (test instances appended as unlabeled); everything else is
+		 * exported by the superclass' exportDataForLabel.
+		 */
+		@Override
+		public void exportFold(SparseData sparseData,
+				SortedMap<Long, String> instanceClassMap, boolean train,
+				String label, Integer run, Integer fold) throws IOException {
+			String filename = FileUtil.getDataFilePrefix(outdir, label, run,
+					fold, train) + "_data.txt";
+			String idFilename = FileUtil.getDataFilePrefix(outdir, label, run,
+					fold, train) + "_id.txt";
+			if (train && this.foldInstanceLabelMap.size() == 2) {
+				// train and test set available - export transductive data
+				exportTransductiveData(filename, idFilename, sparseData,
+						instanceClassMap,
+						this.foldInstanceLabelMap.get(Boolean.FALSE).keySet(),
+						this.labelToClassIndexMap.get(label));
+			} else {
+				// test set, or training set only
+				// 'normal' export
+				super.exportDataForLabel(filename, idFilename, sparseData,
+						instanceClassMap, labelToClassIndexMap.get(label));
+			}
+		}
+
+		/**
+		 * Export data file and id file for transductive learning: first the
+		 * training instances with their classes, then the test instances with
+		 * class "0".
+		 * 
+		 * @param filename
+		 *            data file to write
+		 * @param idFilename
+		 *            instance id file to write
+		 * @param bagOfWordsData
+		 *            sparse data of the instances
+		 * @param trainClassMap
+		 *            training instance id to class name
+		 * @param testInstances
+		 *            ids of the test instances; ids that also occur in
+		 *            trainClassMap are skipped
+		 * @param classToIndexMap
+		 *            class name to class index for the label being exported
+		 * @return instance ids in order they are in the output file
+		 * @throws IOException
+		 *             if writing or closing either file fails
+		 */
+		protected List<Long> exportTransductiveData(String filename,
+				String idFilename, SparseData bagOfWordsData,
+				SortedMap<Long, String> trainClassMap, Set<Long> testInstances,
+				BiMap<String, Integer> classToIndexMap) throws IOException {
+			List<Long> instanceIds = new ArrayList<Long>();
+			BufferedWriter wData = null;
+			BufferedWriter wId = null;
+			try {
+				wData = new BufferedWriter(new FileWriter(filename));
+				wId = new BufferedWriter(new FileWriter(idFilename));
+				instanceIds.addAll(exportDataForInstances(bagOfWordsData,
+						trainClassMap, classToIndexMap, wData, wId));
+				SortedMap<Long, String> testClassMap = new TreeMap<Long, String>();
+				for (Long instanceId : testInstances) {
+					// for sparse datasets may duplicate instances in train/test
+					// set. Don't do that for transductive learning
+					if (!trainClassMap.containsKey(instanceId))
+						testClassMap.put(instanceId, "0");
+				}
+				instanceIds.addAll(exportDataForInstances(bagOfWordsData,
+						testClassMap, classToIndexMap, wData, wId));
+				return instanceIds;
+			} finally {
+				// close the id writer even if closing the data writer throws,
+				// otherwise its file handle would leak
+				try {
+					if (wData != null)
+						wData.close();
+				} finally {
+					if (wId != null)
+						wId.close();
+				}
+			}
+		}
+
+		/**
+		 * Replaces the superclass initialization: remember the export
+		 * properties, create the output directory named by the "outdir"
+		 * property, and build the class index maps via
+		 * {@link #fillLabelToClassIndexMap}. The original note on this method
+		 * reads: add the "0" class for transductive learning - presumably
+		 * done by updateLabelClassMapTransductive(), which is defined outside
+		 * this class (TODO confirm).
+		 */
+		@Override
+		public void initializeExport(InstanceData instanceLabel,
+				Properties properties, SparseData sparseData)
+				throws IOException {
+			this.exportProperties = properties;
+			this.outdir = properties.getProperty("outdir");
+			FileUtil.createOutdir(outdir);
+			fillLabelToClassIndexMap(instanceLabel.getLabelToClassMap());
+		}
+
+		/**
+		 * Build the class name to class index map of every label. Labels with
+		 * exactly two classes map the first class (in sort order) to -1 and
+		 * the last to 1. For all other labels a class name that parses as an
+		 * integer keeps that integer as index; other class names receive
+		 * consecutive indices starting at 1.
+		 * 
+		 * @param labelToClassMap
+		 *            label to the sorted set of its class names
+		 */
+		protected void fillLabelToClassIndexMap(Map<String, SortedSet<String>> labelToClassMap) {
+			for (Map.Entry<String, SortedSet<String>> labelToClass : labelToClassMap.entrySet()) {
+				BiMap<String, Integer> classToIndexMap = HashBiMap.create();
+				labelToClassIndexMap
+						.put(labelToClass.getKey(), classToIndexMap);
+				if (labelToClass.getValue().size() == 2) {
+					// use +1 and -1 for binary classification
+					classToIndexMap.put(labelToClass.getValue().first(), -1);
+					classToIndexMap.put(labelToClass.getValue().last(), 1);
+				} else {
+					int nIndex = 1;
+					for (String className : labelToClass.getValue()) {
+						Integer classNumber = null;
+						try {
+							classNumber = Integer.parseInt(className);
+						} catch (NumberFormatException ignored) {
+							// not a numeric class name - classNumber stays
+							// null and a generated index is used below
+						}
+						if (classNumber == null) {
+							classToIndexMap.put(className, nIndex++);
+						} else {
+							classToIndexMap.put(className, classNumber);
+						}
+					}
+				}
+			}
+			updateLabelClassMapTransductive();
+		}
+
+		/**
+		 * clean up fold specific state
+		 */
+		@Override
+		public void clearFold() {
+			this.numericAttributeMap.clear();
+			this.nominalAttributeMap.clear();
+			this.foldInstanceLabelMap = null;
+		}
+
+		/**
+		 * Run the superclass fold initialization and remember the fold's
+		 * train/test instances for {@link #exportFold}.
+		 */
+		@Override
+		public void initializeFold(SparseData sparseData, String label,
+				Integer run, Integer fold,
+				SortedMap<Boolean, SortedMap<Long, String>> foldInstanceLabelMap)
+				throws IOException {
+			super.initializeFold(sparseData, label, run, fold,
+					foldInstanceLabelMap);
+			this.foldInstanceLabelMap = foldInstanceLabelMap;
+		}
+
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightParser.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightParser.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightParser.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/svmlight/SVMLightParser.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,166 @@
+package org.apache.ctakes.ytex.svmlight;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.Properties;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.ytex.kernel.model.ClassifierEvaluation;
+import org.apache.ctakes.ytex.kernel.model.ClassifierInstanceEvaluation;
+import org.apache.ctakes.ytex.kernel.model.SVMClassifierEvaluation;
+import org.apache.ctakes.ytex.libsvm.LibSVMParser;
+
+
+/**
+ * same as libsvm with following changes:
+ * <ul>
+ * <li>parse output.txt - contains vcdim and number support vectors</li>
+ * </ul>
+ */
+public class SVMLightParser extends LibSVMParser {
+	/** matches the "Number of SV" line; group 1 is the support vector count */
+	static final Pattern psv = Pattern.compile("Number of SV:\\s(\\d+)\\s.*");
+	/** matches the "Estimated VCdim" line; group 1 is the vc dimension */
+	static final Pattern pvc = Pattern
+			.compile("Estimated VCdim of classifier: VCdim<=([\\d\\.]+)");
+
+	/**
+	 * Parse svm-classify input (instance file) and predictions (prediction
+	 * file). instance file has target class id and attributes for each
+	 * instance. predict file has value less than or greater than 0 for each
+	 * instance, corresponding to class ids -1 and +1. The two files are read
+	 * in lockstep until either one ends.
+	 * 
+	 * @param predictionFile
+	 *            file with one prediction value per line
+	 * @param instanceFile
+	 *            file whose first token per line is the target class id
+	 * @param props
+	 *            options; controls whether unlabeled instances (target class
+	 *            id 0) are stored
+	 * @param instanceIdFile
+	 *            file with the instance ids, may be null; for lines without
+	 *            an id the zero-based line number is used as instance id
+	 * @param eval
+	 *            evaluation that receives one result per stored instance
+	 * @throws IOException
+	 *             if a file cannot be read
+	 */
+	@Override
+	protected void parsePredictions(String predictionFile, String instanceFile,
+			Properties props, String instanceIdFile,
+			SVMClassifierEvaluation eval) throws IOException {
+		List<Long> instanceIds = null;
+		if (instanceIdFile != null)
+			instanceIds = parseInstanceIds(instanceIdFile);
+		boolean bStoreUnlabeled = YES.equals(props.getProperty(
+				ParseOption.STORE_UNLABELED.getOptionKey(),
+				ParseOption.STORE_UNLABELED.getDefaultValue()));
+		BufferedReader instanceReader = null;
+		BufferedReader predictionReader = null;
+		try {
+			instanceReader = new BufferedReader(new FileReader(instanceFile));
+			predictionReader = new BufferedReader(
+					new FileReader(predictionFile));
+			String instanceLine = null;
+			String predictionLine = null;
+			int nLine = 0;
+			while (((instanceLine = instanceReader.readLine()) != null)
+					&& ((predictionLine = predictionReader.readLine()) != null)) {
+				// instanceIds stays null when no id file was given - fall
+				// back to the line number instead of dereferencing null
+				long instanceId = (instanceIds != null && instanceIds.size() > nLine) ? instanceIds
+						.get(nLine) : nLine;
+				nLine++;
+				int classIdTarget = Integer.parseInt(extractFirstToken(
+						instanceLine, wsPattern));
+				// only store unlabeled instances if configured to do so
+				if (bStoreUnlabeled || classIdTarget != 0) {
+					ClassifierInstanceEvaluation result = new ClassifierInstanceEvaluation();
+					// target class id is null for unlabeled instances
+					result.setTargetClassId(classIdTarget == 0 ? null
+							: classIdTarget);
+					// stays 0 if the prediction line cannot be parsed
+					int classIdPredicted = 0;
+					try {
+						double dPredict = Double.parseDouble(predictionLine);
+						if (dPredict > 0)
+							classIdPredicted = 1;
+						else
+							classIdPredicted = -1;
+					} catch (NumberFormatException nfe) {
+						System.err.println("error parsing:" + predictionLine);
+						nfe.printStackTrace(System.err);
+					}
+					result.setPredictedClassId(classIdPredicted);
+					result.setInstanceId(instanceId);
+					result.setClassifierEvaluation(eval);
+					eval.getClassifierInstanceEvaluations().put(instanceId,
+							result);
+				}
+			}
+		} finally {
+			// best effort close: a failure to close must not mask the result
+			if (instanceReader != null) {
+				try {
+					instanceReader.close();
+				} catch (Exception e) {
+					e.printStackTrace(System.err);
+				}
+			}
+			if (predictionReader != null) {
+				try {
+					predictionReader.close();
+				} catch (Exception e) {
+					e.printStackTrace(System.err);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Read the number of support vectors and the estimated vc dimension from
+	 * the training output. Example of the relevant lines:
+	 * 
+	 * <pre>
+	 * Number of SV: 133 (including 0 at upper bound)
+	 * L1 loss: loss=0.00000
+	 * Norm of weight vector: |w|=2.09380
+	 * Norm of longest example vector: |x|=16.91153
+	 * Estimated VCdim of classifier: VCdim&lt;=684.90185
+	 * </pre>
+	 * 
+	 * Nothing is done if the file name is null or the file does not exist.
+	 * 
+	 * @param eval
+	 *            evaluation to update
+	 * @param trainOutputFile
+	 *            path of the training output file, may be null
+	 * @throws IOException
+	 *             if reading or closing the file fails
+	 */
+	private void parseTrainOutput(SVMClassifierEvaluation eval,
+			String trainOutputFile) throws IOException {
+		if (trainOutputFile == null)
+			return;
+		BufferedReader r = null;
+		try {
+			r = new BufferedReader(new FileReader(trainOutputFile));
+			String line = null;
+			while ((line = r.readLine()) != null) {
+				Matcher m = psv.matcher(line);
+				if (m.matches())
+					eval.setSupportVectors(Integer.parseInt(m.group(1)));
+				m = pvc.matcher(line);
+				if (m.matches())
+					eval.setVcdim(Double.parseDouble(m.group(1)));
+			}
+		} catch (FileNotFoundException fnfe) {
+			// the training output is optional - ignore a missing file
+		} finally {
+			if (r != null)
+				r.close();
+		}
+	}
+
+	/**
+	 * Run the superclass parsing, mark the algorithm as "svmlight" and parse
+	 * output.txt in the output directory - contains vcdim and number support
+	 * vectors.
+	 */
+	@Override
+	protected void parseResults(File dataDir, File outputDir, String model,
+			String predict, SVMClassifierEvaluation eval, Properties props)
+			throws IOException {
+		super.parseResults(dataDir, outputDir, model, predict, eval, props);
+		eval.setAlgorithm("svmlight");
+		parseTrainOutput(eval, outputDir + File.separator + "output.txt");
+	}
+
+	/**
+	 * store semi supervised results.
+	 */
+	@Override
+	protected void storeResults(File dataDir, Properties props, SVMClassifierEvaluation eval) {
+		this.storeSemiSupervised(props, eval, null);
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDao.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDao.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDao.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDao.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,54 @@
+package org.apache.ctakes.ytex.umls.dao;
+
+import gnu.trove.set.TIntSet;
+import gnu.trove.set.TShortSet;
+
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.ytex.umls.model.UmlsAuiFirstWord;
+
+
+/**
+ * Data access operations on UMLS data.
+ */
+public interface UMLSDao {
+
+	/**
+	 * Pattern to match cuis and parse out their number: the letter C followed
+	 * by exactly seven digits; group 1 captures the digits.
+	 */
+	Pattern cuiPattern = Pattern.compile("\\AC(\\d{7})\\Z");
+
+	/**
+	 * Get aui, str from mrconso.
+	 * 
+	 * @param lastAui
+	 *            presumably restricts the result to auis after this one -
+	 *            TODO confirm against the implementation
+	 * @return rows as object arrays
+	 */
+	List<Object[]> getAllAuiStr(String lastAui);
+
+	/**
+	 * Delete the stored UmlsAuiFirstWord entries (as the name suggests; the
+	 * implementation is not visible from this interface).
+	 */
+	void deleteAuiFirstWord();
+
+	/**
+	 * Store the given UmlsAuiFirstWord entries.
+	 * 
+	 * @param listAuiFirstWord
+	 *            entries to insert
+	 */
+	void insertAuiFirstWord(List<UmlsAuiFirstWord> listAuiFirstWord);
+
+	/**
+	 * Look up names.
+	 * 
+	 * @param subList
+	 *            presumably concept ids to look up - TODO confirm
+	 * @return map of string to string; presumably id to name - TODO confirm
+	 */
+	Map<String, String> getNames(List<String> subList);
+
+	/**
+	 * Get the 'last' UmlsAuiFirstWord. We insert them in ascending order of
+	 * auis.
+	 * 
+	 * @return the last aui
+	 */
+	String getLastAui();
+
+	/**
+	 * Get a set of all cuis in RXNORM. Used for DrugNer - need to set the
+	 * coding scheme to RXNORM. Convert the cui into a numeric representation
+	 * (chop off the preceding 'C') to save memory.
+	 * 
+	 * @return numeric parts of the RXNORM cuis
+	 */
+	TIntSet getRXNORMCuis();
+
+	/**
+	 * @param cui
+	 *            cui to test
+	 * @return presumably true if the cui is an RXNORM cui - TODO confirm
+	 *         against the implementation
+	 */
+	boolean isRXNORMCui(String cui);
+
+}
\ No newline at end of file

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDaoImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDaoImpl.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDaoImpl.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/dao/UMLSDaoImpl.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,155 @@
+package org.apache.ctakes.ytex.umls.dao;
+
+import gnu.trove.set.TIntSet;
+import gnu.trove.set.hash.TIntHashSet;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+
+import javax.sql.DataSource;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.ctakes.ytex.umls.model.UmlsAuiFirstWord;
+import org.hibernate.Query;
+import org.hibernate.SessionFactory;
+import org.springframework.jdbc.core.JdbcTemplate;
+
+
+/**
+ * UMLSDao implementation that runs Hibernate named queries against the
+ * current session of the configured SessionFactory. The named queries
+ * themselves are defined outside of this class.
+ */
+public class UMLSDaoImpl implements UMLSDao {
+
+	public static final String INCLUDE_REL[] = new String[] { "PAR" };
+	public static final String EXCLUDE_RELA[] = new String[] { "inverse_isa" };
+	private static final Log log = LogFactory.getLog(UMLSDaoImpl.class);
+
+	/**
+	 * maximum number of rows returned by a single call to getAllAuiStr
+	 */
+	private static final int AUI_STR_PAGE_SIZE = 10000;
+
+	SessionFactory sessionFactory;
+	private JdbcTemplate t;
+
+	public SessionFactory getSessionFactory() {
+		return sessionFactory;
+	}
+
+	public void setSessionFactory(SessionFactory sessionFactory) {
+		this.sessionFactory = sessionFactory;
+	}
+
+	/**
+	 * wraps the data source in a new JdbcTemplate
+	 */
+	public void setDataSource(DataSource ds) {
+		t = new JdbcTemplate(ds);
+	}
+
+	/**
+	 * @return data source of the JdbcTemplate created by setDataSource
+	 */
+	public DataSource getDataSource() {
+		return t.getDataSource();
+	}
+
+	// /*
+	// * (non-Javadoc)
+	// *
+	// * @see org.apache.ctakes.ytex.umls.dao.UMLSDao#getRelationsForSABs(java.util.Set)
+	// */
+	// public List<Object[]> getRelationsForSABs(String[] sabs) {
+	// Query q = sessionFactory.getCurrentSession().getNamedQuery(
+	// "getRelationsForSABs");
+	// q.setParameterList("sabs", sabs);
+	// // q.setParameterList("rel", INCLUDE_REL);
+	// // q.setParameterList("relaExclude", EXCLUDE_RELA);
+	// return (List<Object[]>) q.list();
+	// }
+
+	// /*
+	// * (non-Javadoc)
+	// *
+	// * @see org.apache.ctakes.ytex.umls.dao.UMLSDao#getAllRelations(java.util.Set)
+	// */
+	// public List<Object[]> getAllRelations() {
+	// Query q = sessionFactory.getCurrentSession().getNamedQuery(
+	// "getAllRelations");
+	// // q.setParameterList("rel", INCLUDE_REL);
+	// // q.setParameterList("relaExclude", EXCLUDE_RELA);
+	// return (List<Object[]>) q.list();
+	// }
+
+	/**
+	 * Fetch one page of rows. Runs the named query getFirstAuiStr when
+	 * lastAui is null; otherwise runs getNextAuiStr with its aui parameter
+	 * set to lastAui. At most AUI_STR_PAGE_SIZE rows are returned per call.
+	 * 
+	 * @param lastAui
+	 *            aui to continue from, or null to start from the beginning
+	 * @return rows produced by the named query
+	 */
+	@SuppressWarnings("unchecked")
+	public List<Object[]> getAllAuiStr(String lastAui) {
+		Query q = null;
+		if (lastAui == null)
+			q = sessionFactory.getCurrentSession().getNamedQuery(
+					"getFirstAuiStr");
+		else {
+			q = sessionFactory.getCurrentSession().getNamedQuery(
+					"getNextAuiStr");
+			q.setString("aui", lastAui);
+		}
+		q.setMaxResults(AUI_STR_PAGE_SIZE);
+		return q.list();
+	}
+
+	/**
+	 * delete all UmlsAuiFirstWord entries
+	 */
+	public void deleteAuiFirstWord() {
+		sessionFactory.getCurrentSession()
+				.createQuery("delete from UmlsAuiFirstWord").executeUpdate();
+	}
+
+	/**
+	 * save every entry of the list in the current session
+	 */
+	public void insertAuiFirstWord(List<UmlsAuiFirstWord> listAuiFirstWord) {
+		for (UmlsAuiFirstWord w : listAuiFirstWord) {
+			sessionFactory.getCurrentSession().save(w);
+		}
+	}
+
+	/**
+	 * Look up a name for each cui. The result of the getCuiMinStr query is
+	 * loaded first; entries returned by getCuiPreferredName then overwrite
+	 * it.
+	 * 
+	 * @param subList
+	 *            cuis to look up
+	 * @return map of cui to name; empty if subList is empty
+	 */
+	@Override
+	public Map<String, String> getNames(List<String> subList) {
+		Map<String, String> names = new HashMap<String, String>(subList.size());
+		// nothing to look up - binding an empty list would produce an empty
+		// 'in ()' clause, which many databases reject
+		if (subList.isEmpty())
+			return names;
+		// get the shortest string for the specified cuis
+		updateNames("getCuiMinStr", subList, names);
+		// for those cuis with a preferred name, use it
+		updateNames("getCuiPreferredName", subList, names);
+		return names;
+	}
+
+	/**
+	 * Run the named query with its cuis parameter set to subList and copy
+	 * each returned row into the map: column 0 is the key, column 1 the
+	 * value.
+	 */
+	private void updateNames(String queryName, List<String> subList,
+			Map<String, String> names) {
+		Query q = sessionFactory.getCurrentSession().getNamedQuery(queryName);
+		q.setParameterList("cuis", subList);
+		@SuppressWarnings("unchecked")
+		List<Object[]> listCuiName = q.list();
+		for (Object[] cuiName : listCuiName) {
+			names.put((String) cuiName[0], (String) cuiName[1]);
+		}
+	}
+
+	/**
+	 * @return unique result of the getLastAui named query
+	 */
+	@Override
+	public String getLastAui() {
+		Query q = sessionFactory.getCurrentSession()
+				.getNamedQuery("getLastAui");
+		return (String) q.uniqueResult();
+	}
+
+	/**
+	 * Load the results of the getRXNORMCuis named query. Values that match
+	 * UMLSDao.cuiPattern are added as the number formed by their digits;
+	 * values that do not match are skipped.
+	 */
+	@Override
+	@SuppressWarnings("unchecked")
+	public TIntSet getRXNORMCuis() {
+		TIntSet cuis = new TIntHashSet();
+		for (String cui : (List<String>) sessionFactory.getCurrentSession()
+				.getNamedQuery("getRXNORMCuis").list()) {
+			Matcher m = UMLSDao.cuiPattern.matcher(cui);
+			if (m.find()) {
+				cuis.add(Integer.parseInt(m.group(1)));
+			}
+		}
+		return cuis;
+	}
+
+	/**
+	 * Run the cacheable isRXNORMCui named query for the given cui.
+	 * 
+	 * @return true if the query returns a count greater than 0
+	 */
+	@Override
+	public boolean isRXNORMCui(String cui) {
+		Query q = sessionFactory.getCurrentSession().getNamedQuery(
+				"isRXNORMCui");
+		q.setCacheable(true);
+		q.setString("cui", cui);
+		// read the count as a Number: no failure if the query yields a
+		// numeric type other than Long, or no row at all
+		Number count = (Number) q.uniqueResult();
+		return count != null && count.longValue() > 0;
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRCONSO.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRCONSO.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRCONSO.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRCONSO.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,163 @@
+package org.apache.ctakes.ytex.umls.model;
+
+import java.io.Serializable;
+
+/**
+ * Plain bean with one getter/setter pair per field. Presumably mapped to the
+ * UMLS MRCONSO table (the mapping is not part of this class). Two instances
+ * are equal when their aui values are equal.
+ */
+public class MRCONSO implements Serializable {
+
+	private static final long serialVersionUID = 1L;
+
+	String aui;
+	String code;
+	String cui;
+	int cvf;
+	String ispref;
+	String lat;
+	String lui;
+	String sab;
+	String saui;
+	String scui;
+	String sdui;
+	int srl;
+	String str;
+	String stt;
+	String sui;
+	String suppress;
+	String ts;
+	String tty;
+
+	public MRCONSO() {
+	}
+
+	public String getAui() {
+		return aui;
+	}
+
+	public void setAui(String aui) {
+		this.aui = aui;
+	}
+
+	public String getCode() {
+		return code;
+	}
+
+	public void setCode(String code) {
+		this.code = code;
+	}
+
+	public String getCui() {
+		return cui;
+	}
+
+	public void setCui(String cui) {
+		this.cui = cui;
+	}
+
+	public int getCvf() {
+		return cvf;
+	}
+
+	public void setCvf(int cvf) {
+		this.cvf = cvf;
+	}
+
+	public String getIspref() {
+		return ispref;
+	}
+
+	public void setIspref(String ispref) {
+		this.ispref = ispref;
+	}
+
+	public String getLat() {
+		return lat;
+	}
+
+	public void setLat(String lat) {
+		this.lat = lat;
+	}
+
+	public String getLui() {
+		return lui;
+	}
+
+	public void setLui(String lui) {
+		this.lui = lui;
+	}
+
+	public String getSab() {
+		return sab;
+	}
+
+	public void setSab(String sab) {
+		this.sab = sab;
+	}
+
+	public String getSaui() {
+		return saui;
+	}
+
+	public void setSaui(String saui) {
+		this.saui = saui;
+	}
+
+	public String getScui() {
+		return scui;
+	}
+
+	public void setScui(String scui) {
+		this.scui = scui;
+	}
+
+	public String getSdui() {
+		return sdui;
+	}
+
+	public void setSdui(String sdui) {
+		this.sdui = sdui;
+	}
+
+	public int getSrl() {
+		return srl;
+	}
+
+	public void setSrl(int srl) {
+		this.srl = srl;
+	}
+
+	public String getStr() {
+		return str;
+	}
+
+	public void setStr(String str) {
+		this.str = str;
+	}
+
+	public String getStt() {
+		return stt;
+	}
+
+	public void setStt(String stt) {
+		this.stt = stt;
+	}
+
+	public String getSui() {
+		return sui;
+	}
+
+	public void setSui(String sui) {
+		this.sui = sui;
+	}
+
+	public String getSuppress() {
+		return suppress;
+	}
+
+	public void setSuppress(String suppress) {
+		this.suppress = suppress;
+	}
+
+	public String getTs() {
+		return ts;
+	}
+
+	public void setTs(String ts) {
+		this.ts = ts;
+	}
+
+	public String getTty() {
+		return tty;
+	}
+
+	public void setTty(String tty) {
+		this.tty = tty;
+	}
+
+	/**
+	 * Equal to another MRCONSO of exactly the same class with the same aui;
+	 * two null auis count as equal.
+	 */
+	@Override
+	public boolean equals(Object obj) {
+		if (this == obj)
+			return true;
+		if (obj == null || getClass() != obj.getClass())
+			return false;
+		MRCONSO that = (MRCONSO) obj;
+		return aui == null ? that.aui == null : aui.equals(that.aui);
+	}
+
+	/**
+	 * Hash derived from the aui only, consistent with equals.
+	 */
+	@Override
+	public int hashCode() {
+		return 31 + (aui == null ? 0 : aui.hashCode());
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRREL.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRREL.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRREL.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRREL.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,99 @@
+package org.apache.ctakes.ytex.umls.model;
+
+import java.io.Serializable;
+
+/**
+ * Bean mapped to the umls MRREL table. Two instances are equal when their
+ * rui values are equal.
+ */
+public class MRREL implements Serializable {
+
+	private static final long serialVersionUID = 1L;
+
+	private String cui1;
+	private String cui2;
+	private String rel;
+	private String rela;
+	private String rui;
+	private String sab;
+
+	public String getCui1() {
+		return cui1;
+	}
+
+	public void setCui1(String cui1) {
+		this.cui1 = cui1;
+	}
+
+	public String getCui2() {
+		return cui2;
+	}
+
+	public void setCui2(String cui2) {
+		this.cui2 = cui2;
+	}
+
+	public String getRel() {
+		return rel;
+	}
+
+	public void setRel(String rel) {
+		this.rel = rel;
+	}
+
+	public String getRela() {
+		return rela;
+	}
+
+	public void setRela(String rela) {
+		this.rela = rela;
+	}
+
+	public String getRui() {
+		return rui;
+	}
+
+	public void setRui(String rui) {
+		this.rui = rui;
+	}
+
+	public String getSab() {
+		return sab;
+	}
+
+	public void setSab(String sab) {
+		this.sab = sab;
+	}
+
+	/**
+	 * Equal to another MRREL of exactly the same class with the same rui;
+	 * two null ruis count as equal.
+	 */
+	@Override
+	public boolean equals(Object obj) {
+		if (this == obj)
+			return true;
+		if (obj == null || getClass() != obj.getClass())
+			return false;
+		MRREL that = (MRREL) obj;
+		return rui == null ? that.rui == null : rui.equals(that.rui);
+	}
+
+	/**
+	 * Hash derived from the rui only, consistent with equals.
+	 */
+	@Override
+	public int hashCode() {
+		return 31 + (rui == null ? 0 : rui.hashCode());
+	}
+
+	/**
+	 * Shows cui1, cui2 and rui.
+	 */
+	@Override
+	public String toString() {
+		StringBuilder text = new StringBuilder("MRREL [cui1=");
+		text.append(cui1);
+		text.append(", cui2=").append(cui2);
+		text.append(", rui=").append(rui);
+		text.append("]");
+		return text.toString();
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRSTY.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRSTY.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRSTY.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/MRSTY.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,92 @@
+package org.apache.ctakes.ytex.umls.model;
+
+import java.io.Serializable;
+
+/**
+ * Plain bean with one getter/setter pair per field. Presumably mapped to the
+ * UMLS MRSTY table (the mapping is not part of this class). Two instances
+ * are equal when both their cui and their tui values are equal.
+ */
+public class MRSTY implements Serializable {
+
+	private static final long serialVersionUID = 1L;
+
+	String cui;
+	String tui;
+	String stn;
+	String sty;
+	String atui;
+	int cvf;
+
+	public MRSTY() {
+	}
+
+	public String getCui() {
+		return cui;
+	}
+
+	public void setCui(String cui) {
+		this.cui = cui;
+	}
+
+	public String getTui() {
+		return tui;
+	}
+
+	public void setTui(String tui) {
+		this.tui = tui;
+	}
+
+	public String getStn() {
+		return stn;
+	}
+
+	public void setStn(String stn) {
+		this.stn = stn;
+	}
+
+	public String getSty() {
+		return sty;
+	}
+
+	public void setSty(String sty) {
+		this.sty = sty;
+	}
+
+	public String getAtui() {
+		return atui;
+	}
+
+	public void setAtui(String atui) {
+		this.atui = atui;
+	}
+
+	public int getCvf() {
+		return cvf;
+	}
+
+	public void setCvf(int cvf) {
+		this.cvf = cvf;
+	}
+
+	/**
+	 * Equal to another MRSTY of exactly the same class with the same cui and
+	 * the same tui; null values only match null values.
+	 */
+	@Override
+	public boolean equals(Object obj) {
+		if (this == obj)
+			return true;
+		if (obj == null || getClass() != obj.getClass())
+			return false;
+		MRSTY that = (MRSTY) obj;
+		boolean sameCui = cui == null ? that.cui == null : cui
+				.equals(that.cui);
+		if (!sameCui)
+			return false;
+		return tui == null ? that.tui == null : tui.equals(that.tui);
+	}
+
+	/**
+	 * Hash combining cui and tui, consistent with equals.
+	 */
+	@Override
+	public int hashCode() {
+		int cuiHash = cui == null ? 0 : cui.hashCode();
+		int tuiHash = tui == null ? 0 : tui.hashCode();
+		return 31 * (31 + cuiHash) + tuiHash;
+	}
+
+	/**
+	 * Shows cui and tui.
+	 */
+	@Override
+	public String toString() {
+		StringBuilder text = new StringBuilder("MRSTY [cui=");
+		text.append(cui).append(", tui=").append(tui).append("]");
+		return text.toString();
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/UmlsAuiFirstWord.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/UmlsAuiFirstWord.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/UmlsAuiFirstWord.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/umls/model/UmlsAuiFirstWord.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,71 @@
+package org.apache.ctakes.ytex.umls.model;
+
+import java.io.Serializable;
+
+/**
+ * Plain bean holding an aui together with the fword, fstem, stemmedStr and
+ * tokenizedStr values stored for it.
+ */
+public class UmlsAuiFirstWord implements Serializable {
+
+	private static final long serialVersionUID = 1L;
+
+	String aui;
+
+	String fstem;
+
+	String fword;
+
+	String stemmedStr;
+
+	String tokenizedStr;
+
+	/**
+	 * Create an instance with all fields unset.
+	 */
+	public UmlsAuiFirstWord() {
+	}
+
+	/**
+	 * Create an instance with the given aui and fword; the remaining fields
+	 * stay unset.
+	 */
+	public UmlsAuiFirstWord(String aui, String fword) {
+		this.aui = aui;
+		this.fword = fword;
+	}
+
+	public String getAui() {
+		return aui;
+	}
+
+	public void setAui(String aui) {
+		this.aui = aui;
+	}
+
+	public String getFstem() {
+		return fstem;
+	}
+
+	public void setFstem(String fstem) {
+		this.fstem = fstem;
+	}
+
+	public String getFword() {
+		return fword;
+	}
+
+	public void setFword(String fword) {
+		this.fword = fword;
+	}
+
+	public String getStemmedStr() {
+		return stemmedStr;
+	}
+
+	public void setStemmedStr(String stemmedStr) {
+		this.stemmedStr = stemmedStr;
+	}
+
+	public String getTokenizedStr() {
+		return tokenizedStr;
+	}
+
+	public void setTokenizedStr(String tokenizedStr) {
+		this.tokenizedStr = tokenizedStr;
+	}
+
+	/**
+	 * Shows aui and fword.
+	 */
+	@Override
+	public String toString() {
+		StringBuilder text = new StringBuilder("UmlsAuiFirstWord [aui=");
+		text.append(aui).append(", fword=").append(fword).append("]");
+		return text.toString();
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/DocumentResultInstanceImporter.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/DocumentResultInstanceImporter.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/DocumentResultInstanceImporter.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/DocumentResultInstanceImporter.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,56 @@
+package org.apache.ctakes.ytex.weka;
+
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.hibernate.SessionFactory;
+
+/**
+ * WekaResultInstanceImporter for results keyed by document: the first
+ * attribute of the instance key is parsed as the document id. Storing the
+ * result is currently disabled (see the todo in importInstanceResult), so
+ * this class only validates the key and logs problems.
+ */
+public class DocumentResultInstanceImporter implements
+		WekaResultInstanceImporter {
+	private SessionFactory sessionFactory;
+	private static final Log log = LogFactory
+			.getLog(DocumentResultInstanceImporter.class);
+
+	public SessionFactory getSessionFactory() {
+		return sessionFactory;
+	}
+
+	public void setSessionFactory(SessionFactory sessionFactory) {
+		this.sessionFactory = sessionFactory;
+	}
+
+	/**
+	 * Validate the document id held in the first attribute of the instance
+	 * key. Problems are logged, not thrown: a missing key, an id that is not
+	 * an integer, and an id that is not positive each produce an error log
+	 * entry.
+	 * 
+	 * @param instanceNumber
+	 *            number of the instance, only used in log messages
+	 * @param instanceKey
+	 *            key attributes; the first one must be the document id
+	 * @param task
+	 *            currently unused
+	 * @param classAuto
+	 *            currently unused
+	 * @param classGold
+	 *            currently unused
+	 * @param predictions
+	 *            currently unused
+	 */
+	@Override
+	public void importInstanceResult(Integer instanceNumber,
+			List<String> instanceKey, String task, int classAuto,
+			int classGold, List<Double> predictions) {
+		// without this guard an empty key fails with an
+		// IndexOutOfBoundsException instead of being logged
+		if (instanceKey == null || instanceKey.isEmpty()) {
+			log.error("no attributes in key, instanceNumber: "
+					+ instanceNumber);
+			return;
+		}
+		try {
+			int documentId = Integer.parseInt(instanceKey.get(0));
+			if (documentId <= 0) {
+				log.error("Invalid instance id: " + instanceKey
+						+ ", instanceNumber: " + instanceNumber);
+				return;
+			}
+			// todo fix this
+//			Document doc = (Document) this.getSessionFactory()
+//					.getCurrentSession().get(Document.class, documentId);
+//			if (doc != null) {
+//				DocumentClass docClass = new DocumentClass();
+//				docClass.setDocument(doc);
+//				docClass.setClassAuto(classAuto);
+//				docClass.setClassGold(classGold);
+//				docClass.setTask(task);
+//				this.getSessionFactory().getCurrentSession().save(docClass);
+//			} else {
+//				log.error("no document for id: " + documentId);
+//			}
+		} catch (NumberFormatException nfe) {
+			log.error("could not parse document id: " + instanceKey
+					+ ", instanceNumber: " + instanceNumber, nfe);
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportBagOfWords.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportBagOfWords.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportBagOfWords.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportBagOfWords.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,23 @@
+package org.apache.ctakes.ytex.weka;
+
+import java.io.IOException;
+
+import org.apache.ctakes.ytex.kernel.BagOfWordsExporter;
+import org.apache.ctakes.ytex.kernel.KernelContextHolder;
+
+
+/**
+ * Export bag of words using the queries specified in the given property/xml
+ * file. Delegate to BagOfWordsExporter
+ * 
+ * @author vijay
+ * 
+ */
+public class ExportBagOfWords {
+
+	/**
+	 * Look up the wekaBagOfWordsExporter bean and run it on the file named
+	 * by the first argument. Prints a usage message and does nothing else
+	 * when no argument is given.
+	 * 
+	 * @param args
+	 *            args[0] is the property/xml file
+	 * @throws IOException
+	 */
+	public static void main(String args[]) throws IOException {
+		if (args.length < 1) {
+			// same handling as ExportGramMatrix; previously this case
+			// failed with an ArrayIndexOutOfBoundsException
+			System.out
+					.println("usage: java org.apache.ctakes.ytex.weka.ExportBagOfWords <property file>");
+		} else {
+			BagOfWordsExporter exporter = (BagOfWordsExporter) KernelContextHolder
+					.getApplicationContext().getBean("wekaBagOfWordsExporter");
+			exporter.exportBagOfWords(args[0]);
+		}
+	}
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportGramMatrix.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportGramMatrix.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportGramMatrix.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/ExportGramMatrix.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,25 @@
+package org.apache.ctakes.ytex.weka;
+
+import java.io.IOException;
+
+import org.apache.ctakes.ytex.kernel.KernelContextHolder;
+
+
+/**
+ * Command line entry point that exports a gram matrix. Delegates to the
+ * gramMatrixExporter bean.
+ */
+public class ExportGramMatrix {
+
+	/**
+	 * Look up the gramMatrixExporter bean and run it on the file named by
+	 * the first argument. Prints a usage message and does nothing else when
+	 * no argument is given.
+	 * 
+	 * @param args
+	 *            args[0] is the property file
+	 * @throws IOException
+	 */
+	public static void main(String[] args) throws IOException {
+		if (args.length < 1) {
+			// the usage message names this class, which is declared in the
+			// weka package
+			System.out
+					.println("usage: java org.apache.ctakes.ytex.weka.ExportGramMatrix <property file>");
+		} else {
+			GramMatrixExporter g = (GramMatrixExporter) KernelContextHolder
+					.getApplicationContext().getBean("gramMatrixExporter");
+			g.exportGramMatrix(args[0]);
+		}
+	}
+
+}

Added: ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/GramMatrixExporter.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/GramMatrixExporter.java?rev=1551254&view=auto
==============================================================================
--- ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/GramMatrixExporter.java (added)
+++ ctakes/branches/ytex/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/weka/GramMatrixExporter.java Mon Dec 16 16:30:30 2013
@@ -0,0 +1,10 @@
+package org.apache.ctakes.ytex.weka;
+
+import java.io.IOException;
+
+/**
+ * Exports a gram matrix as configured by a property file.
+ */
+public interface GramMatrixExporter {
+
+	/**
+	 * Run the export.
+	 * 
+	 * @param propertyFile
+	 *            name of the property file that configures the export
+	 * @throws IOException
+	 */
+	void exportGramMatrix(String propertyFile) throws IOException;
+
+}
\ No newline at end of file



Mime
View raw message