Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id E380410BCD for ; Thu, 18 Jul 2013 01:34:39 +0000 (UTC) Received: (qmail 20180 invoked by uid 500); 18 Jul 2013 01:34:39 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 20147 invoked by uid 500); 18 Jul 2013 01:34:39 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 20140 invoked by uid 99); 18 Jul 2013 01:34:39 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 18 Jul 2013 01:34:39 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 18 Jul 2013 01:34:34 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 9D4CC2388980; Thu, 18 Jul 2013 01:34:12 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1504339 - in /ctakes/trunk/ctakes-temporal: pom.xml src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java Date: Thu, 18 Jul 2013 01:34:12 -0000 To: commits@ctakes.apache.org From: stevenbethard@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20130718013412.9D4CC2388980@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: stevenbethard Date: Thu Jul 18 01:34:12 2013 New Revision: 1504339 URL: http://svn.apache.org/r1504339 Log: CTAKES-217: Revises CompareFeatureStructures to use java-diff-utils. 
The search for FeatureStructure equality is the same, but now nested uses of DiffUtils produce what is hopefully better output. In particular, there should now be more useful output for the case where annotations have been inserted or deleted (not just changed). Modified: ctakes/trunk/ctakes-temporal/pom.xml ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java Modified: ctakes/trunk/ctakes-temporal/pom.xml URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/pom.xml?rev=1504339&r1=1504338&r2=1504339&view=diff ============================================================================== --- ctakes/trunk/ctakes-temporal/pom.xml (original) +++ ctakes/trunk/ctakes-temporal/pom.xml Thu Jul 18 01:34:12 2013 @@ -141,6 +141,11 @@ <artifactId>timenorm</artifactId> <version>0.9.0</version> </dependency> + <dependency> + <groupId>com.googlecode.java-diff-utils</groupId> + <artifactId>diffutils</artifactId> + <version>1.3.0</version> + </dependency> Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java?rev=1504339&r1=1504338&r2=1504339&view=diff ============================================================================== --- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java (original) +++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java Thu Jul 18 01:34:12 2013 @@ -2,16 +2,12 @@ package org.apache.ctakes.temporal.data. 
import java.io.File; import java.io.FileInputStream; -import java.util.Arrays; import java.util.Iterator; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import javax.annotation.Nullable; import org.apache.uima.cas.CASException; -import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Feature; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; @@ -24,16 +20,21 @@ import org.uimafit.factory.JCasFactory; import org.uimafit.util.JCasUtil; import com.google.common.base.Function; -import com.google.common.base.Joiner; import com.google.common.base.Objects; -import com.google.common.collect.Iterables; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Iterators; +import com.google.common.collect.ListMultimap; import com.google.common.collect.Lists; -import com.google.common.collect.Multimap; import com.google.common.collect.Ordering; -import com.google.common.collect.TreeMultimap; import com.lexicalscope.jewel.cli.CliFactory; import com.lexicalscope.jewel.cli.Option; +import difflib.Chunk; +import difflib.Delta; +import difflib.Patch; +import difflib.myers.Equalizer; +import difflib.myers.MyersDiff; + public class CompareFeatureStructures { static interface Options { @Option(longName = "dir1") @@ -54,36 +55,75 @@ public class CompareFeatureStructures { for (String annotationClassName : options.getAnnotationClassNames()) { annotationClasses.add(Class.forName(annotationClassName)); } - File dir1 = options.getDirectory1(); - File dir2 = options.getDirectory2(); - if (!Arrays.equals(dir1.list(), dir2.list())) { - System.err.printf("%s and %s contain different files", dir1, dir2); + + MyersDiff stringDiff = new MyersDiff(); + MyersDiff fsDiff = + new MyersDiff(new FeatureStructureEqualizer()); + + File originalDir = options.getDirectory1(); + File revisedDir = options.getDirectory2(); + Patch dirPatch = stringDiff.diff(originalDir.list(), revisedDir.list()); + if 
(!dirPatch.getDeltas().isEmpty()) { + log("--- %s files\n", originalDir); + log("+++ %s files\n", revisedDir); + log(dirPatch); } else { - for (String fileName : dir1.list()) { - System.err.printf("== Checking %s ===\n", fileName); - JCas jCas1 = readXMI(new File(dir1, fileName)); - JCas jCas2 = readXMI(new File(dir2, fileName)); - List viewNames1 = getViewNames(jCas1); - List viewNames2 = getViewNames(jCas2); - if (areEqual("view-names", viewNames1, viewNames2)) { - for (String viewName : viewNames1) { - JCas view1 = jCas1.getView(viewName); - JCas view2 = jCas2.getView(viewName); - for (Class annotationClass : annotationClasses) { - Multimap fsMap1 = toSortedMultimap(view1, annotationClass); - Multimap fsMap2 = toSortedMultimap(view2, annotationClass); - if (areEqual("annotation-counts", fsMap1.keys(), fsMap2.keys())) { - for (Type type : fsMap1.keySet()) { - Iterator fsIter1 = fsMap1.get(type).iterator(); - Iterator fsIter2 = fsMap2.get(type).iterator(); - while (fsIter1.hasNext() && fsIter2.hasNext()) { - FeatureStructure fs1 = fsIter1.next(); - FeatureStructure fs2 = fsIter2.next(); - FSDiff diff = new FSDiff(fs1, fs2); - if (diff.hasDifferences()) { - System.err.println(diff); + for (String fileName : originalDir.list()) { + File originalFile = new File(originalDir, fileName); + File revisedFile = new File(revisedDir, fileName); + JCas originalJCas = readXMI(originalFile); + JCas revisedJCas = readXMI(revisedFile); + List originalViews = getViewNames(originalJCas); + List revisedViews = getViewNames(revisedJCas); + Patch viewsPatch = stringDiff.diff(originalViews, revisedViews); + if (!viewsPatch.getDeltas().isEmpty()) { + log("--- %s views\n", originalFile); + log("+++ %s views\n", revisedFile); + log(viewsPatch); + } else { + for (String viewName : originalViews) { + JCas originalView = originalJCas.getView(viewName); + JCas revisedView = revisedJCas.getView(viewName); + List originalFSes = + toFeatureStructures(originalView, annotationClasses); + List 
revisedFSes = + toFeatureStructures(revisedView, annotationClasses); + Patch fsPatch = fsDiff.diff(originalFSes, revisedFSes); + if (!fsPatch.getDeltas().isEmpty()) { + log("--- %s view %s\n", originalFile, viewName); + log("+++ %s view %s\n", revisedFile, viewName); + for (Delta fsDelta : fsPatch.getDeltas()) { + logHeader(fsDelta); + switch (fsDelta.getType()) { + case DELETE: + case INSERT: + log(fsDelta); + log("=== fsDelta complete ===\n"); + break; + case CHANGE: + List originalLines = toLines(fsDelta.getOriginal().getLines()); + List revisedLines = toLines(fsDelta.getRevised().getLines()); + Patch linesPatch = stringDiff.diff(originalLines, revisedLines); + ListMultimap deletes = ArrayListMultimap.create(); + ListMultimap inserts = ArrayListMultimap.create(); + for (Delta linesDelta : linesPatch.getDeltas()) { + Chunk originalChunk = linesDelta.getOriginal(); + Chunk revisedChunk = linesDelta.getRevised(); + deletes.putAll(originalChunk.getPosition(), originalChunk.getLines()); + inserts.putAll(originalChunk.getPosition(), revisedChunk.getLines()); + } + for (int i = 0; i < originalLines.size(); ++i) { + if (!deletes.containsKey(i) && !inserts.containsKey(i)) { + log(" %s\n", originalLines.get(i)); + } + for (String line : deletes.get(i)) { + log("-%s\n", line); + } + for (String line : inserts.get(i)) { + log("+%s\n", line); } } + break; } } } @@ -94,6 +134,39 @@ public class CompareFeatureStructures { } + private static void log(String message, Object... 
args) { + System.err.printf(message, args); + } + + private static void log(Patch patch) { + for (Delta delta : patch.getDeltas()) { + logHeader(delta); + log(delta); + } + } + + private static void logHeader(Delta delta) { + Chunk original = delta.getOriginal(); + Chunk revised = delta.getRevised(); + log( + "@@ -%d,%d +%d,%d @@\n", + original.getPosition(), + original.size(), + revised.getPosition(), + revised.size()); + } + + private static void log(Delta delta) { + Chunk original = delta.getOriginal(); + Chunk revised = delta.getRevised(); + for (T line : original.getLines()) { + log("-%s\n", line.toString().replaceAll("\n", "\n-")); + } + for (T line : revised.getLines()) { + log("+%s\n", line.toString().replaceAll("\n", "\n+")); + } + } + private static JCas readXMI(File xmiFile) throws Exception { JCas jCas = JCasFactory.createJCas(); FileInputStream inputStream = new FileInputStream(xmiFile); @@ -106,49 +179,39 @@ public class CompareFeatureStructures { } private static List getViewNames(JCas jCas) throws CASException { - List names = Lists.newArrayList(); - Iterator views = jCas.getViewIterator(); - while (views.hasNext()) { - names.add(views.next().getViewName()); + List viewNames = Lists.newArrayList(); + Iterator viewIter = jCas.getViewIterator(); + while (viewIter.hasNext()) { + viewNames.add(viewIter.next().getViewName()); } - return names; + return viewNames; } - private static boolean areEqual(String name, Object o1, Object o2) { - boolean areEqual = Objects.equal(o1, o2); - if (!areEqual) { - System.err.printf("Difference in %s:\n-%s\n+%s\n", name, o1, o2); + private static List toFeatureStructures( + JCas jCas, + List> annotationClasses) { + List fsList = Lists.newArrayList(); + for (Class annotationClass : annotationClasses) { + Type type = JCasUtil.getType(jCas, annotationClass); + Iterators.addAll(fsList, jCas.getFSIndexRepository().getAllIndexedFS(type)); } - return areEqual; + return BY_TYPE_AND_OFFSETS.sortedCopy(fsList); } - private static 
Multimap toSortedMultimap( - JCas jCas, - Class annotationClass) { - Type type = JCasUtil.getType(jCas, annotationClass); - FSIterator fsIterator = jCas.getFSIndexRepository().getAllIndexedFS(type); - Multimap result = TreeMultimap.create(BY_NAME, BY_OFFSETS); - while (fsIterator.hasNext()) { - FeatureStructure fs = fsIterator.next(); - result.put(fs.getType(), fs); - } - return result; - } - - private static final Ordering BY_NAME = Ordering.natural().onResultOf( - new Function() { - @Override - public String apply(@Nullable Type input) { - return input.getName(); - } - }); - - private static final Ordering BY_OFFSETS = - Ordering.natural(). lexicographical().onResultOf( - new Function>() { + private static final Ordering BY_TYPE_AND_OFFSETS = + Ordering.natural().> lexicographical().onResultOf( + new Function>>() { @Override - public Iterable apply(@Nullable FeatureStructure input) { + public Iterable> apply(@Nullable FeatureStructure input) { List offsets = Lists.newArrayList(); + this.findOffsets(input, offsets); + List> result = + Lists.> newArrayList(input.getType().getName()); + result.addAll(Ordering.natural().sortedCopy(offsets)); + return result; + } + + private void findOffsets(FeatureStructure input, List offsets) { if (input != null) { if (input instanceof Annotation) { Annotation annotation = (Annotation) input; @@ -157,156 +220,75 @@ public class CompareFeatureStructures { } else if (input instanceof FSArray) { FSArray fsArray = (FSArray) input; for (int i = 0; i < fsArray.size(); ++i) { - Iterables.addAll(offsets, this.apply(fsArray.get(i))); + this.findOffsets(fsArray.get(i), offsets); } } else if (input instanceof NonEmptyFSList) { NonEmptyFSList fsList = (NonEmptyFSList) input; - Iterables.addAll(offsets, this.apply(fsList.getHead())); - Iterables.addAll(offsets, this.apply(fsList.getTail())); + this.findOffsets(fsList.getHead(), offsets); + this.findOffsets(fsList.getTail(), offsets); } else { for (Feature feature : 
input.getType().getFeatures()) { if (!feature.getRange().isPrimitive()) { - Iterables.addAll(offsets, this.apply(input.getFeatureValue(feature))); + this.findOffsets(input.getFeatureValue(feature), offsets); } } } } - return offsets; } }); - public static class FSDiff { - private List differences; - private FeatureStructure root1, root2; - - public FSDiff(FeatureStructure root1, FeatureStructure root2) { - this.root1 = root1; - this.root2 = root2; - this.differences = Lists.newArrayList(); - this.findDifferences( - this.root1, - this.root2, - Lists. newArrayList(), - Lists. newArrayList()); + public static List toLines(List fsList) { + List lines = Lists.newArrayList(); + for (FeatureStructure fs : fsList) { + for (String line : fs.toString().split("\n")) { + lines.add(line); + } } + return lines; + } - public boolean hasDifferences() { - return !this.differences.isEmpty(); - } + static class FeatureStructureEqualizer implements Equalizer { @Override - public String toString() { - String diff; - if (!this.hasDifferences()) { - diff = ""; - } else { - List paths = Lists.newArrayList(); - for (FSDifference difference : this.differences) { - List featureNames = Lists.newArrayList(); - for (Feature feature : difference.getPath()) { - featureNames.add(feature.getShortName()); - } - paths.add(Joiner.on('/').join(featureNames)); - } - diff = this.root1.toString(); - for (FSDifference difference : this.differences) { - String value1 = difference.getValue1().toString().trim(); - String value2 = difference.getValue2().toString().trim(); - String value1space = value1.replaceAll("\\s+", "\\\\s+"); - Pattern pattern = - Pattern.compile(String.format("^(.*?)(%s)", value1space), Pattern.MULTILINE); - Matcher matcher = pattern.matcher(diff); - StringBuffer buffer = new StringBuffer(); - while (matcher.find()) { - String prefix = matcher.group(1); - String replacement; - // don't re-replace things that have already been taken care of - if (prefix.startsWith("-") || 
prefix.startsWith("+")) { - replacement = matcher.group(); - } - // replace the current text with diff-style +/- text - else { - Matcher indentMatcher = Pattern.compile("^\\s*").matcher(prefix); - indentMatcher.find(); - String indent = indentMatcher.group(); - replacement = - String.format( - "%s%s\n%s%s", - "-" + prefix, - value1.replaceAll("\n", "\n-" + indent), - "+" + prefix, - value2.replaceAll("\n", "\n+" + indent)); - } - matcher.appendReplacement(buffer, replacement); - } - matcher.appendTail(buffer); - diff = buffer.toString(); - } - diff = diff.replaceAll("(?m)^(?![+-])", " "); - diff = String.format("Difference in %s:\n%s", paths, diff); - } - return diff; + public boolean equals(FeatureStructure original, FeatureStructure revised) { + return this.equals(original, revised, Lists. newArrayList()); } - private void findDifferences( - FeatureStructure fs1, - FeatureStructure fs2, - List featurePath, + private boolean equals( + FeatureStructure original, + FeatureStructure revised, List seen) { - if (!seen.contains(fs1) && !seen.contains(fs2)) { - seen.add(fs1); - seen.add(fs2); - for (Feature feature : fs1.getType().getFeatures()) { + if (!seen.contains(original) && !seen.contains(revised)) { + seen.add(original); + seen.add(revised); + for (Feature feature : original.getType().getFeatures()) { if (feature.getName().equals("uima.cas.AnnotationBase:sofa")) { continue; } - List newPath = Lists.newArrayList(featurePath); - newPath.add(feature); if (feature.getRange().isPrimitive()) { - String value1 = fs1.getFeatureValueAsString(feature); - String value2 = fs2.getFeatureValueAsString(feature); - if (!Objects.equal(value1, value2)) { - this.differences.add(new FSDifference(newPath, value1, value2)); + String originalValue = original.getFeatureValueAsString(feature); + String revisedValue = revised.getFeatureValueAsString(feature); + if (!Objects.equal(originalValue, revisedValue)) { + return false; } } else { - FeatureStructure value1 = 
fs1.getFeatureValue(feature); - FeatureStructure value2 = fs2.getFeatureValue(feature); - if (value1 == null - || value2 == null - || !value1.getType().getName().equals(value2.getType().getName())) { - if (!Objects.equal(value1, value2)) { - this.differences.add(new FSDifference(newPath, value1, value2)); + FeatureStructure originalValue = original.getFeatureValue(feature); + FeatureStructure revisedValue = revised.getFeatureValue(feature); + if (originalValue == null + || revisedValue == null + || !originalValue.getType().getName().equals(revisedValue.getType().getName())) { + if (!Objects.equal(originalValue, revisedValue)) { + return false; } } else { - this.findDifferences(value1, value2, newPath, seen); + if (!this.equals(originalValue, revisedValue, seen)) { + return false; + } } } } } - } - } - - public static class FSDifference { - - private List path; - private Object value1, value2; - - public FSDifference(List path, Object value1, Object value2) { - this.path = path; - this.value1 = value1; - this.value2 = value2; - } - - public List getPath() { - return path; - } - - public Object getValue1() { - return value1; - } - - public Object getValue2() { - return value2; + return true; } } }