commons-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jua...@apache.org
Subject cvs commit: jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff Diff.java DiffAlgorithm.java Revision.java SimpleDiff.java DiffAlgorithmBound.java SimpleDiffBound.java
Date Mon, 05 May 2003 22:43:25 GMT
juanco      2003/05/05 15:43:25

  Modified:    jrcs/src/java/org/apache/commons/jrcs/diff Diff.java
                        DiffAlgorithm.java Revision.java SimpleDiff.java
  Removed:     jrcs/src/java/org/apache/commons/jrcs/diff
                        DiffAlgorithmBound.java SimpleDiffBound.java
  Log:
  refactored to simplify implementation of Algorithm pattern
  
  Revision  Changes    Path
  1.8       +25 -23    jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/Diff.java
  
  Index: Diff.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/Diff.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- Diff.java	5 May 2003 21:49:28 -0000	1.7
  +++ Diff.java	5 May 2003 22:43:24 -0000	1.8
  @@ -108,41 +108,43 @@
   
       public static final String NL = System.getProperty("line.separator");
       public static final String RCS_EOL = "\n";
  -    static DiffAlgorithm algorithm = SimpleDiff.getInstance();
   
  -    private Object[] orig;
  -    private DiffAlgorithmBound algorithmImpl;
  +    protected Object[] orig;
  +    protected DiffAlgorithm algorithm;
   
       /**
  -     * create a differencing object using the given algorithm
  +     * Create a differencing object using the default algorithm
        *
  -     * @param o the original text which will be compared against
  -     * @param algorithm the difference algorithm to use.
  +     * @param the original text that will be compared
        */
  -    public Diff(Object[] o, DiffAlgorithm algorithm)
  +    public Diff(Object[] original)
       {
  -        init(o, algorithm);
  +        this(original, null);
       }
   
       /**
  -     * create a differencing object using the default algorithm
  +     * Create a differencing object using the given algorithm
        *
  -     * @param the original text that will be compared
  +     * @param o the original text which will be compared against
  +     * @param algorithm the difference algorithm to use.
        */
  -    public Diff(Object[] o)
  +    public Diff(Object[] original, DiffAlgorithm algorithm)
       {
  -        init(o, algorithm);
  -    }
  -
  -    protected void init(Object[] o, DiffAlgorithm algorithm)
  -    {
  -        if (o == null || algorithm == null)
  +        if (original == null)
           {
               throw new IllegalArgumentException();
           }
   
  -        orig = o;
  -        algorithmImpl = algorithm.createBoundInstance(o);
  +        this.orig = original;
  +        if (algorithm != null)
  +            this.algorithm = algorithm;
  +        else
  +            this.algorithm = defaultAlgorithm();
  +    }
  +
  +    protected DiffAlgorithm defaultAlgorithm()
  +    {
  +        return new SimpleDiff();
       }
   
       /**
  @@ -160,7 +162,7 @@
               throw new IllegalArgumentException();
           }
   
  -        return diff(orig, rev, algorithm);
  +        return diff(orig, rev, null);
       }
   
       /**
  @@ -175,7 +177,7 @@
                                   DiffAlgorithm algorithm)
           throws DifferentiationFailedException
       {
  -        if (orig == null || rev == null || algorithm == null)
  +        if (orig == null || rev == null)
           {
               throw new IllegalArgumentException();
           }
  @@ -192,7 +194,7 @@
       public Revision diff(Object[] rev)
           throws DifferentiationFailedException
       {
  -        return algorithmImpl.diff(rev);
  +        return algorithm.diff(orig, rev);
       }
   
       public static boolean compare(Object[] orig, Object[] rev)
  
  
  
  1.2       +12 -4     jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/DiffAlgorithm.java
  
  Index: DiffAlgorithm.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/DiffAlgorithm.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DiffAlgorithm.java	5 May 2003 21:49:28 -0000	1.1
  +++ DiffAlgorithm.java	5 May 2003 22:43:24 -0000	1.2
  @@ -57,16 +57,24 @@
    * The interface to a difference algorithm.
    *
    * @version $Revision$ $Date$
  + * @author bwm
    *
    * <p>An algorithm is essentially a factory that creates instances of
    * the algorithm that are bound to original text.</p>
  - * @author bwm
    *
    */
   public interface DiffAlgorithm
   {
       /**
  -     * return a new instance of an algorithm bound to some original text
  +     * Compute a {@link org.apache.commons.jrcs.diff#Revision Revision}
  +     * between the original sequence and the revised sequence.
  +     * <p>
  +     * The revision can be used to construct the revised sequence
  +     * from the original sequence.
  +     *
  +     * @param rev the revised text
  +     * @return the revision script.
        */
  -    public DiffAlgorithmBound createBoundInstance(Object[] orig);
  +    public abstract Revision diff(Object[] orig, Object[] rev)
  +        throws DifferentiationFailedException;
   }
  
  
  
  1.4       +15 -1     jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/Revision.java
  
  Index: Revision.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/Revision.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- Revision.java	5 May 2003 21:49:28 -0000	1.3
  +++ Revision.java	5 May 2003 22:43:24 -0000	1.4
  @@ -126,6 +126,20 @@
   
   
       /**
  +     * Adds a delta to the start of this revision.
  +     * @param delta the {@link Delta Delta} to add.
  +     */
  +    public synchronized void insertDelta(Delta delta)
  +    {
  +        if (delta == null)
  +        {
  +            throw new IllegalArgumentException("new delta is null");
  +        }
  +        deltas_.add(0, delta);
  +    }
  +
  +
  +    /**
        * Retrieves a delta from this revision by position.
        * @param i the position of the delta to retrieve.
        * @return the specified delta
  
  
  
  1.2       +241 -18   jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/SimpleDiff.java
  
  Index: SimpleDiff.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/jrcs/src/java/org/apache/commons/jrcs/diff/SimpleDiff.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- SimpleDiff.java	5 May 2003 21:49:28 -0000	1.1
  +++ SimpleDiff.java	5 May 2003 22:43:24 -0000	1.2
  @@ -54,43 +54,266 @@
    * <http://www.apache.org/>.
    */
   
  +import java.util.Collections;
  +import java.util.ArrayList;
  +import java.util.Arrays;
  +import java.util.HashMap;
  +import java.util.HashSet;
  +import java.util.LinkedList;
  +import java.util.List;
  +import java.util.Map;
  +import java.util.Random;
  +import java.util.Set;
  +
  +import org.apache.commons.jrcs.util.ToString;
  +
   /**
  - * The interface to a difference algorithm.
  + * Implements a simple difference algortithm
    *
    * @version $Revision$ $Date$
  + * @author <a href="mailto:juanco@suigeneris.org">Juanco Anez</a>
  + * @see Delta
  + * @see Revision
  + *
  + * <p><b>Overview of Algorithm</b></p>
  + *
  + * <p><i>this description was written by <a
  + * href='http://www.topmeadow.net/bwm'> bwm</a> whilst trying to understand
  + * the algorithm - Juancarlos - feel free to remove  this attribution if
  + * you agree with the content below - its just to absolve you of
  + * responsibility.</i></p>
  + *
  + * <p>The algorithm is optimised for situations where the input sequences
  + * have few repeated objects.  If it is given input with many repeated
  + * objects it may report sub-optimal changes.  However, given appropriate
  + * input, it is fast.</p>
  + *
  + * <p>The algorithm consists of the following steps:</p>
  + * <ul>
  + *   <li>compute a perfect hash function for the input data</li>
  + *   <li>compute the hash function for each element of the orginal
  + *       and revised input sequences</li>
  + *   <li>match the the input sequences to determine the deltas, i.e.
  + *       the differences between the original and revised sequences.</li>
  + * </ul>
  + *
  + * <p>The first step is to compute a perfect hash for the input data.  The
  + * hash function is defined only for objects that are in the original
  + * input sequence and in the revised input sequences:</p>
  + * <pre>
  + *   eq(x) = the index of the first occurence of x in the orig sequence.
  + * </pre>
  + *
  + * <p>With this hash function, the algorithm can compare integers rather
  + * than strings, which is considerably more efficient.</p>
  + *
  + * <p>The second step is to compute the datastructure on which the
  + * algorithm will operate.  Having computed the perfect hash function
  + * in the previous step, we can compute two arrays where
  + * indx[i] = eqs(orig[i]) and jndx[i] = eqs(rev[i]).  The algorithm can
  + * now operate on indx and jndx instead of orig and rev.</p>
  + *
  + * <p>The algorithm now matches indx and jndx.  Whilst indx[i] == jndx[i]
  + * it skips matching objects in the sequence.  In seeking to match objects
  + * in the input sequence it assumes that each object is likely to be unique.
  + * It uses the known characteristics of the unique hash function.  It can
  + * tell from the hash value if this object appeared in the other sequence
  + * at all.  If it did not, there is point in searching for a match.</p>
  + *
  + * <p>Recall that hash function value is the index earliest occurrence in
  + *  the orig sequence.  This information is used to search efficiently for
  + * the next match.  The algorithm is perfect when all input objects are
  + *  unique, but degrades when input objects are not unique.  When input
  + *  objects are not unique an optimal match may not be found, but a
  + * correct match will be.</p>
    *
  - * <p>An algorithm is essentially a factory that creates instances of
  - * the algorithm.  This algorithm uses the singleton pattern to create
  - * the factory.</p>
  + * <p>Having identified common matching objects in the orig and revised
  + * sequences, the differences between them are easily computed.</p>
  + *
  + * Modifications:
  + *
  + * 27/Apr/2003 bwm
  + * Added some comments whilst trying to figure out the algorithm
  + *
  + * 03 May 2003 bwm
  + * Created this implementation class by refactoring it out of the Diff
  + * class to enable plug in difference algorithms
    *
  - * @author bwm
    */
  +
   public class SimpleDiff
       implements DiffAlgorithm
   {
   
  -    private static SimpleDiff instance = new SimpleDiff();
  +    static final int NOT_FOUND_i = -2;
  +    static final int NOT_FOUND_j = -1;
  +    static final int EOS = Integer.MAX_VALUE;
  +
  +    public SimpleDiff()
  +    {
  +    }
  +
  +    protected int scan(int[] ndx, int i, int target)
  +    {
  +        while (ndx[i] < target)
  +        {
  +            i++;
  +        }
  +        return i;
  +    }
  +
  +    /**
  +     * compute the difference between an original and a revision.
  +     *
  +     * @param rev the revision to compare with the original.
  +     * @return a Revision describing the differences
  +     */
  +    public Revision diff(Object[] orig, Object[] rev)
  +        throws DifferentiationFailedException
  +    {
  +        // create map eqs, such that for each item in both orig and rev
  +        // eqs(item) = firstOccurrence(item, orig);
  +        Map eqs = buildEqSet(orig, rev);
  +
  +        // create an array such that
  +        //   indx[i] = NOT_FOUND_i if orig[i] is not in rev
  +        //   indx[i] = firstOccurrence(orig[i], orig)
  +        int[] indx = buildIndex(eqs, orig, NOT_FOUND_i);
  +
  +        // create an array such that
  +        //   jndx[j] = NOT_FOUND_j if orig[j] is not in rev
  +        //   jndx[j] = firstOccurrence(rev[j], orig)
  +        int[] jndx = buildIndex(eqs, rev, NOT_FOUND_j);
  +
  +        // what in effect has been done is to build a unique hash
  +        // for each item that is in both orig and rev
  +        // and to label each item in orig and new with that hash value
  +        // or a marker that the item is not common to both.
  +
  +        eqs = null; // let gc know we're done with this
  +
  +        Revision deltas = new Revision(); //!!! new Revision()
  +        int i = 0;
  +        int j = 0;
  +
  +        // skip matching
  +        // skip leading items that are equal
  +        // could be written
  +        // for (i=0; indx[i] != EOS && indx[i] == jndx[i]; i++);
  +        // j = i;
  +        for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
  +        {
  +            /* void */
  +        }
   
  -    private SimpleDiff()
  -    {}
  +        while (indx[i] != jndx[j])
  +        { // only equal if both == EOS
  +            // they are different
  +            int ia = i;
  +            int ja = j;
  +
  +            // size of this delta
  +            do
  +            {
  +                // look down rev for a match
  +                // stop at a match
  +                // or if the FO(rev[j]) > FO(orig[i])
  +                // or at the end
  +                while (jndx[j] < 0 || jndx[j] < indx[i])
  +                {
  +                    j++;
  +                }
  +                // look down orig for a match
  +                // stop at a match
  +                // or if the FO(orig[i]) > FO(rev[j])
  +                // or at the end
  +                while (indx[i] < 0 || indx[i] < jndx[j])
  +                {
  +                    i++;
  +                }
  +
  +                // this doesn't do a compare each line with each other line
  +                // so it won't find all matching lines
  +            }
  +            while (indx[i] != jndx[j]);
  +
  +            // on exit we have a match
  +
  +            // they are equal, reverse any exedent matches
  +            // it is possible to overshoot, so count back matching items
  +            while (i > ia && j > ja && indx[i - 1] == jndx[j - 1])
  +            {
  +                --i;
  +                --j;
  +            }
  +
  +            deltas.addDelta(Delta.newDelta(new Chunk(orig, ia, i - ia),
  +                                           new Chunk(rev, ja, j - ja)));
  +            // skip matching
  +            for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++)
  +            {
  +                /* void */
  +            }
  +        }
  +        return deltas;
  +    }
   
       /**
  -     * return the factory for this algorithm
  +     * create a <code>Map</code> from each common item in orig and rev
  +     * to the index of its first occurrence in orig
        *
  -     * @return the factory for this algorithm
  +     * @param orig the original sequence of items
  +     * @param rev  the revised sequence of items
        */
  -    public static SimpleDiff getInstance()
  +    protected Map buildEqSet(Object[] orig, Object[] rev)
       {
  -        return instance;
  +        // construct a set of the objects that orig and rev have in common
  +
  +        // first construct a set containing all the elements in orig
  +        Set items = new HashSet(Arrays.asList(orig));
  +
  +        // then remove all those not in rev
  +        items.retainAll(Arrays.asList(rev));
  +
  +        Map eqs = new HashMap();
  +        for (int i = 0; i < orig.length; i++)
  +        {
  +            // if its a common item and hasn't been found before
  +            if (items.contains(orig[i]))
  +            {
  +                // add it to the map
  +                eqs.put(orig[i], new Integer(i));
  +                // and make sure its not considered again
  +                items.remove(orig[i]);
  +            }
  +        }
  +        return eqs;
       }
   
       /**
  -     * return an instance of the algorithm bound to some original text
  +     * build a an array such each a[i] = eqs([i]) or NF if eqs([i]) undefined
        *
  -     * @return an instance of the algorithm
  +     * @param eqs a mapping from Object to Integer
  +     * @param seq a sequence of objects
  +     * @param NF  the not found marker
        */
  -    public DiffAlgorithmBound createBoundInstance(Object[] orig)
  +    protected int[] buildIndex(Map eqs, Object[] seq, int NF)
       {
  -        return new SimpleDiffBound(orig);
  +        int[] result = new int[seq.length + 1];
  +        for (int i = 0; i < seq.length; i++)
  +        {
  +            Integer value = (Integer) eqs.get(seq[i]);
  +            if (value == null || value.intValue() < 0)
  +            {
  +                result[i] = NF;
  +            }
  +            else
  +            {
  +                result[i] = value.intValue();
  +            }
  +        }
  +        result[seq.length] = EOS;
  +        return result;
       }
  -}
  \ No newline at end of file
  +
  +}
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org


Mime
View raw message