commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From l..@apache.org
Subject svn commit: r744126 [2/3] - in /commons/proper/math/trunk/src/experimental/org/apache/commons/math/linear: RecursiveLayoutRealMatrix.java RecursiveLayoutRealMatrixTest.java
Date Fri, 13 Feb 2009 14:38:50 GMT

Added: commons/proper/math/trunk/src/experimental/org/apache/commons/math/linear/RecursiveLayoutRealMatrix.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/experimental/org/apache/commons/math/linear/RecursiveLayoutRealMatrix.java?rev=744126&view=auto
==============================================================================
--- commons/proper/math/trunk/src/experimental/org/apache/commons/math/linear/RecursiveLayoutRealMatrix.java (added)
+++ commons/proper/math/trunk/src/experimental/org/apache/commons/math/linear/RecursiveLayoutRealMatrix.java Fri Feb 13 14:38:48 2009
@@ -0,0 +1,2078 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math.linear;
+
+import java.io.Serializable;
+
+import org.apache.commons.math.MathRuntimeException;
+
+/**
+ * Cache-friendly implementation of RealMatrix using recursive array layouts to store
+ * the matrix elements.
+ * <p>
+ * As of 2009-02-13, this implementation does not work! The padding at left and bottom
+ * sides of the matrix should be cleared after some operations like scalerAdd
+ * and is not. Also there is a limitation in the multiplication that can only
+ * process matrices with sizes similar enough to have the same power of two
+ * number of tiles in all three matrices A, B and C such that C = A*B. These
+ * parts have not been fixed since the performance gain with respect to
+ * DenseRealMatrix are not very important, and the numerical stability is not
+ * good. This may well be due to a bad implementation. This code has been put
+ * in the experimental part for the record, putting it into production would
+ * require solving all these issues.
+ * </p>
+ * <p>
+ * This implementation is based on the 2002 paper: <a
+ * href="http://www.cs.duke.edu/~alvy/papers/matrix-tpds.pdf">Recursive Array Layouts
+ * and Fast Matrix Multiplication</a> by Siddhartha Chatterjee, Alvin R. Lebeck,
+ * Praveen K. Patnala and Mithuna Thottethodi.
+ * </p>
+ * <p>
+ * The matrix is split into several rectangular tiles. The tiles are laid out using
+ * a space-filling curve in a 2<sup>k</sup>&times;2<sup>k</sup> square. This
+ * implementation uses the Gray-Morton layout which starts as follows for a three-level
+ * recursion (i.e. an 8x8 matrix). The tiles size are adjusted in order to have the
+ * 2<sup>k</sup>&times;2<sup>k</sup> square. This may require padding at the right and
+ * bottom sides of the matrix (see above paper for a discussion of this padding feature).
+ * </p>
+ * <pre>
+ *                    |
+ *    00 01 | 06 07   |   24  25 | 30  31
+ *    03 02 | 05 04   |   27  26 | 29  28
+ *    ------+------   |   -------+-------
+ *    12 13 | 10 11   |   20  21 | 18  19
+ *    15 14 | 09 08   |   23  22 | 17  16
+ *                    |
+ * -------------------+--------------------
+ *                    |
+ *    48 49 | 54 55   |   40  41 | 46  47
+ *    51 50 | 53 52   |   43  42 | 45  44
+ *    ------+------   |   -------+-------
+ *    60 61 | 58 59   |   36  37 | 34  35
+ *    63 62 | 57 56   |   39  38 | 33  32
+ *                    |
+ * </pre>
+ * @version $Revision$ $Date$
+ * @since 2.0
+ */
+public class RecursiveLayoutRealMatrix extends AbstractRealMatrix implements Serializable {
+    
+    /** Serializable version identifier */
+    private static final long serialVersionUID = 1607919006739190004L;
+
+    /** Maximal allowed tile size in bytes.
+     * <p>In order to avoid cache miss during multiplication,
+     * a suggested value is cache_size/3.</p>
+     */
+    private static final int MAX_TILE_SIZE_BYTES = (64 * 1024) / 3;
+    //private static final int MAX_TILE_SIZE_BYTES = 32;
+
+    /** Storage array for matrix elements. */
+    private final double data[];
+
+    /** Number of rows of the matrix. */
+    private final int rows;
+
+    /** Number of columns of the matrix. */
+    private final int columns;
+
+    /** Number of terminal tiles along rows and columns (guaranteed to be a power of 2). */
+    private final int tileNumber;
+
+    /** Number of rows in each terminal tile. */
+    private final int tileSizeRows;
+
+    /** Number of columns in each terminal tile. */
+    private final int tileSizeColumns;
+
+    /**
+     * Create a new matrix with the supplied row and column dimensions.
+     *
+     * @param rows  the number of rows in the new matrix
+     * @param columns  the number of columns in the new matrix
+     * @throws IllegalArgumentException if row or column dimension is not
+     *  positive
+     */
+    public RecursiveLayoutRealMatrix(final int rows, final int columns)
+        throws IllegalArgumentException {
+
+        super(rows, columns);
+        this.rows    = rows;
+        this.columns = columns;
+
+        // compute optimal layout
+        tileNumber      = tilesNumber(rows, columns);
+        tileSizeRows    = tileSize(rows, tileNumber);
+        tileSizeColumns = tileSize(columns, tileNumber);
+
+        // create storage array
+        data = new double[tileNumber * tileNumber * tileSizeRows * tileSizeColumns];
+
+    }
+
+    /**
+     * Create a new dense matrix copying entries from raw layout data.
+     * <p>The input array <em>must</em> be in raw layout.</p>
+     * <p>Calling this constructor is equivalent to call:
+     * <pre>matrix = new RecursiveLayoutRealMatrix(rawData.length, rawData[0].length,
+     *                                             toRecursiveLayout(rawData), false);</pre>
+     * </p>
+     * @param rawData data for new matrix, in raw layout
+     *
+     * @exception IllegalArgumentException if <code>rawData</code> shape is
+     * inconsistent with tile layout
+     * @see #DenseRealMatrix(int, int, double[][], boolean)
+     */
+    public RecursiveLayoutRealMatrix(final double[][] rawData)
+        throws IllegalArgumentException {
+        this(rawData.length, rawData[0].length, toRecursiveLayout(rawData), false);
+    }
+
+    /**
+     * Create a new dense matrix copying entries from recursive layout data.
+     * <p>The input array <em>must</em> already be in recursive layout.</p>
+     * @param rows  the number of rows in the new matrix
+     * @param columns  the number of columns in the new matrix
+     * @param data data for new matrix, in recursive layout
+     * @param copyArray if true, the input array will be copied, otherwise
+     * it will be referenced
+     *
+     * @exception IllegalArgumentException if <code>data</code> size is
+     * inconsistent with matrix size
+     * @see #toRecursiveLayout(double[][])
+     * @see #RecursiveLayoutRealMatrix(double[][])
+     */
+    public RecursiveLayoutRealMatrix(final int rows, final int columns,
+                                     final double[] data, final boolean copyArray)
+        throws IllegalArgumentException {
+
+        super(rows, columns);
+        this.rows    = rows;
+        this.columns = columns;
+
+        // compute optimal layout
+        tileNumber      = tilesNumber(rows, columns);
+        tileSizeRows    = tileSize(rows, tileNumber);
+        tileSizeColumns = tileSize(columns, tileNumber);
+
+        // create storage array
+        final int expectedLength = tileNumber * tileNumber * tileSizeRows * tileSizeColumns;
+        if (data.length != expectedLength) {
+            throw MathRuntimeException.createIllegalArgumentException("wrong array size (got {0}, expected {1})",
+                                                                      new Object[] {
+                                                                          data.length,
+                                                                          expectedLength
+                                                                      });
+        }
+
+        if (copyArray) {
+            // allocate storage array
+            this.data = data.clone();
+        } else {
+            // reference existing array
+            this.data = data;
+        }
+
+    }
+
+    /**
+     * Convert a data array from raw layout to recursive layout.
+     * <p>
+     * Raw layout is the straightforward layout where element at row i and
+     * column j is in array element <code>rawData[i][j]</code>. Recursive layout
+     * is the layout used in {@link RecursiveLayoutRealMatrix} instances, where the matrix
+     * is stored in a dimension 1 array using a space-filling curve to spread the matrix
+     * elements along the array.
+     * </p>
+     * @param rawData data array in raw layout
+     * @return a new data array containing the same entries but in recursive layout
+     * @exception IllegalArgumentException if <code>rawData</code> is not rectangular
+     *  (not all rows have the same length)
+     * @see #RecursiveLayoutRealMatrix(int, int, double[], boolean)
+     */
+    public static double[] toRecursiveLayout(final double[][] rawData)
+        throws IllegalArgumentException {
+
+        final int rows    = rawData.length;
+        final int columns = rawData[0].length;
+
+        // compute optimal layout
+        final int tileNumber      = tilesNumber(rows, columns);
+        final int tileSizeRows    = tileSize(rows, tileNumber);
+        final int tileSizeColumns = tileSize(columns, tileNumber);
+
+        // safety checks
+        for (int i = 0; i < rawData.length; ++i) {
+            final int length = rawData[i].length;
+            if (length != columns) {
+                throw MathRuntimeException.createIllegalArgumentException(
+                        "some rows have length {0} while others have length {1}",
+                        new Object[] { columns, length }); 
+            }
+        }
+
+        // convert array row after row
+        final double[] data = new double[tileNumber * tileNumber * tileSizeRows * tileSizeColumns];
+        for (int i = 0; i < rawData.length; ++i) {
+            final int iTile = i / tileSizeRows;
+            final double[] rawDataI = rawData[i];
+            for (int jTile = 0; jTile < tileNumber; ++jTile) {
+                final int tileStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns;
+                final int dataStart = tileStart + (i - iTile * tileSizeRows) * tileSizeColumns;
+                final int jStart    = jTile * tileSizeColumns;
+                if (jStart < columns) {
+                    final int jEnd      = Math.min(jStart + tileSizeColumns, columns);
+                    System.arraycopy(rawDataI, jStart, data, dataStart, jEnd - jStart);
+                }
+            }
+        }
+
+        return data;
+
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix createMatrix(final int rowDimension, final int columnDimension)
+        throws IllegalArgumentException {
+        return new RecursiveLayoutRealMatrix(rowDimension, columnDimension);
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix copy() {
+        return new RecursiveLayoutRealMatrix(rows, columns, data, true);
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix add(final RealMatrix m)
+        throws IllegalArgumentException {
+        try {
+            return add((RecursiveLayoutRealMatrix) m);
+        } catch (ClassCastException cce) {
+
+            // safety check
+            checkAdditionCompatible(m);
+
+            final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, columns);
+
+            // perform addition tile-wise, to ensure good cache behavior
+            for (int index = 0; index < tileNumber * tileNumber; ++index) {
+
+                // perform addition on the current tile
+                final int tileStart = index * tileSizeRows * tileSizeColumns;
+                final long indices  = tilesIndices(index);
+                final int iTile     = (int) (indices >> 32);
+                final int jTile     = (int) (indices & 0xffffffff);
+                final int pStart    = iTile * tileSizeRows;
+                final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+                final int qStart    = jTile * tileSizeColumns;
+                final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+                for (int p = pStart; p < pEnd; ++p) {
+                    final int kStart = tileStart + (p - pStart) * tileSizeColumns;
+                    for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                        out.data[k] = data[k] + m.getEntry(p, q);
+                    }
+                }
+
+            }
+
+            return out;
+
+        }
+    }
+
+    /**
+     * Compute the sum of this and <code>m</code>.
+     *
+     * @param m    matrix to be added
+     * @return     this + m
+     * @throws  IllegalArgumentException if m is not the same size as this
+     */
+    public RecursiveLayoutRealMatrix add(final RecursiveLayoutRealMatrix m)
+        throws IllegalArgumentException {
+
+        // safety check
+        checkAdditionCompatible(m);
+
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, columns);
+
+        // streamlined addition
+        for (int i = 0; i < data.length; ++i) {
+            out.data[i] = data[i] + m.data[i];
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix subtract(final RealMatrix m)
+        throws IllegalArgumentException {
+        try {
+            return subtract((RecursiveLayoutRealMatrix) m);
+        } catch (ClassCastException cce) {
+
+            // safety check
+            checkSubtractionCompatible(m);
+
+            final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, columns);
+
+            // perform subtraction tile-wise, to ensure good cache behavior
+            for (int index = 0; index < tileNumber * tileNumber; ++index) {
+
+                // perform addition on the current tile
+                final int tileStart = index * tileSizeRows * tileSizeColumns;
+                final long indices  = tilesIndices(index);
+                final int iTile     = (int) (indices >> 32);
+                final int jTile     = (int) (indices & 0xffffffff);
+                final int pStart    = iTile * tileSizeRows;
+                final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+                final int qStart    = jTile * tileSizeColumns;
+                final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+                for (int p = pStart; p < pEnd; ++p) {
+                    final int kStart = tileStart + (p - pStart) * tileSizeColumns;
+                    for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                        out.data[k] = data[k] - m.getEntry(p, q);
+                    }
+                }
+
+            }
+
+            return out;
+
+        }
+    }
+
+    /**
+     * Compute this minus <code>m</code>.
+     *
+     * @param m    matrix to be subtracted
+     * @return     this - m
+     * @throws  IllegalArgumentException if m is not the same size as this
+     */
+    public RecursiveLayoutRealMatrix subtract(final RecursiveLayoutRealMatrix m)
+        throws IllegalArgumentException {
+
+        // safety check
+        checkSubtractionCompatible(m);
+
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, columns);
+
+        // streamlined subtraction
+        for (int i = 0; i < data.length; ++i) {
+            out.data[i] = data[i] - m.data[i];
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix scalarAdd(final double d)
+        throws IllegalArgumentException {
+
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, columns);
+
+        // streamlined addition
+        for (int i = 0; i < data.length; ++i) {
+            out.data[i] = data[i] + d;
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix scalarMultiply(final double d)
+        throws IllegalArgumentException {
+
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, columns);
+
+        // streamlined multiplication
+        for (int i = 0; i < data.length; ++i) {
+            out.data[i] = data[i] * d;
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix multiply(final RealMatrix m)
+        throws IllegalArgumentException {
+        try {
+            return multiply((RecursiveLayoutRealMatrix) m);
+        } catch (ClassCastException cce) {
+
+            // safety check
+            checkMultiplicationCompatible(m);
+
+            final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, m.getColumnDimension());
+
+            // perform multiplication tile-wise, to ensure good cache behavior
+            for (int index = 0; index < out.tileNumber * out.tileNumber; ++index) {
+                final int tileStart = index * out.tileSizeRows * out.tileSizeColumns;
+                final long indices  = tilesIndices(index);
+                final int iTile     = (int) (indices >> 32);
+                final int jTile     = (int) (indices & 0xffffffff);
+                final int iStart    = iTile * out.tileSizeRows;
+                final int iEnd      = Math.min(iStart + out.tileSizeRows, out.rows);
+                final int jStart    = jTile * out.tileSizeColumns;
+                final int jEnd      = Math.min(jStart + out.tileSizeColumns, out.columns);
+
+                // perform multiplication for current tile
+                for (int kTile = 0; kTile < tileNumber; ++kTile) {
+                    final int kTileStart = tileIndex(iTile, kTile) * tileSizeRows * tileSizeColumns;
+                    for (int i = iStart, lStart = kTileStart, oStart = tileStart;
+                         i < iEnd;
+                         ++i, lStart += tileSizeColumns, oStart += out.tileSizeColumns) {
+                        final int lEnd = Math.min(lStart + tileSizeColumns, columns);
+                        for (int j = jStart, o = oStart; j < jEnd; ++j, ++o) {
+                            double sum = 0;
+                            for (int l = lStart, k = kTile * tileSizeColumns; l < lEnd; ++l, ++k) {
+                                sum += data[l] * m.getEntry(k, j);
+                            }
+                            out.data[o] += sum;
+                        }
+                    }
+                }
+            }
+
+            return out;
+
+        }
+    }
+
+    /**
+     * Returns the result of postmultiplying this by m.
+     * <p>The Strassen matrix multiplication method is used here. This
+     * method computes C = A &times; B recursively by splitting all matrices
+     * in four quadrants and computing:</p>
+     * <pre>
+     * P<sub>1</sub> = (A<sub>1,1</sub> + A<sub>2,2</sub>) &times; (B<sub>1,1</sub> + B<sub>2,2</sub>)
+     * P<sub>2</sub> = (A<sub>2,1</sub> + A<sub>2,2</sub>) &times; (B<sub>1,1</sub>)
+     * P<sub>3</sub> = (A<sub>1,1</sub>) &times; (B<sub>1,2</sub> - B<sub>2,2</sub>)
+     * P<sub>4</sub> = (A<sub>2,2</sub>) &times; (B<sub>2,1</sub> - B<sub>1,1</sub>)
+     * P<sub>5</sub> = (A<sub>1,1</sub> + A<sub>1,2</sub>) &times; B<sub>2,2</sub>
+     * P<sub>6</sub> = (A<sub>2,1</sub> - A<sub>1,1</sub>) &times; (B<sub>1,1</sub> + B<sub>1,2</sub>)
+     * P<sub>7</sub> = (A<sub>1,2</sub> - A<sub>2,2</sub>) &times; (B<sub>2,1</sub> + B<sub>2,2</sub>)
+     *
+     * C<sub>1,1</sub> = P<sub>1</sub> + P<sub>4</sub> - P<sub>5</sub> + P<sub>7</sub>
+     * C<sub>1,2</sub> = P<sub>3</sub> + P<sub>5</sub>
+     * C<sub>2,1</sub> = P<sub>2</sub> + P<sub>4</sub>
+     * C<sub>2,2</sub> = P<sub>1</sub> + P<sub>3</sub> - P<sub>2</sub> + P<sub>6</sub>
+     * </pre>
+     * <p>
+     * This implementation is based on the 2002 paper: <a
+     * href="http://www.cs.duke.edu/~alvy/papers/matrix-tpds.pdf">Recursive Array Layouts
+     * and Fast Matrix Multiplication</a> by Siddhartha Chatterjee, Alvin R. Lebeck,
+     * Praveen K. Patnala and Mithuna Thottethodi.
+     * </p>
+     *
+     * @param m    matrix to postmultiply by
+     * @return     this * m
+     * @throws     IllegalArgumentException
+     *             if columnDimension(this) != rowDimension(m)
+     */
+    public RecursiveLayoutRealMatrix multiply(RecursiveLayoutRealMatrix m)
+        throws IllegalArgumentException {
+
+        // safety check
+        checkMultiplicationCompatible(m);
+
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, m.columns);
+        if ((tileNumber != m.tileNumber) || (tileNumber != out.tileNumber)) {
+            // TODO get rid of this test
+            throw new RuntimeException("multiplication " + rows + "x" + columns + " * " +
+                                       m.rows + "x" + m.columns + " -> left matrix: " + tileNumber +
+                                       " tiles, right matrix: " + m.tileNumber + " tiles, result matrix " +
+                                       out.tileNumber + " tiles");
+        }
+        strassenMultiply(data, 0, true, m.data, 0, true, out.data, 0, tileNumber,
+                         tileSizeRows, m.tileSizeColumns, tileSizeColumns);
+        
+        return out;
+
+    }
+
+    /**
+     * Perform recursive multiplication using Strassen's algorithm.
+     * @param a left term of multiplication
+     * @param aStart start index in a
+     * @param aDirect direct/reversed orientation flag for a
+     * @param b right term of multiplication
+     * @param bStart start index in b
+     * @param bDirect direct/reversed orientation flag for b
+     * @param result result array (will have same orientation as b)
+     * @param resultStart start index in result
+     * @param nTiles number of elements to add
+     * @param bsRows number of rows in result tiles
+     * @param bsColumns number of columns in result tiles
+     * @param bsMultiplicands number of rows/columns in multiplicands
+     */
+    private static void strassenMultiply(final double[] a, final int aStart, final boolean aDirect,
+                                         final double[] b, final int bStart, final boolean bDirect,
+                                         final double[] result, final int resultStart, final int nTiles,
+                                         final int bsRows, final int bsColumns, final int bsMultiplicands) {
+        if (nTiles == 1) {
+            // leaf recursion tile: perform traditional multiplication
+            final int bsColumns2 = 2 * bsColumns;
+            final int bsColumns3 = 3 * bsColumns;
+            final int bsColumns4 = 4 * bsColumns;
+            for (int i = 0; i < bsRows; ++i) {
+                for (int j = 0; j < bsColumns; ++j) {
+                    double sum = 0;
+                    int k  = 0;
+                    int aK = aStart + i * bsMultiplicands;
+                    int bK = bStart + j;
+                    while (k < bsMultiplicands - 3) {
+                        sum += a[aK]     * b[bK] +
+                               a[aK + 1] * b[bK + bsColumns] +
+                               a[aK + 2] * b[bK + bsColumns2] +
+                               a[aK + 3] * b[bK + bsColumns3];
+                        k  += 4;
+                        aK += 4;
+                        bK += bsColumns4;
+                    }
+                    while (k < bsMultiplicands) {
+                        sum += a[aK] * b[bK];
+                        k  += 1;
+                        aK += 1;
+                        bK += bsColumns;
+                    }
+                    result[resultStart + i * bsColumns + j] = sum;
+                }
+            }
+        } else {
+            // regular recursion node: use recursive Strassen implementation
+            final int n2            = nTiles / 2;
+            final int aQuadrantSize = bsRows          * n2 * bsMultiplicands * n2;
+            final int bQuadrantSize = bsMultiplicands * n2 * bsColumns       * n2;
+            final int cQuadrantSize = bsRows          * n2 * bsColumns       * n2;
+            final double[] sA = new double[aQuadrantSize];
+            final double[] sB = new double[bQuadrantSize];
+            final boolean nonLeafQuadrants = n2 > 1;
+
+            // identify A quadrants start indices
+            final int a11Start, a12Start, a21Start, a22Start;
+            if (aDirect) {
+                a11Start = aStart;
+                a12Start = aStart +     aQuadrantSize;
+                a21Start = aStart + 3 * aQuadrantSize;
+                a22Start = aStart + 2 * aQuadrantSize;
+            } else {
+                a11Start = aStart + 2 * aQuadrantSize;
+                a12Start = aStart + 3 * aQuadrantSize;
+                a21Start = aStart +     aQuadrantSize;
+                a22Start = aStart;
+            }
+
+            // identify B and C quadrants start indices
+            // (C is constructed with the same orientation as B)
+            final int b11Start, b12Start, b21Start, b22Start;
+            final int c11Start, c12Start, c21Start, c22Start;
+            if (bDirect) {
+                b11Start = bStart;
+                b12Start = bStart +     bQuadrantSize;
+                b21Start = bStart + 3 * bQuadrantSize;
+                b22Start = bStart + 2 * bQuadrantSize;
+                c11Start = resultStart;
+                c12Start = resultStart +     cQuadrantSize;
+                c21Start = resultStart + 3 * cQuadrantSize;
+                c22Start = resultStart + 2 * cQuadrantSize;
+            } else {
+                b11Start = bStart + 2 * bQuadrantSize;
+                b12Start = bStart + 3 * bQuadrantSize;
+                b21Start = bStart +     bQuadrantSize;
+                b22Start = bStart;
+                c11Start = resultStart + 2 * cQuadrantSize;
+                c12Start = resultStart + 3 * cQuadrantSize;
+                c21Start = resultStart +     cQuadrantSize;
+                c22Start = resultStart;
+            }
+
+            // optimal order for cache efficiency: P3, P6, P2, P1, P5, P7, P4
+
+            // P3  = (A11)(B12 - B22)
+            // C12 = P3 + ...
+            tilesSubtract(b, b12Start, false, b, b22Start, false, sB, 0,
+                          bQuadrantSize, nonLeafQuadrants);
+            strassenMultiply(a, a11Start, true, sB, 0, false, result, c12Start,
+                             n2, bsRows, bsColumns, bsMultiplicands);
+
+            // P6  = (A21 - A11)(B11 + B12)
+            // C22 = P3 + P6 + ...
+            final double[] p67 = new double[cQuadrantSize];
+            tilesSubtract(a, a21Start, true, a, a11Start, true, sA, 0,
+                          aQuadrantSize, nonLeafQuadrants);
+            tilesAdd(b, b11Start, true, b, b12Start, false, sB, 0,
+                     bQuadrantSize, nonLeafQuadrants);
+            strassenMultiply(sA, 0, true, sB, 0, true, p67, 0,
+                             n2, bsRows, bsColumns, bsMultiplicands);
+            tilesAdd(result, c12Start, false, p67, 0, true, result, c22Start,
+                     cQuadrantSize, nonLeafQuadrants);
+
+            // P2  = (A21 + A22)(B11)
+            // C21 = P2 + ...
+            // C22 = P3 + P6 - P2 + ...
+            tilesAdd(a, a21Start, true, a, a22Start, false, sA, 0,
+                     aQuadrantSize, nonLeafQuadrants);
+            strassenMultiply(sA, 0, true, b, b11Start, true, result, c21Start,
+                             n2, bsRows, bsColumns, bsMultiplicands);
+            tilesSelfSubtract(result, c22Start, false, result, c21Start, true,
+                              cQuadrantSize, nonLeafQuadrants);
+
+            // P1  = (A11 + A22)(B11 + B22)
+            // C11 = P1 + ...
+            // C22 = P3 + P6 - P2 + P1
+            tilesAdd(a, a11Start, true, a, a22Start, false, sA, 0,
+                     aQuadrantSize, nonLeafQuadrants);
+            tilesAdd(b, b11Start, true, b, b22Start, false, sB, 0,
+                     bQuadrantSize, nonLeafQuadrants);
+            strassenMultiply(sA, 0, true, sB, 0, true, result, c11Start,
+                             n2, bsRows, bsColumns, bsMultiplicands);
+            tilesSelfAdd(result, c22Start, false, result, c11Start, true,
+                         cQuadrantSize, nonLeafQuadrants);
+
+            // P5  = (A11 + A12)B22
+            // beware: there is a sign error here in Chatterjee et al. paper
+            // in figure 1, table b they subtract A12 from A11 instead of adding it
+            // C12 = P3 + P5
+            // C11 = P1 - P5 + ...
+            final double[] p45 = new double[cQuadrantSize];
+            tilesAdd(a, a11Start, true, a, a12Start, false, sA, 0,
+                     aQuadrantSize, nonLeafQuadrants);
+            strassenMultiply(sA, 0, true, b, b22Start, false, p45, 0,
+                             n2, bsRows, bsColumns, bsMultiplicands);
+            tilesSelfAdd(result, c12Start, false, p45, 0, false,
+                         cQuadrantSize, nonLeafQuadrants);
+            tilesSelfSubtract(result, c11Start, true, p45, 0, false,
+                              cQuadrantSize, nonLeafQuadrants);
+
+            // P7  = (A12 - A22)(B21 + B22)
+            // C11 = P1 - P5 + P7 + ...
+            tilesSubtract(a, a12Start, false, a, a22Start, false, sA, 0,
+                          aQuadrantSize, nonLeafQuadrants);
+            tilesAdd(b, b21Start, true, b, b22Start, false, sB, 0,
+                     bQuadrantSize, nonLeafQuadrants);
+            strassenMultiply(sA, 0, false, sB, 0, true, p67, 0,
+                             n2, bsRows, bsColumns, bsMultiplicands);
+            tilesSelfAdd(result, c11Start, true, p67, 0, true,
+                         cQuadrantSize, nonLeafQuadrants);
+
+            // P4  = (A22)(B21 - B11)
+            // C11 = P1 - P5 + P7 + P4
+            // C21 = P2 + P4
+            tilesSubtract(b, b21Start, true, b, b11Start, true, sB, 0,
+                          bQuadrantSize, nonLeafQuadrants);
+            strassenMultiply(a, a22Start, false, sB, 0, true, p45, 0,
+                             n2, bsRows, bsColumns, bsMultiplicands);
+            tilesSelfAdd(result, c11Start, true, p45, 0, true,
+                         cQuadrantSize, nonLeafQuadrants);
+            tilesSelfAdd(result, c21Start, true, p45, 0, true,
+                         cQuadrantSize, nonLeafQuadrants);
+
+        }
+    }
+
+    /**
+     * Perform an addition on a few tiles in arrays.
+     * @param a left term of addition
+     * @param aStart start index in a
+     * @param aDirect direct/reversed orientation flag for a
+     * @param b right term of addition
+     * @param bStart start index in b
+     * @param bDirect direct/reversed orientation flag for b
+     * @param result result array (will have same orientation as a)
+     * @param resultStart start index in result
+     * @param n number of elements to add
+     * @param nonLeafQuadrants if true the quadrant can be further decomposed
+     */
+    private static void tilesAdd(final double[] a, final int aStart, final boolean aDirect,
+                                 final double[] b, final int bStart, final boolean bDirect,
+                                 final double[] result, final int resultStart,
+                                 final int n, final boolean nonLeafQuadrants) {
+        if ((aDirect ^ bDirect) & nonLeafQuadrants) {
+            // a and b have different orientations
+            // perform addition in two half
+            final int n2 = n / 2;
+            addLoop(a, aStart,      b, bStart + n2, result, resultStart,      n2);
+            addLoop(a, aStart + n2, b, bStart,      result, resultStart + n2, n2);
+        } else {
+            // a and b have same orientations
+            // perform addition in one loop
+            addLoop(a, aStart, b, bStart, result, resultStart, n);
+        }
+    }
+
+    /**
+     * Perform an addition loop.
+     * @param a left term of addition
+     * @param aStart start index in a
+     * @param b right term of addition
+     * @param bStart start index in b
+     * @param result result array (will have same orientation as a)
+     * @param resultStart start index in result
+     * @param n number of elements to add
+     */
+    private static void addLoop(final double[] a, final int aStart,
+                                final double[] b, final int bStart,
+                                final double[] result, final int resultStart,
+                                final int n) {
+        int i = 0;
+        while (i < n - 3) {
+            final int r0 = resultStart + i;
+            final int a0 = aStart      + i;
+            final int b0 = bStart      + i;
+            result[r0]     = a[a0]     + b[b0];
+            result[r0 + 1] = a[a0 + 1] + b[b0 + 1];
+            result[r0 + 2] = a[a0 + 2] + b[b0 + 2];
+            result[r0 + 3] = a[a0 + 3] + b[b0 + 3];
+            i += 4;
+        }
+        while (i < n) {
+            result[resultStart + i] = a[aStart + i] + b[bStart + i];
+            ++i;
+        }
+    }
+
+    /**
+     * Perform a subtraction on a few tiles in arrays.
+     * @param a left term of subtraction
+     * @param aStart start index in a
+     * @param aDirect direct/reversed orientation flag for a
+     * @param b right term of subtraction
+     * @param bStart start index in b
+     * @param bDirect direct/reversed orientation flag for b
+     * @param result result array (will have same orientation as a)
+     * @param resultStart start index in result
+     * @param n number of elements to subtract
+     * @param nonLeafQuadrants if true the quadrant can be further decomposed
+     */
+    private static void tilesSubtract(final double[] a, final int aStart, final boolean aDirect,
+                                      final double[] b, final int bStart, final boolean bDirect,
+                                      final double[] result, final int resultStart,
+                                      final int n, final boolean nonLeafQuadrants) {
+        if ((aDirect ^ bDirect) & nonLeafQuadrants) {
+            // a and b have different orientations
+            // perform subtraction in two half
+            final int n2 = n / 2;
+            subtractLoop(a, aStart,      b, bStart + n2, result, resultStart,      n2);
+            subtractLoop(a, aStart + n2, b, bStart,      result, resultStart + n2, n2);
+        } else {
+            // a and b have same orientations
+            // perform subtraction in one loop
+            subtractLoop(a, aStart, b, bStart, result, resultStart, n);
+        }
+    }
+
+    /**
+     * Perform a subtraction loop.
+     * @param a left term of subtraction
+     * @param aStart start index in a
+     * @param b right term of subtraction
+     * @param bStart start index in b
+     * @param result result array (will have same orientation as a)
+     * @param resultStart start index in result
+     * @param n number of elements to subtract
+     */
+    private static void subtractLoop(final double[] a, final int aStart,
+                                     final double[] b, final int bStart,
+                                     final double[] result, final int resultStart,
+                                     final int n) {
+        int i = 0;
+        while (i < n - 3) {
+            final int r0 = resultStart + i;
+            final int a0 = aStart      + i;
+            final int b0 = bStart      + i;
+            result[r0]     = a[a0]     - b[b0];
+            result[r0 + 1] = a[a0 + 1] - b[b0 + 1];
+            result[r0 + 2] = a[a0 + 2] - b[b0 + 2];
+            result[r0 + 3] = a[a0 + 3] - b[b0 + 3];
+            i += 4;
+        }
+        while (i < n) {
+            result[resultStart + i] = a[aStart + i] - b[bStart + i];
+            ++i;
+        }
+    }
+
+    /**
+     * Perform a self-addition on a few tiles in arrays.
+     * @param a left term of addition (will be overwritten with result)
+     * @param aStart start index in a
+     * @param aDirect direct/reversed orientation flag for a
+     * @param b right term of addition
+     * @param bStart start index in b
+     * @param bDirect direct/reversed orientation flag for b
+     * @param n number of elements to add
+     * @param nonLeafQuadrants if true the quadrant can be further decomposed
+     */
+    private static void tilesSelfAdd(final double[] a, final int aStart, final boolean aDirect,
+                                     final double[] b, final int bStart, final boolean bDirect,
+                                     final int n, final boolean nonLeafQuadrants) {
+        if ((aDirect ^ bDirect) & nonLeafQuadrants) {
+            // a and b have different orientations
+            // perform addition in two half
+            final int n2 = n / 2;
+            selfAddLoop(a, aStart,      b, bStart + n2, n2);
+            selfAddLoop(a, aStart + n2, b, bStart,      n2);
+        } else {
+            // a and b have same orientations
+            // perform addition in one loop
+            selfAddLoop(a, aStart, b, bStart, n);
+        }
+    }
+
+    /**
+     * Perform a self-addition loop.
+     * @param a left term of addition (will be overwritten with result)
+     * @param aStart start index in a
+     * @param b right term of addition
+     * @param bStart start index in b
+     * @param n number of elements to add
+     */
+    private static void selfAddLoop(final double[] a, final int aStart,
+                                    final double[] b, final int bStart,
+                                    final int n) {
+        int i = 0;
+        while (i < n - 3) {
+            final int a0 = aStart + i;
+            final int b0 = bStart + i;
+            a[a0]     += b[b0];
+            a[a0 + 1] += b[b0 + 1];
+            a[a0 + 2] += b[b0 + 2];
+            a[a0 + 3] += b[b0 + 3];
+            i += 4;
+        }
+        while (i < n) {
+            a[aStart + i] += b[bStart + i];
+            ++i;
+        }
+    }
+
+    /**
+     * Perform a self-subtraction on a few tiles in arrays.
+     * @param a left term of subtraction (will be overwritten with result)
+     * @param aStart start index in a
+     * @param aDirect direct/reversed orientation flag for a
+     * @param b right term of subtraction
+     * @param bStart start index in b
+     * @param bDirect direct/reversed orientation flag for b
+     * @param n number of elements to subtract
+     * @param nonLeafQuadrants if true the quadrant can be further decomposed
+     */
+    private static void tilesSelfSubtract(final double[] a, final int aStart, final boolean aDirect,
+                                          final double[] b, final int bStart, final boolean bDirect,
+                                          final int n, final boolean nonLeafQuadrants) {
+        if ((aDirect ^ bDirect) & nonLeafQuadrants) {
+            // a and b have different orientations
+            // perform subtraction in two half
+            final int n2 = n / 2;
+            selfSubtractLoop(a, aStart,      b, bStart + n2, n2);
+            selfSubtractLoop(a, aStart + n2, b, bStart,      n2);
+        } else {
+            // a and b have same orientations
+            // perform subtraction in one loop
+            selfSubtractLoop(a, aStart, b, bStart, n);
+        }
+    }
+
+    /**
+     * Perform a self-subtraction loop.
+     * @param a left term of subtraction (will be overwritten with result)
+     * @param aStart start index in a
+     * @param b right term of subtraction
+     * @param bStart start index in b
+     * @param n number of elements to subtract
+     */
+    private static void selfSubtractLoop(final double[] a, final int aStart,
+                                         final double[] b, final int bStart,
+                                         final int n) {
+        int i = 0;
+        while (i < n - 3) {
+            final int a0 = aStart + i;
+            final int b0 = bStart + i;
+            a[a0]     -= b[b0];
+            a[a0 + 1] -= b[b0 + 1];
+            a[a0 + 2] -= b[b0 + 2];
+            a[a0 + 3] -= b[b0 + 3];
+            i += 4;
+        }
+        while (i < n) {
+            a[aStart + i] -= b[bStart + i];
+            ++i;
+        }
+    }
+
+    /** {@inheritDoc} */
+    public double[][] getData() {
+
+        final double[][] out = new double[rows][columns];
+
+        // perform extraction tile-wise, to ensure good cache behavior
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+
+            // perform extraction on the current tile
+            final int tileStart = index * tileSizeRows * tileSizeColumns;
+            final long indices  = tilesIndices(index);
+            final int iTile     = (int) (indices >> 32);
+            final int jTile     = (int) (indices & 0xffffffff);
+            final int pStart    = iTile * tileSizeRows;
+            final int qStart    = jTile * tileSizeColumns;
+            if (pStart < rows && qStart < columns) {
+                final int pEnd = Math.min(pStart + tileSizeRows, rows);
+                final int qEnd = Math.min(qStart + tileSizeColumns, columns);
+                int tileRowStart = tileStart;
+                for (int p = pStart; p < pEnd; ++p) {
+                    System.arraycopy(data, tileRowStart, out[p], qStart, qEnd - qStart);
+                    tileRowStart += tileSizeColumns;
+                }
+            }
+
+        }
+
+        return out;
+        
+    }
+
+    /** {@inheritDoc} */
+    public double getFrobeniusNorm() {
+        double sum2 = 0;
+        for (final double entry : data) {
+            sum2 += entry * entry;
+        }
+        return Math.sqrt(sum2);
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix getSubMatrix(final int startRow, final int endRow,
+                                   final int startColumn, final int endColumn)
+        throws MatrixIndexException {
+
+        // safety checks
+        checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
+
+        // create the output matrix
+        final RecursiveLayoutRealMatrix out =
+            new RecursiveLayoutRealMatrix(endRow - startRow + 1, endColumn - startColumn + 1);
+
+        // perform extraction tile-wise, to ensure good cache behavior
+        for (int iTile = 0; iTile < out.tileNumber; ++iTile) {
+            final int iStart = startRow + iTile * out.tileSizeRows;
+            final int iEnd   = Math.min(startRow + Math.min((iTile + 1) * out.tileSizeRows, out.rows),
+                                        endRow + 1);
+            for (int jTile = 0; jTile < out.tileNumber; ++jTile) {
+                final int jStart = startColumn + jTile * out.tileSizeColumns;
+                final int jEnd   = Math.min(startColumn + Math.min((jTile + 1) * out.tileSizeColumns, out.columns),
+                                            endColumn + 1);
+
+                // the current output tile may expand on more than one instance tile
+                for (int pTile = iStart / tileSizeRows; pTile * tileSizeRows < iEnd; ++pTile) {
+                    final int p0     = pTile * tileSizeRows;
+                    final int pStart = Math.max(p0, iStart);
+                    final int pEnd   = Math.min(Math.min(p0 + tileSizeRows, endRow + 1), iEnd);
+                    for (int qTile = jStart / tileSizeColumns; qTile * tileSizeColumns < jEnd; ++qTile) {
+                        final int q0     = qTile * tileSizeColumns;
+                        final int qStart = Math.max(q0, jStart);
+                        final int qEnd   = Math.min(Math.min(q0 + tileSizeColumns, endColumn + 1), jEnd);
+
+                        // copy the overlapping part of instance and output tiles
+                        int outIndex = tileIndex(iTile, jTile) * out.tileSizeRows * out.tileSizeColumns +
+                                       (pStart - iStart) * out.tileSizeColumns + (qStart - jStart);
+                        int index    = tileIndex(pTile, qTile) * tileSizeRows * tileSizeColumns +
+                                       (pStart - p0) * tileSizeColumns + (qStart - q0);
+                        for (int p = pStart; p < pEnd; ++p) {
+                            System.arraycopy(data, index, out.data, outIndex, qEnd - qStart);
+                            outIndex += out.tileSizeColumns;
+                            index    += tileSizeColumns;
+                        }
+                        
+
+                    }
+               }
+
+            }
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public void setSubMatrix(final double[][] subMatrix, final int row, final int column)
+        throws MatrixIndexException {
+
+        // safety checks
+        final int refLength = subMatrix[0].length;
+        if (refLength < 1) {
+            throw MathRuntimeException.createIllegalArgumentException("matrix must have at least one column",
+                                                                      null);             
+        }
+        final int endRow    = row + subMatrix.length - 1;
+        final int endColumn = column + refLength - 1;
+        checkSubMatrixIndex(row, endRow, column, endColumn);
+        for (final double[] subRow : subMatrix) {
+            if (subRow.length != refLength) {
+                throw MathRuntimeException.createIllegalArgumentException("some rows have length {0} while others have length {1}",
+                                                                          new Object[] {
+                                                                              refLength, subRow.length
+                                                                          }); 
+            }
+        }
+
+        // compute tiles bounds
+        final int tileStartRow    = row / tileSizeRows;
+        final int tileEndRow      = (endRow + tileSizeRows) / tileSizeRows;
+        final int tileStartColumn = column / tileSizeColumns;
+        final int tileEndColumn   = (endColumn + tileSizeColumns) / tileSizeColumns;
+
+        // perform copy tile-wise, to ensure good cache behavior
+        for (int iTile = tileStartRow; iTile < tileEndRow; ++iTile) {
+            final int firstRow = iTile * tileSizeRows;
+            final int iStart   = Math.max(row,    firstRow);
+            final int iEnd     = Math.min(endRow + 1, firstRow + tileSizeRows);
+
+            for (int jTile = tileStartColumn; jTile < tileEndColumn; ++jTile) {
+                final int firstColumn = jTile * tileSizeColumns;
+                final int jStart      = Math.max(column,    firstColumn);
+                final int jEnd        = Math.min(endColumn + 1, firstColumn + tileSizeColumns);
+                final int jLength     = jEnd - jStart;
+                final int tileStart   = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns;
+
+                // handle one tile, row by row
+                for (int i = iStart; i < iEnd; ++i) {
+                    System.arraycopy(subMatrix[i - row], jStart - column,
+                                     data, tileStart + (i - firstRow) * tileSizeColumns + (jStart - firstColumn),
+                                     jLength);
+                }
+
+            }
+        }
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix getRowMatrix(final int row)
+        throws MatrixIndexException {
+
+        checkRowIndex(row);
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(1, columns);
+
+        // a row matrix has always only one large tile,
+        // because a single row cannot be split into 2^k tiles
+        // perform copy tile-wise, to ensure good cache behavior
+        final int iTile     = row / tileSizeRows;
+        final int rowOffset = row - iTile * tileSizeRows;
+        int outIndex        = 0;
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               rowOffset * tileSizeColumns;
+            final int length = Math.min(outIndex + tileSizeColumns, columns) - outIndex;
+            System.arraycopy(data, kStart, out.data, outIndex, length);
+            outIndex += length;
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public void setRowMatrix(final int row, final RealMatrix matrix)
+        throws MatrixIndexException, InvalidMatrixException {
+        try {
+            setRowMatrix(row, (RecursiveLayoutRealMatrix) matrix);
+        } catch (ClassCastException cce) {
+            super.setRowMatrix(row, matrix);
+        }
+    }
+
+    /**
+     * Sets the entries in row number <code>row</code>
+     * as a row matrix.  Row indices start at 0.
+     *
+     * @param row the row to be set
+     * @param matrix row matrix (must have one row and the same number of columns
+     * as the instance)
+     * @throws MatrixIndexException if the specified row index is invalid
+     * @throws InvalidMatrixException if the matrix dimensions do not match one
+     * instance row
+     */
+    public void setRowMatrix(final int row, final RecursiveLayoutRealMatrix matrix)
+        throws MatrixIndexException, InvalidMatrixException {
+
+        checkRowIndex(row);
+        final int nCols = getColumnDimension();
+        if ((matrix.getRowDimension() != 1) ||
+            (matrix.getColumnDimension() != nCols)) {
+            throw new InvalidMatrixException("dimensions mismatch: got {0}x{1} but expected {2}x{3}",
+                                             new Object[] {
+                                                 matrix.getRowDimension(),
+                                                 matrix.getColumnDimension(),
+                                                 1, nCols
+                                             });
+        }
+
+        // a row matrix has always only one large tile,
+        // because a single row cannot be split into 2^k tiles
+        // perform copy tile-wise, to ensure good cache behavior
+        final int iTile     = row / tileSizeRows;
+        final int rowOffset = row - iTile * tileSizeRows;
+        int outIndex        = 0;
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               rowOffset * tileSizeColumns;
+            final int length = Math.min(outIndex + tileSizeColumns, columns) - outIndex;
+            System.arraycopy(matrix.data, outIndex, data, kStart, length);
+            outIndex += length;
+        }
+
+    }
+    
+    /** {@inheritDoc} */
+    public RealMatrix getColumnMatrix(final int column)
+        throws MatrixIndexException {
+
+        checkColumnIndex(column);
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(rows, 1);
+
+        // a column matrix has always only one large tile,
+        // because a single column cannot be split into 2^k tiles
+        // perform copy tile-wise, to ensure good cache behavior
+        final int jTile        = column / tileSizeColumns;
+        final int columnOffset = column - jTile * tileSizeColumns;
+        for (int iTile = 0; iTile < tileNumber; ++iTile) {
+            final int pStart = iTile * tileSizeRows;
+            final int pEnd   = Math.min(pStart + tileSizeRows, rows);
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               columnOffset;
+            for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                out.data[p] = data[k];
+            }
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public void setColumnMatrix(final int column, final RealMatrix matrix)
+        throws MatrixIndexException, InvalidMatrixException {
+        try {
+            setColumnMatrix(column, (RecursiveLayoutRealMatrix) matrix);
+        } catch (ClassCastException cce) {
+            super.setColumnMatrix(column, matrix);
+        }
+    }
+
+    /**
+     * Sets the entries in column number <code>column</code>
+     * as a column matrix.  Column indices start at 0.
+     *
+     * @param column the column to be set
+     * @param matrix column matrix (must have one column and the same number of rows
+     * as the instance)
+     * @throws MatrixIndexException if the specified column index is invalid
+     * @throws InvalidMatrixException if the matrix dimensions do not match one
+     * instance column
+     */
+    void setColumnMatrix(final int column, final RecursiveLayoutRealMatrix matrix)
+        throws MatrixIndexException, InvalidMatrixException {
+
+        checkColumnIndex(column);
+        final int nRows = getRowDimension();
+        if ((matrix.getRowDimension() != nRows) ||
+            (matrix.getColumnDimension() != 1)) {
+            throw new InvalidMatrixException("dimensions mismatch: got {0}x{1} but expected {2}x{3}",
+                                             new Object[] {
+                                                 matrix.getRowDimension(),
+                                                 matrix.getColumnDimension(),
+                                                 nRows, 1
+                                             });
+        }
+
+        // a column matrix has always only one large tile,
+        // because a single column cannot be split into 2^k tiles
+        // perform copy tile-wise, to ensure good cache behavior
+        final int jTile        = column / tileSizeColumns;
+        final int columnOffset = column - jTile * tileSizeColumns;
+        for (int iTile = 0; iTile < tileNumber; ++iTile) {
+            final int pStart = iTile * tileSizeRows;
+            final int pEnd   = Math.min(pStart + tileSizeRows, rows);
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               columnOffset;
+            for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                data[k] = matrix.data[p];
+            }
+        }
+
+    }
+    
+    /** {@inheritDoc} */
+    public void setRowVector(final int row, final RealVector vector)
+        throws MatrixIndexException, InvalidMatrixException {
+        try {
+            setRow(row, ((RealVectorImpl) vector).getDataRef());
+        } catch (ClassCastException cce) {
+            checkRowIndex(row);
+            if (vector.getDimension() != columns) {
+                throw new InvalidMatrixException("dimensions mismatch: got {0}x{1} but expected {2}x{3}",
+                                                 new Object[] {
+                                                     1, vector.getDimension(),
+                                                     1, columns
+                                                 });
+            }
+
+            // perform copy tile-wise, to ensure good cache behavior
+            final int iTile     = row / tileSizeRows;
+            final int rowOffset = row - iTile * tileSizeRows;
+            int outIndex        = 0;
+            for (int jTile = 0; jTile < tileNumber; ++jTile) {
+                final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                                   rowOffset * tileSizeColumns;
+                final int length = Math.min(outIndex + tileSizeColumns, columns) - outIndex;
+                for (int l = 0; l < length; ++l) {
+                    data[kStart + l] = vector.getEntry(outIndex + l);
+                }
+                outIndex += length;
+            }
+        }
+    }
+
+    /** {@inheritDoc} */
+    public void setColumnVector(final int column, final RealVector vector)
+        throws MatrixIndexException, InvalidMatrixException {
+        try {
+            setColumn(column, ((RealVectorImpl) vector).getDataRef());
+        } catch (ClassCastException cce) {
+            checkColumnIndex(column);
+            if (vector.getDimension() != rows) {
+                throw new InvalidMatrixException("dimensions mismatch: got {0}x{1} but expected {2}x{3}",
+                                                 new Object[] {
+                                                     vector.getDimension(), 1,
+                                                     rows, 1
+                                                 });
+            }
+
+            // perform copy tile-wise, to ensure good cache behavior
+            final int jTile        = column / tileSizeColumns;
+            final int columnOffset = column - jTile * tileSizeColumns;
+            for (int iTile = 0; iTile < tileNumber; ++iTile) {
+                final int pStart = iTile * tileSizeRows;
+                final int pEnd   = Math.min(pStart + tileSizeRows, rows);
+                final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                                   columnOffset;
+                for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                    data[k] = vector.getEntry(p);
+                }
+            }
+        }
+    }
+
+    /** {@inheritDoc} */
+    public double[] getRow(final int row)
+        throws MatrixIndexException {
+
+        checkRowIndex(row);
+        final double[] out = new double[columns];
+
+        // perform copy tile-wise, to ensure good cache behavior
+        final int iTile     = row / tileSizeRows;
+        final int rowOffset = row - iTile * tileSizeRows;
+        int outIndex        = 0;
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               rowOffset * tileSizeColumns;
+            final int length = Math.min(outIndex + tileSizeColumns, columns) - outIndex;
+            System.arraycopy(data, kStart, out, outIndex, length);
+            outIndex += length;
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public void setRow(final int row, final double[] array)
+        throws MatrixIndexException, InvalidMatrixException {
+
+        checkRowIndex(row);
+        if (array.length != columns) {
+            throw new InvalidMatrixException("dimensions mismatch: got {0}x{1} but expected {2}x{3}",
+                                             new Object[] {
+                                                 1, array.length,
+                                                 1, columns
+                                             });
+        }
+
+        // perform copy tile-wise, to ensure good cache behavior
+        final int iTile     = row / tileSizeRows;
+        final int rowOffset = row - iTile * tileSizeRows;
+        int outIndex        = 0;
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               rowOffset * tileSizeColumns;
+            final int length = Math.min(outIndex + tileSizeColumns, columns) - outIndex;
+            System.arraycopy(array, outIndex, data, kStart, length);
+            outIndex += length;
+        }
+
+    }
+
+    /** {@inheritDoc} */
+    public double[] getColumn(final int column)
+        throws MatrixIndexException {
+
+        checkColumnIndex(column);
+        final double[] out = new double[rows];
+
+        // perform copy tile-wise, to ensure good cache behavior
+        final int jTile        = column / tileSizeColumns;
+        final int columnOffset = column - jTile * tileSizeColumns;
+        for (int iTile = 0; iTile < tileNumber; ++iTile) {
+            final int pStart = iTile * tileSizeRows;
+            final int pEnd   = Math.min(pStart + tileSizeRows, rows);
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               columnOffset;
+            for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                out[p] = data[k];
+            }
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public void setColumn(final int column, final double[] array)
+        throws MatrixIndexException, InvalidMatrixException {
+
+        checkColumnIndex(column);
+        if (array.length != rows) {
+            throw new InvalidMatrixException("dimensions mismatch: got {0}x{1} but expected {2}x{3}",
+                                             new Object[] {
+                                                 array.length, 1,
+                                                 rows, 1
+                                             });
+        }
+
+        // perform copy tile-wise, to ensure good cache behavior
+        final int jTile        = column / tileSizeColumns;
+        final int columnOffset = column - jTile * tileSizeColumns;
+        for (int iTile = 0; iTile < tileNumber; ++iTile) {
+            final int pStart = iTile * tileSizeRows;
+            final int pEnd   = Math.min(pStart + tileSizeRows, rows);
+            final int kStart = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns +
+                               columnOffset;
+            for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                data[k] = array[p];
+            }
+        }
+
+    }
+
+    /** {@inheritDoc} */
+    public double getEntry(final int row, final int column)
+        throws MatrixIndexException {
+        if ((row < 0) || (row >= rows) || (column < 0) || (column >= columns)) {
+            throw new MatrixIndexException("no entry at indices ({0}, {1}) in a {2}x{3} matrix",
+                                           new Object[] {
+                                               row, column,
+                                               getRowDimension(), getColumnDimension()
+                                           });
+        }
+        return data[index(row, column)];
+    }
+
+    /** {@inheritDoc} */
+    public void setEntry(final int row, final int column, final double value)
+        throws MatrixIndexException {
+        if ((row < 0) || (row >= rows) || (column < 0) || (column >= columns)) {
+            throw new MatrixIndexException("no entry at indices ({0}, {1}) in a {2}x{3} matrix",
+                                           new Object[] {
+                                               row, column,
+                                               getRowDimension(), getColumnDimension()
+                                           });
+        }
+        data[index(row, column)] = value;
+    }
+
+    /** {@inheritDoc} */
+    public void addToEntry(final int row, final int column, final double increment)
+        throws MatrixIndexException {
+        if ((row < 0) || (row >= rows) || (column < 0) || (column >= columns)) {
+            throw new MatrixIndexException("no entry at indices ({0}, {1}) in a {2}x{3} matrix",
+                                           new Object[] {
+                                               row, column,
+                                               getRowDimension(), getColumnDimension()
+                                           });
+        }
+        data[index(row, column)] += increment;
+    }
+
+    /** {@inheritDoc} */
+    public void multiplyEntry(final int row, final int column, final double factor)
+        throws MatrixIndexException {
+        if ((row < 0) || (row >= rows) || (column < 0) || (column >= columns)) {
+            throw new MatrixIndexException("no entry at indices ({0}, {1}) in a {2}x{3} matrix",
+                                           new Object[] {
+                                               row, column,
+                                               getRowDimension(), getColumnDimension()
+                                           });
+        }
+        data[index(row, column)] *= factor;
+    }
+
+    /** {@inheritDoc} */
+    public RealMatrix transpose() {
+
+        final RecursiveLayoutRealMatrix out = new RecursiveLayoutRealMatrix(columns, rows);
+
+        // perform transpose tile-wise, to ensure good cache behavior
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+            final int tileStart    = index * tileSizeRows * tileSizeColumns;
+            final long indices     = tilesIndices(index);
+            final int outJTile     = (int) (indices >> 32);        // iTile in the instance
+            final int outITile     = (int) (indices & 0xffffffff); // jTile in the instance
+            final int outIndex     = tileIndex(outITile, outJTile);
+            final int outTileStart = outIndex * tileSizeRows * tileSizeColumns;
+
+            // transpose current tile
+            final int outPStart = outITile * tileSizeColumns;
+            final int outPEnd   = Math.min(outPStart + tileSizeColumns, columns);
+            final int outQStart = outJTile * tileSizeRows;
+            final int outQEnd   = Math.min(outQStart + tileSizeRows, rows);
+            for (int outP = outPStart; outP < outPEnd; ++outP) {
+                final int dP = outP - outPStart;
+                int k = outTileStart + dP * tileSizeRows;
+                int l = tileStart + dP;
+                for (int outQ = outQStart; outQ < outQEnd; ++outQ) {
+                    out.data[k++] = data[l];
+                    l+= tileSizeColumns;
+                }
+            }
+
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public int getRowDimension() {
+        return rows;
+    }
+
+    /** {@inheritDoc} */
+    public int getColumnDimension() {
+        return columns;
+    }
+
+    /** {@inheritDoc} */
+    public double[] operate(final double[] v)
+        throws IllegalArgumentException {
+
+        if (v.length != columns) {
+            throw MathRuntimeException.createIllegalArgumentException("vector length mismatch:" +
+                                                                      " got {0} but expected {1}",
+                                                                      new Object[] {
+                                                                          v.length, columns
+                                                                      });
+        }
+        final double[] out = new double[rows];
+
+        // perform multiplication tile-wise, to ensure good cache behavior
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+            final int tileStart = index * tileSizeRows * tileSizeColumns;
+            final long indices  = tilesIndices(index);
+            final int iTile     = (int) (indices >> 32);
+            final int jTile     = (int) (indices & 0xffffffff);
+            final int pStart    = iTile * tileSizeRows;
+            final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+            final int qStart    = jTile * tileSizeColumns;
+            final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+            for (int p = pStart, k = tileStart; p < pEnd; ++p) {
+                double sum = 0;
+                int    q   = qStart;
+                while (q < qEnd - 3) {
+                    sum += data[k]     * v[q]     +
+                           data[k + 1] * v[q + 1] +
+                           data[k + 2] * v[q + 2] +
+                           data[k + 3] * v[q + 3];
+                    k += 4;
+                    q += 4;
+                }
+                while (q < qEnd) {
+                    sum += data[k++] * v[q++];
+                }
+                out[p] += sum;
+            }
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public double[] preMultiply(final double[] v)
+        throws IllegalArgumentException {
+
+        if (v.length != rows) {
+            throw MathRuntimeException.createIllegalArgumentException("vector length mismatch:" +
+                                                                      " got {0} but expected {1}",
+                                                                      new Object[] {
+                                                                          v.length, rows
+                                                                      });
+        }
+        final double[] out = new double[columns];
+
+        final int offset1 = tileSizeColumns;
+        final int offset2 = offset1 + offset1;
+        final int offset3 = offset2 + offset1;
+        final int offset4 = offset3 + offset1;
+
+        // perform multiplication tile-wise, to ensure good cache behavior
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+            final int tileStart = index * tileSizeRows * tileSizeColumns;
+            final long indices  = tilesIndices(index);
+            final int iTile     = (int) (indices >> 32);
+            final int jTile     = (int) (indices & 0xffffffff);
+            final int pStart    = iTile * tileSizeRows;
+            final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+            final int qStart    = jTile * tileSizeColumns;
+            final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+            for (int q = qStart; q < qEnd; ++q) {
+                int k = tileStart + q - qStart;
+                double sum = 0;
+                int p = pStart;
+                while (p < pEnd - 3) {
+                    sum += data[k]           * v[p]     +
+                           data[k + offset1] * v[p + 1] +
+                           data[k + offset2] * v[p + 2] +
+                           data[k + offset3] * v[p + 3];
+                    k += offset4;
+                    p += 4;
+                }
+                while (p < pEnd) {
+                    sum += data[k] * v[p++];
+                    k   += offset1;
+                }
+                out[q] += sum;
+            }
+        }
+
+        return out;
+
+    }
+
+    /** {@inheritDoc} */
+    public double walkInRowOrder(final RealMatrixChangingVisitor visitor)
+        throws MatrixVisitorException {
+        visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
+        for (int iTile = 0; iTile < tileNumber; ++iTile) {
+            final int pStart = iTile * tileSizeRows;
+            final int pEnd   = Math.min(pStart + tileSizeRows, rows);
+            for (int p = pStart; p < pEnd; ++p) {
+                for (int jTile = 0; jTile < tileNumber; ++jTile) {
+                    final int qStart    = jTile * tileSizeColumns;
+                    final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                          tileSizeRows * tileSizeColumns;
+                    final int kStart    = tileStart + (p - pStart) * tileSizeColumns;
+                    for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                        data[k] = visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInRowOrder(final RealMatrixPreservingVisitor visitor)
+        throws MatrixVisitorException {
+        visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
+        for (int iTile = 0; iTile < tileNumber; ++iTile) {
+            final int pStart = iTile * tileSizeRows;
+            final int pEnd   = Math.min(pStart + tileSizeRows, rows);
+            for (int p = pStart; p < pEnd; ++p) {
+                for (int jTile = 0; jTile < tileNumber; ++jTile) {
+                    final int qStart    = jTile * tileSizeColumns;
+                    final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                          tileSizeRows * tileSizeColumns;
+                    final int kStart    = tileStart + (p - pStart) * tileSizeColumns;
+                    for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                        visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInRowOrder(final RealMatrixChangingVisitor visitor,
+                                 final int startRow, final int endRow,
+                                 final int startColumn, final int endColumn)
+        throws MatrixIndexException, MatrixVisitorException {
+        checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
+        visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
+        for (int iTile = startRow / tileSizeRows; iTile < 1 + endRow / tileSizeRows; ++iTile) {
+            final int p0     = iTile * tileSizeRows;
+            final int pStart = Math.max(startRow, p0);
+            final int pEnd   = Math.min((iTile + 1) * tileSizeRows, 1 + endRow);
+            for (int p = pStart; p < pEnd; ++p) {
+                for (int jTile = startColumn / tileSizeColumns; jTile < 1 + endColumn / tileSizeColumns; ++jTile) {
+                    final int q0        = jTile * tileSizeColumns;
+                    final int qStart    = Math.max(startColumn, q0);
+                    final int qEnd      = Math.min((jTile + 1) * tileSizeColumns, 1 + endColumn);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                          tileSizeRows * tileSizeColumns;
+                    final int kStart    = tileStart + (p - p0) * tileSizeColumns + (qStart - q0);
+                    for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                        data[k] = visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInRowOrder(final RealMatrixPreservingVisitor visitor,
+                                 final int startRow, final int endRow,
+                                 final int startColumn, final int endColumn)
+        throws MatrixIndexException, MatrixVisitorException {
+        checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
+        visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
+        for (int iTile = startRow / tileSizeRows; iTile < 1 + endRow / tileSizeRows; ++iTile) {
+            final int p0     = iTile * tileSizeRows;
+            final int pStart = Math.max(startRow, p0);
+            final int pEnd   = Math.min((iTile + 1) * tileSizeRows, 1 + endRow);
+            for (int p = pStart; p < pEnd; ++p) {
+                for (int jTile = startColumn / tileSizeColumns; jTile < 1 + endColumn / tileSizeColumns; ++jTile) {
+                    final int q0        = jTile * tileSizeColumns;
+                    final int qStart    = Math.max(startColumn, q0);
+                    final int qEnd      = Math.min((jTile + 1) * tileSizeColumns, 1 + endColumn);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                          tileSizeRows * tileSizeColumns;
+                    final int kStart    = tileStart + (p - p0) * tileSizeColumns + (qStart - q0);
+                    for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                        visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInColumnOrder(final RealMatrixChangingVisitor visitor)
+        throws MatrixVisitorException {
+        visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int qStart = jTile * tileSizeColumns;
+            final int qEnd   = Math.min(qStart + tileSizeColumns, columns);
+            for (int q = qStart; q < qEnd; ++q) {
+                for (int iTile = 0; iTile < tileNumber; ++iTile) {
+                    final int pStart    = iTile * tileSizeRows;
+                    final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                          tileSizeRows * tileSizeColumns;
+                    final int kStart    = tileStart + (q - qStart);
+                    for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                        data[k] = visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInColumnOrder(final RealMatrixPreservingVisitor visitor)
+        throws MatrixVisitorException {
+        visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int qStart = jTile * tileSizeColumns;
+            final int qEnd   = Math.min(qStart + tileSizeColumns, columns);
+            for (int q = qStart; q < qEnd; ++q) {
+                for (int iTile = 0; iTile < tileNumber; ++iTile) {
+                    final int pStart    = iTile * tileSizeRows;
+                    final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                          tileSizeRows * tileSizeColumns;
+                    final int kStart    = tileStart + (q - qStart);
+                    for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                        visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInColumnOrder(final RealMatrixChangingVisitor visitor,
+                                    final int startRow, final int endRow,
+                                    final int startColumn, final int endColumn)
+        throws MatrixIndexException, MatrixVisitorException {
+        checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
+        visitor.start(getRowDimension(), getColumnDimension(),
+                      startRow, endRow, startColumn, endColumn);
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int q0     = jTile * tileSizeColumns;
+            final int qStart = Math.max(startColumn, q0);
+            final int qEnd   = Math.min((jTile + 1) * tileSizeColumns, 1 + endColumn);
+            for (int q = qStart; q < qEnd; ++q) {
+                for (int iTile = 0; iTile < tileNumber; ++iTile) {
+                    final int p0        = iTile * tileSizeRows;
+                    final int pStart    = Math.max(startRow, p0);
+                    final int pEnd      = Math.min((iTile + 1) * tileSizeRows, 1 + endRow);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                          tileSizeRows * tileSizeColumns;
+                    final int kStart = tileStart + (pStart - p0) * tileSizeColumns + (q - q0);
+                    for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                        data[k] = visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInColumnOrder(final RealMatrixPreservingVisitor visitor,
+                                    final int startRow, final int endRow,
+                                    final int startColumn, final int endColumn)
+        throws MatrixIndexException, MatrixVisitorException {
+        checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
+        visitor.start(getRowDimension(), getColumnDimension(),
+                      startRow, endRow, startColumn, endColumn);
+        for (int jTile = 0; jTile < tileNumber; ++jTile) {
+            final int q0     = jTile * tileSizeColumns;
+            final int qStart = Math.max(startColumn, q0);
+            final int qEnd   = Math.min((jTile + 1) * tileSizeColumns, 1 + endColumn);
+            for (int q = qStart; q < qEnd; ++q) {
+                for (int iTile = 0; iTile < tileNumber; ++iTile) {
+                    final int p0        = iTile * tileSizeRows;
+                    final int pStart    = Math.max(startRow, p0);
+                    final int pEnd      = Math.min((iTile + 1) * tileSizeRows, 1 + endRow);
+                    final int tileStart = tileIndex(iTile, jTile) *
+                                           tileSizeRows * tileSizeColumns;
+                    final int kStart = tileStart + (pStart - p0) * tileSizeColumns + (q - q0);
+                    for (int p = pStart, k = kStart; p < pEnd; ++p, k += tileSizeColumns) {
+                        visitor.visit(p, q, data[k]);
+                    }
+                }
+             }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInOptimizedOrder(final RealMatrixChangingVisitor visitor)
+        throws MatrixVisitorException {
+        visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+            final int tileStart = index * tileSizeRows * tileSizeColumns;
+            final long indices  = tilesIndices(index);
+            final int iTile     = (int) (indices >> 32);
+            final int jTile     = (int) (indices & 0xffffffff);
+            final int pStart    = iTile * tileSizeRows;
+            final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+            final int qStart    = jTile * tileSizeColumns;
+            final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+            for (int p = pStart; p < pEnd; ++p) {
+                final int kStart = tileStart + (p - pStart) * tileSizeColumns;
+                for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                    data[k] = visitor.visit(p, q, data[k]);
+                }
+            }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInOptimizedOrder(final RealMatrixPreservingVisitor visitor)
+        throws MatrixVisitorException {
+        visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+            final int tileStart = index * tileSizeRows * tileSizeColumns;
+            final long indices  = tilesIndices(index);
+            final int iTile     = (int) (indices >> 32);
+            final int jTile     = (int) (indices & 0xffffffff);
+            final int pStart    = iTile * tileSizeRows;
+            final int pEnd      = Math.min(pStart + tileSizeRows, rows);
+            final int qStart    = jTile * tileSizeColumns;
+            final int qEnd      = Math.min(qStart + tileSizeColumns, columns);
+            for (int p = pStart; p < pEnd; ++p) {
+                final int kStart = tileStart + (p - pStart) * tileSizeColumns;
+                for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                    visitor.visit(p, q, data[k]);
+                }
+            }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInOptimizedOrder(final RealMatrixChangingVisitor visitor,
+                                       final int startRow, final int endRow,
+                                       final int startColumn, final int endColumn)
+        throws MatrixIndexException, MatrixVisitorException {
+        checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
+        visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+            final int tileStart = index * tileSizeRows * tileSizeColumns;
+            final long indices  = tilesIndices(index);
+            final int iTile     = (int) (indices >> 32);
+            final int jTile     = (int) (indices & 0xffffffff);
+            final int p0        = iTile * tileSizeRows;
+            final int pStart    = Math.max(startRow, p0);
+            final int pEnd      = Math.min((iTile + 1) * tileSizeRows, 1 + endRow);
+            final int q0        = jTile * tileSizeColumns;
+            final int qStart    = Math.max(startColumn, q0);
+            final int qEnd      = Math.min((jTile + 1) * tileSizeColumns, 1 + endColumn);
+            for (int p = pStart; p < pEnd; ++p) {
+                final int kStart = tileStart + (p - p0) * tileSizeColumns + (qStart - q0);
+                for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                    data[k] = visitor.visit(p, q, data[k]);
+                }
+            }
+        }
+        return visitor.end();
+    }
+
+    /** {@inheritDoc} */
+    public double walkInOptimizedOrder(final RealMatrixPreservingVisitor visitor,
+                                       final int startRow, final int endRow,
+                                       final int startColumn, final int endColumn)
+        throws MatrixIndexException, MatrixVisitorException {
+        checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
+        visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
+        for (int index = 0; index < tileNumber * tileNumber; ++index) {
+            final int tileStart = index * tileSizeRows * tileSizeColumns;
+            final long indices  = tilesIndices(index);
+            final int iTile     = (int) (indices >> 32);
+            final int jTile     = (int) (indices & 0xffffffff);
+            final int p0        = iTile * tileSizeRows;
+            final int pStart    = Math.max(startRow, p0);
+            final int pEnd      = Math.min((iTile + 1) * tileSizeRows, 1 + endRow);
+            final int q0        = jTile * tileSizeColumns;
+            final int qStart    = Math.max(startColumn, q0);
+            final int qEnd      = Math.min((jTile + 1) * tileSizeColumns, 1 + endColumn);
+            for (int p = pStart; p < pEnd; ++p) {
+                final int kStart = tileStart + (p - p0) * tileSizeColumns + (qStart - q0);
+                for (int q = qStart, k = kStart; q < qEnd; ++q, ++k) {
+                    visitor.visit(p, q, data[k]);
+                }
+            }
+        }
+        return visitor.end();
+    }
+
+    /**
+     * Get the index of an element.
+     * @param row row index of the element
+     * @param column column index of the element
+     * @return index of the element
+     */
+    private int index(final int row, final int columns) {
+        final int iTile       = row     / tileSizeRows;
+        final int jTile       = columns / tileSizeColumns;
+        final int tileStart   = tileIndex(iTile, jTile) * tileSizeRows * tileSizeColumns;
+        final int indexInTile = (row % tileSizeRows) * tileSizeColumns +
+                                (columns % tileSizeColumns);
+        return tileStart + indexInTile;
+    }
+
+    /**
+     * Get the index of a tile.
+     * @param iTile row index of the tile
+     * @param jTile column index of the tile
+     * @return index of the tile
+     */
+    private static int tileIndex(int iTile, int jTile) {
+
+        // compute n = 2^k such that a nxn square contains the indices
+        int n = Integer.highestOneBit(Math.max(iTile, jTile)) << 1;
+
+        // start recursion by noting the index is somewhere in the nxn
+        // square whose lowest index is 0 and which has direct orientation
+        int lowIndex   = 0;
+        boolean direct = true;
+
+        // the tail-recursion on the square size is replaced by an iteration here
+        while (n > 1) {
+
+            // reduce square to 4 quadrants
+            n >>= 1;
+            final int n2 = n * n;
+
+            // check in which quadrant the element is,
+            // updating the lowest index of the quadrant and its orientation
+            if (iTile < n) {
+                if (jTile < n) {
+                    // the element is in the top-left quadrant
+                    if (!direct) {
+                        lowIndex += 2 * n2;
+                        direct = true;
+                    }
+                } else {
+                    // the element is in the top-right quadrant
+                    jTile -= n;
+                    if (direct) {
+                        lowIndex += n2;
+                        direct = false;
+                    } else {
+                        lowIndex += 3 * n2;
+                    }
+                }
+            } else {
+                iTile -= n;
+                if (jTile < n) {
+                    // the element is in the bottom-left quadrant
+                    if (direct) {
+                        lowIndex += 3 * n2;
+                    } else {
+                        lowIndex += n2;

[... 126 lines stripped ...]


Mime
View raw message