Return-Path: Delivered-To: apmail-lucene-mahout-commits-archive@minotaur.apache.org Received: (qmail 15640 invoked from network); 17 Dec 2009 23:23:48 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 17 Dec 2009 23:23:48 -0000 Received: (qmail 23050 invoked by uid 500); 17 Dec 2009 23:23:47 -0000 Delivered-To: apmail-lucene-mahout-commits-archive@lucene.apache.org Received: (qmail 22993 invoked by uid 500); 17 Dec 2009 23:23:47 -0000 Mailing-List: contact mahout-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: mahout-dev@lucene.apache.org Delivered-To: mailing list mahout-commits@lucene.apache.org Received: (qmail 22983 invoked by uid 99); 17 Dec 2009 23:23:47 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 17 Dec 2009 23:23:47 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 17 Dec 2009 23:23:32 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 1F9032388AF2; Thu, 17 Dec 2009 23:23:10 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r891983 [4/47] - in /lucene/mahout/trunk: ./ core/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/ap... Date: Thu, 17 Dec 2009 23:22:41 -0000 To: mahout-commits@lucene.apache.org From: gsingers@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20091217232310.1F9032388AF2@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonMatrixAdapter.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonMatrixAdapter.java?rev=891983&view=auto ============================================================================== --- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonMatrixAdapter.java (added) +++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonMatrixAdapter.java Thu Dec 17 23:22:16 2009 @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.math; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParseException; +import com.google.gson.JsonPrimitive; +import com.google.gson.JsonSerializationContext; +import com.google.gson.JsonSerializer; +import com.google.gson.reflect.TypeToken; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.Type; + +public class JsonMatrixAdapter implements JsonSerializer, + JsonDeserializer { + + private static final Logger log = LoggerFactory.getLogger(JsonMatrixAdapter.class); + public static final String CLASS = "class"; + public static final String MATRIX = "matrix"; + + @Override + public JsonElement serialize(Matrix src, Type typeOfSrc, + JsonSerializationContext context) { + Type vectorType = new TypeToken() { + }.getType(); + Type matrixType = new TypeToken() { + }.getType(); + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(vectorType, new JsonVectorAdapter()); + builder.registerTypeAdapter(matrixType, new JsonMatrixAdapter()); + Gson gson = builder.create(); + JsonObject obj = new JsonObject(); + obj.add(CLASS, new JsonPrimitive(src.getClass().getName())); + obj.add(MATRIX, new JsonPrimitive(gson.toJson(src))); + return obj; + } + + @Override + public Matrix deserialize(JsonElement json, Type typeOfT, + JsonDeserializationContext context) throws JsonParseException { + Type vectorType = new TypeToken() { + }.getType(); + Type matrixType = new TypeToken() { + }.getType(); + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(vectorType, new JsonVectorAdapter()); + builder.registerTypeAdapter(matrixType, new JsonMatrixAdapter()); + Gson gson = builder.create(); + JsonObject obj = json.getAsJsonObject(); + String klass = obj.get(CLASS).getAsString(); + String matrix = obj.get(MATRIX).getAsString(); + ClassLoader ccl = Thread.currentThread().getContextClassLoader(); + Class cl = null; + try { + cl = ccl.loadClass(klass); + } catch (ClassNotFoundException e) { + log.warn("Error while loading class", e); + } + return (Matrix) gson.fromJson(matrix, cl); + } + +} Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonMatrixAdapter.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonVectorAdapter.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonVectorAdapter.java?rev=891983&view=auto ============================================================================== --- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonVectorAdapter.java (added) +++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonVectorAdapter.java Thu Dec 17 23:22:16 2009 @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.math; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParseException; +import com.google.gson.JsonPrimitive; +import com.google.gson.JsonSerializationContext; +import com.google.gson.JsonSerializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.Type; + +public class JsonVectorAdapter implements JsonSerializer, + JsonDeserializer { + + private static final Logger log = LoggerFactory.getLogger(JsonVectorAdapter.class); + public static final String VECTOR = "vector"; + + @Override + public JsonElement serialize(Vector src, Type typeOfSrc, + JsonSerializationContext context) { + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(Vector.class, new JsonVectorAdapter()); + Gson gson = builder.create(); + JsonObject obj = new JsonObject(); + obj.add(JsonMatrixAdapter.CLASS, new JsonPrimitive(src.getClass().getName())); + obj.add(VECTOR, new JsonPrimitive(gson.toJson(src))); + return obj; + } + + @Override + public Vector deserialize(JsonElement json, Type typeOfT, + JsonDeserializationContext context) throws JsonParseException { + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(Vector.class, new JsonVectorAdapter()); + Gson gson = builder.create(); + JsonObject obj = json.getAsJsonObject(); + String klass = obj.get(JsonMatrixAdapter.CLASS).getAsString(); + String vector = obj.get(VECTOR).getAsString(); + ClassLoader ccl = Thread.currentThread().getContextClassLoader(); + Class cl = null; + try { + cl = ccl.loadClass(klass); + } catch (ClassNotFoundException e) { + log.warn("Error while loading class", e); + } + return (Vector) gson.fromJson(vector, cl); + } + +} Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/JsonVectorAdapter.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java?rev=891983&view=auto ============================================================================== --- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java (added) +++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java Thu Dec 17 23:22:16 2009 @@ -0,0 +1,395 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math; + +import org.apache.hadoop.io.Writable; + +import java.util.Map; + +/** The basic interface including numerous convenience functions */ +public interface Matrix extends Cloneable, Writable { + + /** @return a formatted String suitable for output */ + String asFormatString(); + + /** + * Assign the value to all elements of the receiver + * + * @param value a double value + * @return the modified receiver + */ + Matrix assign(double value); + + /** + * Assign the values to the receiver + * + * @param values a double[] of values + * @return the modified receiver + * @throws CardinalityException if the cardinalities differ + */ + Matrix assign(double[][] values); + + /** + * Assign the other vector values to the receiver + * + * @param other a Matrix + * @return the modified receiver + * @throws CardinalityException if the cardinalities differ + */ + Matrix assign(Matrix other); + + /** + * Apply the function to each element of the receiver + * + * @param function a UnaryFunction to apply + * @return the modified receiver + */ + Matrix assign(UnaryFunction function); + + /** + * Apply the function to each element of the receiver and the corresponding element of the other argument + * + * @param other a Matrix containing the second arguments to the function + * @param function a BinaryFunction to apply + * @return the modified receiver + * @throws CardinalityException if the cardinalities differ + */ + Matrix assign(Matrix other, BinaryFunction function); + + /** + * Assign the other vector values to the column of the receiver + * + * @param column the int row to assign + * @param other a Vector + * @return the modified receiver + * @throws CardinalityException if the cardinalities differ + */ + Matrix assignColumn(int column, Vector other); + + /** + * Assign the other vector values to the row of the receiver + * + * @param row the int row to assign + * @param other a Vector + * @return the modified receiver + * @throws CardinalityException if the cardinalities differ + */ + Matrix assignRow(int row, Vector other); + + /** + * Return the cardinality of the recipient (the maximum number of values) + * + * @return an int[2] + */ + int[] size(); + + /** Helper method to return the cardinality of the row dimension */ + int numRows(); + + /** Helper method to return the cardinality of the column dimension */ + int numCols(); + + /** + * Return a copy of the recipient + * + * @return a new Matrix + */ + Matrix clone(); + + /** + * Returns matrix determinator using Laplace theorem + * + * @return a matrix determinator + */ + double determinant(); + + /** + * Return a new matrix containing the values of the recipient divided by the argument + * + * @param x a double value + * @return a new Matrix + */ + Matrix divide(double x); + + /** + * Return the value at the given indexes + * + * @param row an int row index + * @param column an int column index + * @return the double at the index + * @throws IndexException if the index is out of bounds + */ + double get(int row, int column); + + /** + * Return the column at the given index + * + * @param column an int column index + * @return a Vector at the index + * @throws IndexException if the index is out of bounds + */ + Vector getColumn(int column); + + /** + * Return the row at the given index + * + * @param row an int row index + * @return a Vector at the index + * @throws IndexException if the index is out of bounds + */ + Vector getRow(int row); + + /** + * Return the value at the given indexes, without checking bounds + * + * @param row an int row index + * @param column an int column index + * @return the double at the index + */ + double getQuick(int row, int column); + + /** + * Return if the other matrix and the receiver share any underlying data cells + * + * @param other a Matrix + * @return true if the other matrix has common data cells + */ + boolean haveSharedCells(Matrix other); + + /** + * Return an empty matrix of the same underlying class as the receiver + * + * @return a Matrix + */ + Matrix like(); + + /** + * Return an empty matrix of the same underlying class as the receiver and of the given cardinality + * + * @param rows the int number of rows + * @param columns the int number of columns + */ + Matrix like(int rows, int columns); + + /** + * Return a new matrix containing the element by element difference of the recipient and the argument + * + * @param x a Matrix + * @return a new Matrix + * @throws CardinalityException if the cardinalities differ + */ + Matrix minus(Matrix x); + + /** + * Return a new matrix containing the sum of each value of the recipient and the argument + * + * @param x a double + * @return a new Matrix + */ + Matrix plus(double x); + + /** + * Return a new matrix containing the element by element sum of the recipient and the argument + * + * @param x a Matrix + * @return a new Matrix + * @throws CardinalityException if the cardinalities differ + */ + Matrix plus(Matrix x); + + /** + * Set the value at the given index + * + * @param row an int row index into the receiver + * @param column an int column index into the receiver + * @param value a double value to set + * @throws IndexException if the index is out of bounds + */ + void set(int row, int column, double value); + + void set(int row, double[] data); + + /** + * Set the value at the given index, without checking bounds + * + * @param row an int row index into the receiver + * @param column an int column index into the receiver + * @param value a double value to set + */ + void setQuick(int row, int column, double value); + + /** + * Return the number of values in the recipient + * + * @return an int[2] containing [row, column] count + */ + int[] getNumNondefaultElements(); + + /** + * Return a new matrix containing the product of each value of the recipient and the argument + * + * @param x a double argument + * @return a new Matrix + */ + Matrix times(double x); + + /** + * Return a new matrix containing the product of the recipient and the argument + * + * @param x a Matrix argument + * @return a new Matrix + * @throws CardinalityException if the cardinalities are incompatible + */ + Matrix times(Matrix x); + + /** + * Return a new vector with cardinality equal to getNumRows() of this matrix which is the matrix product of the + * recipient and the argument + * + * @param v a vector with cardinality equal to getNumCols() of the recipient + * @return a new vector (typically a DenseVector) + * @throws CardinalityException if this.getNumRows() != v.size() + */ + Vector times(Vector v); + + /** + * Convenience method for producing this.transpose().times(this.times(v)), which can be implemented with only one pass + * over the matrix, without making the transpose() call (which can be expensive if the matrix is sparse) + * + * @param v a vector with cardinality equal to getNumCols() of the recipient + * @return a new vector (typically a DenseVector) with cardinality equal to that of the argument. + * @throws CardinalityException if this.getNumCols() != v.size() + */ + Vector timesSquared(Vector v); + + /** + * Return a new matrix that is the transpose of the receiver + * + * @return the transpose + */ + Matrix transpose(); + + /** + * Return a new matrix containing the subset of the recipient + * + * @param offset an int[2] offset into the receiver + * @param size the int[2] size of the desired result + * @return a new Matrix + * @throws CardinalityException if the length is greater than the cardinality of the receiver + * @throws IndexException if the offset is negative or the offset+length is outside of the receiver + */ + Matrix viewPart(int[] offset, int[] size); + + /** + * Return the sum of all the elements of the receiver + * + * @return a double + */ + double zSum(); + + /** + * Return a map of the current column label bindings of the receiver + * + * @return a Map + */ + Map getColumnLabelBindings(); + + /** + * Return a map of the current row label bindings of the receiver + * + * @return a Map + */ + Map getRowLabelBindings(); + + /** + * Sets a map of column label bindings in the receiver + * + * @param bindings a Map of label bindings + */ + void setColumnLabelBindings(Map bindings); + + /** + * Sets a map of row label bindings in the receiver + * + * @param bindings a Map of label bindings + */ + void setRowLabelBindings(Map bindings); + + /** + * Return the value at the given labels + * + * @param rowLabel a String row label + * @param columnLabel a String column label + * @return the double at the index + * @throws IndexException if the index is out of bounds + */ + double get(String rowLabel, String columnLabel) throws IndexException, + UnboundLabelException; + + /** + * Set the value at the given index + * + * @param rowLabel a String row label + * @param columnLabel a String column label + * @param value a double value to set + * @throws IndexException if the index is out of bounds + */ + void set(String rowLabel, String columnLabel, double value) throws IndexException, + UnboundLabelException; + + /** + * Set the value at the given index, updating the row and column label bindings + * + * @param rowLabel a String row label + * @param columnLabel a String column label + * @param row an int row index + * @param column an int column index + * @param value a double value + */ + void set(String rowLabel, String columnLabel, int row, int column, double value) throws IndexException, + UnboundLabelException; + + /** + * Sets the row values at the given row label + * + * @param rowLabel a String row label + * @param rowData a double[] array of row data + */ + void set(String rowLabel, double[] rowData); + + /** + * Sets the row values at the given row index and updates the row labels + * + * @param rowLabel the String row label + * @param row an int the row index + * @param rowData a double[] array of row data + */ + void set(String rowLabel, int row, double[] rowData); + + /* + * Need stories for these but keeping them here for now. + * + */ + // void getNonZeros(IntArrayList jx, DoubleArrayList values); + // void foreachNonZero(IntDoubleFunction f); + // double aggregate(DoubleDoubleFunction aggregator, DoubleFunction map); + // double aggregate(Matrix other, DoubleDoubleFunction aggregator, + // DoubleDoubleFunction map); + // NewMatrix assign(Matrix y, DoubleDoubleFunction function, IntArrayList + // nonZeroIndexes); +} Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java?rev=891983&view=auto ============================================================================== --- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java (added) +++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java Thu Dec 17 23:22:16 2009 @@ -0,0 +1,175 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +/** Implements subset view of a Matrix */ +public class MatrixView extends AbstractMatrix { + + private Matrix matrix; + + // the offset into the Matrix + private int[] offset; + + // the cardinality of the view + private int[] cardinality; + + public MatrixView() { + super(); + } + + /** + * Construct a view of the matrix with given offset and cardinality + * + * @param matrix an underlying Matrix + * @param offset the int[2] offset into the underlying matrix + * @param cardinality the int[2] cardinality of the view + */ + public MatrixView(Matrix matrix, int[] offset, int[] cardinality) { + this.matrix = matrix; + this.offset = offset; + this.cardinality = cardinality; + } + + @Override + public int[] size() { + return cardinality; + } + + @Override + public Matrix clone() { + MatrixView clone = (MatrixView) super.clone(); + clone.matrix = matrix.clone(); + clone.offset = offset.clone(); + clone.cardinality = cardinality.clone(); + return clone; + } + + @Override + public double getQuick(int row, int column) { + return matrix.getQuick(offset[ROW] + row, offset[COL] + column); + } + + @Override + public Matrix like() { + return matrix.like(cardinality[ROW], cardinality[COL]); + } + + @Override + public Matrix like(int rows, int columns) { + + return matrix.like(rows, columns); + } + + @Override + public void setQuick(int row, int column, double value) { + matrix.setQuick(offset[ROW] + row, offset[COL] + column, value); + } + + @Override + public int[] getNumNondefaultElements() { + return cardinality; + } + + @Override + public Matrix viewPart(int[] offset, int[] size) { + if (size[ROW] > cardinality[ROW] || size[COL] > cardinality[COL]) { + throw new CardinalityException(); + } + if ((offset[ROW] < ROW || offset[ROW] + size[ROW] > cardinality[ROW]) + || (offset[COL] < ROW || offset[COL] + size[COL] > cardinality[COL])) { + throw new IndexException(); + } + int[] origin = offset.clone(); + origin[ROW] += offset[ROW]; + origin[COL] += offset[COL]; + return new MatrixView(matrix, origin, size); + } + + @Override + public boolean haveSharedCells(Matrix other) { + if (other instanceof MatrixView) { + return other == this || matrix.haveSharedCells(other); + } else { + return other.haveSharedCells(matrix); + } + } + + @Override + public Matrix assignColumn(int column, Vector other) { + if (cardinality[ROW] != other.size()) { + throw new CardinalityException(); + } + for (int row = 0; row < cardinality[ROW]; row++) { + matrix.setQuick(row + offset[ROW], column + offset[COL], other + .getQuick(row)); + } + return this; + } + + @Override + public Matrix assignRow(int row, Vector other) { + if (cardinality[COL] != other.size()) { + throw new CardinalityException(); + } + for (int col = 0; col < cardinality[COL]; col++) { + matrix + .setQuick(row + offset[ROW], col + offset[COL], other.getQuick(col)); + } + return this; + } + + @Override + public Vector getColumn(int column) { + if (column < 0 || column >= cardinality[COL]) { + throw new IndexException(); + } + return new VectorView(matrix.getColumn(column + offset[COL]), offset[ROW], + cardinality[ROW]); + } + + @Override + public Vector getRow(int row) { + if (row < 0 || row >= cardinality[ROW]) { + throw new IndexException(); + } + return new VectorView(matrix.getRow(row + offset[ROW]), offset[COL], + cardinality[COL]); + } + + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + this.offset = new int[]{in.readInt(), in.readInt()}; + this.cardinality = new int[]{in.readInt(), in.readInt()}; + this.matrix = readMatrix(in); + } + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + out.writeInt(offset[ROW]); + out.writeInt(offset[COL]); + out.writeInt(cardinality[ROW]); + out.writeInt(cardinality[COL]); + writeMatrix(out, this.matrix); + } +} Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/NegateFunction.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/NegateFunction.java?rev=891983&view=auto ============================================================================== --- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/NegateFunction.java (added) +++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/NegateFunction.java Thu Dec 17 23:22:16 2009 @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math; + +public class NegateFunction implements UnaryFunction { + + @Override + public double apply(double arg1) { + return -arg1; + } + +} Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/NegateFunction.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java?rev=891983&view=auto ============================================================================== --- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java (added) +++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java Thu Dec 17 23:22:16 2009 @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math; + +import java.io.Serializable; + +final class OrderedIntDoubleMapping implements Serializable, Cloneable { + + private static final double DEFAULT_VALUE = 0.0; + + private int[] indices; + private double[] values; + private int numMappings; + + OrderedIntDoubleMapping() { + // no-arg constructor for deserializer + this(11); + } + + OrderedIntDoubleMapping(int capacity) { + indices = new int[capacity]; + values = new double[capacity]; + numMappings = 0; + } + + private OrderedIntDoubleMapping(int[] indices, double[] values, int numMappings) { + this.indices = indices; + this.values = values; + this.numMappings = numMappings; + } + + int[] getIndices() { + return indices; + } + + double[] getValues() { + return values; + } + + int getNumMappings() { + return numMappings; + } + + private void growTo(int newCapacity) { + if (newCapacity > indices.length) { + int[] newIndices = new int[newCapacity]; + System.arraycopy(indices, 0, newIndices, 0, numMappings); + indices = newIndices; + double[] newValues = new double[newCapacity]; + System.arraycopy(values, 0, newValues, 0, numMappings); + values = newValues; + } + } + + private int find(int index) { + int low = 0; + int high = numMappings - 1; + while (low <= high) { + int mid = low + ((high - low) >>> 1); + int midVal = indices[mid]; + if (midVal < index) { + low = mid + 1; + } else if (midVal > index) { + high = mid - 1; + } else { + return mid; + } + } + return -(low + 1); + } + + public double get(int index) { + int offset = find(index); + return offset >= 0 ? values[offset] : DEFAULT_VALUE; + } + + public void set(int index, double value) { + int offset = find(index); + if (offset >= 0) { + if (value == DEFAULT_VALUE) { + for (int i = offset + 1, j = offset; i < numMappings; i++, j++) { + indices[j] = indices[i]; + values[j] = values[i]; + } + numMappings--; + } else { + values[offset] = value; + } + } else { + if (value != DEFAULT_VALUE) { + if (numMappings >= indices.length) { + growTo(Math.max((int) (1.2 * numMappings), numMappings + 1)); + } + int at = -offset - 1; + if (numMappings > at) { + for (int i = numMappings - 1, j = numMappings; i >= at; i--, j--) { + indices[j] = indices[i]; + values[j] = values[i]; + } + } + indices[at] = index; + values[at] = value; + numMappings++; + } + } + } + + @Override + public int hashCode() { + int result = 0; + for (int i = 0; i < numMappings; i++) { + result = 31 * result + indices[i]; + result = 31 * result + (int) Double.doubleToRawLongBits(values[i]); + } + return result; + } + + @Override + public boolean equals(Object o) { + if (o instanceof OrderedIntDoubleMapping) { + OrderedIntDoubleMapping other = (OrderedIntDoubleMapping) o; + if (numMappings == other.getNumMappings()) { + for (int i = 0; i < numMappings; i++) { + if (indices[i] != other.getIndices()[i] || values[i] != other.getValues()[i]) { + return false; + } + } + return true; + } + } + return false; + } + + @Override + public String toString() { + StringBuilder result = new StringBuilder(10 * numMappings); + for (int i = 0; i < numMappings; i++) { + result.append('('); + result.append(indices[i]); + result.append(','); + result.append(values[i]); + result.append(')'); + } + return result.toString(); + } + + @Override + public OrderedIntDoubleMapping clone() { + return new OrderedIntDoubleMapping(indices.clone(), values.clone(), numMappings); + } + +} Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/OrderedIntDoubleMapping.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Partitioning.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Partitioning.java?rev=891983&view=auto ============================================================================== --- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Partitioning.java (added) +++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Partitioning.java Thu Dec 17 23:22:16 2009 @@ -0,0 +1,1263 @@ +/* +Copyright � 1999 CERN - European Organization for Nuclear Research. +Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose +is hereby granted without fee, provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear in supporting documentation. +CERN makes no representations about the suitability of this software for any purpose. +It is provided "as is" without expressed or implied warranty. +*/ +package org.apache.mahout.math; + +import org.apache.mahout.math.function.IntComparator; +import org.apache.mahout.math.list.DoubleArrayList; +import org.apache.mahout.math.list.IntArrayList; + +import java.util.Comparator; +/** + * Given some interval boundaries, partitions arrays such that all elements falling into an interval are placed next to each other. + *

+ * The algorithms partition arrays into two or more intervals. + * They distinguish between synchronously partitioning either one, two or three arrays. + * They further come in templated versions, either partitioning int[] arrays or double[] arrays. + *

+ * You may want to start out reading about the simplest case: Partitioning one int[] array into two intervals. + * To do so, read {@link #partition(int[],int,int,int)}. + * + * Next, building upon that foundation comes a method partitioning int[] arrays into multiple intervals. + * See {@link #partition(int[],int,int,int[],int,int,int[])} for related documentation. + *

+ * All other methods are no different than the one's you now already understand, except that they operate on slightly different data types. + *

+ * Performance + *

+ * Partitioning into two intervals is O( N ). + * Partitioning into k intervals is O( N * log(k)). + * Constants factors are minimized. + * No temporary memory is allocated; Partitioning is in-place. + * + * @see org.apache.mahout.math.matrix.doublealgo.Partitioning + * + */ + +/** @deprecated until unit tests are in place. Until this time, this class/interface is unsupported. */ +@Deprecated +public class Partitioning { + + private static final int SMALL = 7; + private static final int MEDIUM = 40; + + // benchmark only + protected static int steps = 0; + //public static int swappedElements = 0; + + /** Makes this class non instantiable, but still let's others inherit from it. */ + private Partitioning() { + } + + /** + * Finds the given key "a" within some generic data using the binary search algorithm. + * + * @param a the index of the key to search for. + * @param from the leftmost search position, inclusive. + * @param to the rightmost search position, inclusive. + * @param comp the comparator determining the order of the generic data. Takes as first argument the index a + * within the generic splitters s. Takes as second argument the index b within the + * generic data g. + * @return index of the search key, if it is contained in the list; otherwise, (-(insertion point) - + * 1). The insertion point is defined as the the point at which the value would be inserted into + * the list: the index of the first element greater than the key, or list.length, if all elements in + * the list are less than the specified key. Note that this guarantees that the return value will be >= 0 + * if and only if the key is found. + */ + private static int binarySearchFromTo(int a, int from, int to, IntComparator comp) { + while (from <= to) { + int mid = (from + to) / 2; + int comparison = comp.compare(mid, a); + if (comparison < 0) { + from = mid + 1; + } else if (comparison > 0) { + to = mid - 1; + } else { + return mid; + } // key found + } + return -(from + 1); // key not found. + } + + /** + * Same as {@link #dualPartition(int[],int[],int,int,int[],int,int,int[])} except that it synchronously + * partitions double[] rather than int[] arrays. + */ + public static void dualPartition(double[] list, double[] secondary, int from, int to, double[] splitters, + int splitFrom, int splitTo, int[] splitIndexes) { + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(list, l, l + s, l + 2 * s); + m = med3(list, m - s, m, m + s); + n = med3(list, n - 2 * s, n - s, n); + } + m = med3(list, l, m, n); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = Sorting.binarySearchFromTo(splitters, list[m], splitFrom, splitTo); + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + double splitter = splitters[medianIndex]; // int, double --> template type dependent + + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + int splitIndex = dualPartition(list, secondary, from, to, splitter); + splitIndexes[medianIndex] = splitIndex; + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(splitter < splitters[i]))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(splitter > splitters[i]))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + dualPartition(list, secondary, from, splitIndex, splitters, splitFrom, medianIndex - 1, splitIndexes); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + dualPartition(list, secondary, splitIndex + 1, to, splitters, medianIndex + 1, splitTo, splitIndexes); + } + } + + /** + * Same as {@link #dualPartition(int[],int[],int,int,int)} except that it synchronously partitions + * double[] rather than int[] arrays. + */ + public static int dualPartition(double[] list, double[] secondary, int from, int to, double splitter) { + for (int i = from - 1; ++i <= to;) { + double element = list[i]; // int, double --> template type dependent + if (element < splitter) { + // swap x[i] with x[from] + list[i] = list[from]; + list[from] = element; + + element = secondary[i]; + secondary[i] = secondary[from]; + secondary[from++] = element; + } + } + return from - 1; + } + + /** + * Same as {@link #partition(int[],int,int,int[],int,int,int[])} except that this method synchronously + * partitions two arrays at the same time; both arrays are partially sorted according to the elements of the primary + * array. In other words, each time an element in the primary array is moved from index A to B, the correspoding + * element within the secondary array is also moved from index A to B.

Use cases:

Image having a large + * list of 2-dimensional points. If memory consumption and performance matter, it is a good idea to physically lay + * them out as two 1-dimensional arrays (using something like Point2D objects would be prohibitively + * expensive, both in terms of time and space). Now imagine wanting to histogram the points. We may want to partially + * sort the points by x-coordinate into intervals. This method efficiently does the job.

Performance:

+ * Same as for single-partition methods. + */ + public static void dualPartition(int[] list, int[] secondary, int from, int to, int[] splitters, int splitFrom, + int splitTo, int[] splitIndexes) { + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(list, l, l + s, l + 2 * s); + m = med3(list, m - s, m, m + s); + n = med3(list, n - 2 * s, n - s, n); + } + m = med3(list, l, m, n); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = Sorting.binarySearchFromTo(splitters, list[m], splitFrom, splitTo); + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + int splitter = splitters[medianIndex]; // int, double --> template type dependent + + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + int splitIndex = dualPartition(list, secondary, from, to, splitter); + splitIndexes[medianIndex] = splitIndex; + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(splitter < splitters[i]))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(splitter > splitters[i]))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + dualPartition(list, secondary, from, splitIndex, splitters, splitFrom, medianIndex - 1, splitIndexes); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + dualPartition(list, secondary, splitIndex + 1, to, splitters, medianIndex + 1, splitTo, splitIndexes); + } + } + + /** + * Same as {@link #partition(int[],int,int,int)} except that this method synchronously partitions two arrays at + * the same time; both arrays are partially sorted according to the elements of the primary array. In other words, + * each time an element in the primary array is moved from index A to B, the correspoding element within the secondary + * array is also moved from index A to B.

Performance:

Same as for single-partition methods. + */ + public static int dualPartition(int[] list, int[] secondary, int from, int to, int splitter) { + for (int i = from - 1; ++i <= to;) { + int element = list[i]; // int, double --> template type dependent + if (element < splitter) { + // swap x[i] with x[from] + list[i] = list[from]; + list[from] = element; + + element = secondary[i]; + secondary[i] = secondary[from]; + secondary[from++] = element; + } + } + return from - 1; + } + + /** + * Same as {@link #partition(int[],int,int,int[],int,int,int[])} except that it generically partitions + * arbitrary shaped data (for example matrices or multiple arrays) rather than int[] arrays.

This method + * operates on arbitrary shaped data and arbitrary shaped splitters. In fact, it has no idea what kind of data by what + * kind of splitters it is partitioning. Comparisons and swapping are delegated to user provided objects which know + * their data and can do the job.

Lets call the generic data g (it may be a matrix, one array, three + * linked lists or whatever). Lets call the generic splitters s. This class takes a user comparison function + * operating on two indexes (a,b), namely an {@link IntComparator}. The comparison function determines + * whether s[a] is equal, less or greater than g[b]. This method can then decide to swap the data + * g[b] with the data g[c] (yes, c, not a). It calls a user provided {@link + * org.apache.mahout.math.Swapper} object that knows how to swap the data of these two indexes.

Again, note the + * details: Comparisons compare s[a] with g[b]. Swaps swap g[b] with g[c]. Prior + * to calling this method, the generic splitters s must be sorted ascending and must not contain multiple + * equal values. These preconditions are not checked; be sure that they are met. + * + * @param from the index of the first element within g to be considered. + * @param to the index of the last element within g to be considered. The method considers the + * elements g[from] .. g[to]. + * @param splitFrom the index of the first splitter element to be considered. + * @param splitTo the index of the last splitter element to be considered. The method considers the splitter + * elements s[splitFrom] .. s[splitTo]. + * @param splitIndexes a list into which this method fills the indexes of elements delimiting intervals. Upon return + * splitIndexes[splitFrom..splitTo] will be set accordingly. Therefore, must satisfy + * splitIndexes.length > splitTo. + * @param comp the comparator comparing a splitter with an element of the generic data. Takes as first + * argument the index a within the generic splitters s. Takes as second argument + * the index b within the generic data g. + * @param comp2 the comparator to determine the order of the generic data. Takes as first argument the index + * a within the generic data g. Takes as second argument the index b + * within the generic data g. + * @param comp3 the comparator comparing a splitter with another splitter. Takes as first argument the index + * a within the generic splitters s. Takes as second argument the index + * b within the generic splitters g. + * @param swapper an object that knows how to swap the elements at any two indexes (a,b). Takes as first argument + * the index b within the generic data g. Takes as second argument the index + * c within the generic data g. + * + *

Tip: Normally you will have splitIndexes.length == s.length as well as + * from==0, to==g.length-1 and splitFrom==0, splitTo==s.length-1. + * @see Sorting#binarySearchFromTo(int,int,IntComparator) + */ + public static void genericPartition(int from, int to, int splitFrom, int splitTo, int[] splitIndexes, + IntComparator comp, IntComparator comp2, IntComparator comp3, Swapper swapper) { + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(l, l + s, l + 2 * s, comp2); + m = med3(m - s, m, m + s, comp2); + n = med3(n - 2 * s, n - s, n, comp2); + } + m = med3(l, m, n, comp2); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = binarySearchFromTo(m, splitFrom, splitTo, comp); + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + int splitter = medianIndex; // int, double --> template type dependent + + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + int splitIndex = genericPartition(from, to, splitter, comp, swapper); + splitIndexes[medianIndex] = splitIndex; + + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(comp3.compare(splitter, i) < 0))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(comp3.compare(splitter, i) > 0))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + genericPartition(from, splitIndex, splitFrom, medianIndex - 1, splitIndexes, comp, comp2, comp3, swapper); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + genericPartition(splitIndex + 1, to, medianIndex + 1, splitTo, splitIndexes, comp, comp2, comp3, swapper); + } + } + + /** + * Same as {@link #partition(int[],int,int,int)} except that it generically partitions arbitrary shaped data + * (for example matrices or multiple arrays) rather than int[] arrays. + */ + private static int genericPartition(int from, int to, int splitter, IntComparator comp, Swapper swapper) { + for (int i = from - 1; ++i <= to;) { + if (comp.compare(splitter, i) > 0) { + // swap x[i] with x[from] + swapper.swap(i, from); + from++; + } + } + return from - 1; + } + + /** Returns the index of the median of the three indexed elements. */ + private static int med3(double[] x, int a, int b, int c) { + return (x[a] < x[b] ? + (x[b] < x[c] ? b : x[a] < x[c] ? c : a) : + (x[b] > x[c] ? b : x[a] > x[c] ? c : a)); + } + + /** Returns the index of the median of the three indexed elements. */ + private static int med3(int[] x, int a, int b, int c) { + return (x[a] < x[b] ? + (x[b] < x[c] ? b : x[a] < x[c] ? c : a) : + (x[b] > x[c] ? b : x[a] > x[c] ? c : a)); + } + + /** Returns the index of the median of the three indexed chars. */ + private static int med3(Object[] x, int a, int b, int c, Comparator comp) { + int ab = comp.compare(x[a], x[b]); + int ac = comp.compare(x[a], x[c]); + int bc = comp.compare(x[b], x[c]); + return (ab < 0 ? + (bc < 0 ? b : ac < 0 ? c : a) : + (bc > 0 ? b : ac > 0 ? c : a)); + } + + /** Returns the index of the median of the three indexed chars. */ + private static int med3(int a, int b, int c, IntComparator comp) { + int ab = comp.compare(a, b); + int ac = comp.compare(a, c); + int bc = comp.compare(b, c); + return (ab < 0 ? + (bc < 0 ? b : ac < 0 ? c : a) : + (bc > 0 ? b : ac > 0 ? c : a)); + } + + /** + * Same as {@link #partition(int[],int,int,int[],int,int,int[])} except that it partitions double[] rather + * than int[] arrays. + */ + public static void partition(double[] list, int from, int to, double[] splitters, int splitFrom, int splitTo, + int[] splitIndexes) { + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(list, l, l + s, l + 2 * s); + m = med3(list, m - s, m, m + s); + n = med3(list, n - 2 * s, n - s, n); + } + m = med3(list, l, m, n); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = Sorting.binarySearchFromTo(splitters, list[m], splitFrom, splitTo); + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + double splitter = splitters[medianIndex]; // int, double --> template type dependent + + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + int splitIndex = partition(list, from, to, splitter); + splitIndexes[medianIndex] = splitIndex; + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(splitter < splitters[i]))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(splitter > splitters[i]))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + partition(list, from, splitIndex, splitters, splitFrom, medianIndex - 1, splitIndexes); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + partition(list, splitIndex + 1, to, splitters, medianIndex + 1, splitTo, splitIndexes); + } + } + + /** + * Same as {@link #partition(int[],int,int,int)} except that it partitions double[] rather than + * int[] arrays. + */ + public static int partition(double[] list, int from, int to, double splitter) { + for (int i = from - 1; ++i <= to;) { + double element = list[i]; // int, double --> template type dependent + if (element < splitter) { + // swap x[i] with x[from] + list[i] = list[from]; + list[from++] = element; + } + } + return from - 1; + } + + /** + * Partitions (partially sorts) the given list such that all elements falling into some intervals are placed next to + * each other. Returns the indexes of elements delimiting intervals.

Example:

list = (7, 4, 5, 50, + * 6, 4, 3, 6), splitters = (5, 10, 30) defines the three intervals [-infinity,5), [5,10), [10,30). Lets + * define to sort the entire list (from=0, to=7) using all splitters (splitFrom==0, splitTo=2).

+ * The method modifies the list to be list = (4, 4, 3, 6, 7, 5, 6, 50) and returns the splitIndexes = (2, + * 6, 6). In other words,

  • All values list[0..2] fall into [-infinity,5).
  • All + * values list[3..6] fall into [5,10).
  • All values list[7..6] fall into + * [10,30), i.e. no elements, since 7>6.
  • All values list[7 .. 7=list.length-1] fall + * into [30,infinity].
  • In general, all values list[splitIndexes[j-1]+1 .. splitIndexes[j]] fall + * into interval j.
As can be seen, the list is partially sorted such that values falling into a + * certain interval are placed next to each other. Note that within an interval, elements are entirelly + * unsorted. They are only sorted across interval boundaries. In particular, this partitioning algorithm is not + * stable: the relative order of elements is not preserved (Producing a stable algorithm would require no more + * than minor modifications to method partition(int[],int,int,int)).

More formally, this method guarantees that + * upon return for all j = splitFrom .. splitTo there holds:
for all i = splitIndexes[j-1]+1 .. + * splitIndexes[j]: splitters[j-1] <= list[i] < splitters[j].

Performance:

Let + * N=to-from+1 be the number of elements to be partitioned. Let k=splitTo-splitFrom+1 be the number + * of splitter elements. Then we have the following time complexities

  • Worst case: O( N * log(k) ). + *
  • Average case: O( N * log(k) ).
  • Best case: O( N ). In general, the more uniform (skewed) + * the data is spread across intervals, the more performance approaches the worst (best) case. If no elements fall + * into the given intervals, running time is linear.
No temporary memory is allocated; the sort is in-place.

+ * Implementation:

The algorithm can be seen as a Bentley/McIlroy quicksort where swapping and insertion + * sort are omitted. It is designed to detect and take advantage of skew while maintaining good performance in the + * uniform case. + * + * @param list the list to be partially sorted. + * @param from the index of the first element within list to be considered. + * @param to the index of the last element within list to be considered. The method considers the + * elements list[from] .. list[to]. + * @param splitters the values at which the list shall be split into intervals. Must be sorted ascending and must + * not contain multiple identical values. These preconditions are not checked; be sure that they + * are met. + * @param splitFrom the index of the first splitter element to be considered. + * @param splitTo the index of the last splitter element to be considered. The method considers the splitter + * elements splitters[splitFrom] .. splitters[splitTo]. + * @param splitIndexes a list into which this method fills the indexes of elements delimiting intervals. Upon return + * splitIndexes[splitFrom..splitTo] will be set accordingly. Therefore, must satisfy + * splitIndexes.length > splitTo.

Tip: Normally you will have splitIndexes.length + * == splitters.length as well as from==0, to==list.length-1 and splitFrom==0, + * splitTo==splitters.length-1. + * @see org.apache.mahout.math.Arrays + * @see org.apache.mahout.math.GenericSorting + * @see java.util.Arrays + */ + public static void partition(int[] list, int from, int to, int[] splitters, int splitFrom, int splitTo, + int[] splitIndexes) { + //int element; // int, double --> template type dependent + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(list, l, l + s, l + 2 * s); + m = med3(list, m - s, m, m + s); + n = med3(list, n - 2 * s, n - s, n); + } + m = med3(list, l, m, n); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = Sorting.binarySearchFromTo(splitters, list[m], splitFrom, splitTo); + + //int key = list[m]; + /* + if (splitTo-splitFrom+1 < 5) { // on short lists linear search is quicker + int i=splitFrom-1; + while (++i <= splitTo && list[i] < key); + if (i > splitTo || list[i] > key) i = -i-1; // not found + medianIndex = i; + } + */ + //else { + /* + + int low = splitFrom; + int high = splitTo; + int comparison; + + int mid=0; + while (low <= high) { + mid = (low + high) / 2; + comparison = splitters[mid]-key; + if (comparison < 0) low = mid + 1; + else if (comparison > 0) high = mid - 1; + else break; //return mid; // key found + } + medianIndex = mid; + if (low > high) medianIndex = -(medianIndex + 1); // key not found. + //} + */ + + + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + int splitter = splitters[medianIndex]; + + //log.info("medianIndex="+medianIndex); + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + // Could simply call: + int splitIndex = partition(list, from, to, splitter); + // but for speed the code is manually inlined. + /* + steps += to-from+1; + int head = from; + for (int i=from-1; ++i<=to; ) { // swap all elements < splitter to front + element = list[i]; + if (element < splitter) { + list[i] = list[head]; + list[head++] = element; + //swappedElements++; + } + } + int splitIndex = head-1; + */ + + + //log.info("splitIndex="+splitIndex); + splitIndexes[medianIndex] = splitIndex; + + //if (splitFrom == splitTo) return; // done + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(splitter < splitters[i]))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(splitter > splitters[i]))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + //log.info("1.recursive: from="+from+", to="+splitIndex+", splitFrom="+splitFrom+", splitTo="+(medianIndex-1)); + partition(list, from, splitIndex, splitters, splitFrom, medianIndex - 1, splitIndexes); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + //log.info("2.recursive: from="+(splitIndex+1)+", to="+to+", splitFrom="+(medianIndex+1)+", splitTo="+splitTo); + partition(list, splitIndex + 1, to, splitters, medianIndex + 1, splitTo, splitIndexes); + } + //log.info("BACK TRACKING\n\n"); + } + + /** + * Partitions (partially sorts) the given list such that all elements falling into the given interval are placed next + * to each other. Returns the index of the element delimiting the interval.

Example:

list = (7, 4, + * 5, 50, 6, 4, 3, 6), splitter = 5 defines the two intervals [-infinity,5), [5,+infinity].

The + * method modifies the list to be list = (4, 4, 3, 50, 6, 7, 5, 6) and returns the split index 2. In + * other words,

  • All values list[0..2] fall into [-infinity,5).
  • All values + * list[3=2+1 .. 7=list.length-1] fall into [5,+infinity].
As can be seen, the list is + * partially sorted such that values falling into a certain interval are placed next to each other. Note that + * within an interval, elements are entirelly unsorted. They are only sorted across interval boundaries. In + * particular, this partitioning algorithm is not stable.

More formally, this method guarantees that upon + * return there holds:

  • for all i = from .. returnValue: list[i] < splitter and
  • for all i = + * returnValue+1 .. list.length-1: !(list[i] < splitter).

Performance:

Let + * N=to-from+1 be the number of elements to be partially sorted. Then the time complexity is O( N ). + * No temporary memory is allocated; the sort is in-place. + * + *

+ * + * @param list the list to be partially sorted. + * @param from the index of the first element within list to be considered. + * @param to the index of the last element within list to be considered. The method considers the + * elements list[from] .. list[to]. + * @param splitter the value at which the list shall be split. + * @return the index of the largest element falling into the interval [-infinity,splitter), as seen after + * partitioning. + */ + public static int partition(int[] list, int from, int to, int splitter) { + steps += to - from + 1; + + /* + log.info(); + if (from<=to) { + log.info("SORT WORKING: from="+from+", to="+to+", splitter="+splitter); + } + else { + log.info("SORT WORKING: NOTHING TO DO."); + } + */ + + + // returns index of last element < splitter + + + /* + for (int i=from-1; ++i<=to; ) { + if (list[i] < splitter) { + int element = list[i]; + list[i] = list[from]; + list[from++] = element; + } + } + */ + + + for (int i = from - 1; ++i <= to;) { + int element = list[i]; + if (element < splitter) { + // swap x[i] with x[from] + list[i] = list[from]; + list[from++] = element; + //swappedElements++; + } + } + //if (from<=to) log.info("Swapped "+(head-from)+" elements"); + + + /* + //JAL: + int first = from; + int last = to+1; + --first; + while (true) { + while (++first < last && list[first] < splitter); + while (first < --last && !(list[last] < splitter)); + if (first >= last) return first-1; + int tmp = list[first]; + list[first] = list[last]; + list[last] = tmp; + } + */ + + + /* + log.info("splitter="+splitter); + log.info("before="+new IntArrayList(list)); + int head = from; + int trail = to; + int element; + while (head<=trail) { + head--; + while (++head < trail && list[head] < splitter); + + trail++; + while (--trail > head && list[trail] >= splitter); + + if (head != trail) { + element = list[head]; + list[head] = list[trail]; + list[trail] = element; + } + head++; + trail--; + log.info("after ="+new IntArrayList(list)+", head="+head); + } + */ + + + /* + //log.info("splitter="+splitter); + //log.info("before="+new IntArrayList(list)); + to++; + //int head = from; + int element; + //int oldHead; + while (--to >= from) { + element = list[to]; + if (element < splitter) { + from--; + while (++from < to && list[from] < splitter); + //if (head != to) { + list[to] = list[from]; + list[from++] = element; + //oldHead = list[head]; + //list[head] = element; + //list[i] = oldHead; + + //head++; + //} + //head++; + } + //log.info("after ="+new IntArrayList(list)+", head="+head); + } + */ + + /* + int i=from-1; + int head = from; + int trail = to; + while (++i <= trail) { + int element = list[i]; + if (element < splitter) { + if (head == i) head++; + else { + // swap list[i] with list[from] + int oldHead = list[head]; + int oldTrail = list[trail]; + list[head++] = element; + list[i--] = oldTrail; + list[trail--] = oldHead; + } + } + //log.info(new IntArrayList(list)); + + } + */ + + + return from - 1; + //return head-1; + } + + /** + * Same as {@link #partition(int[],int,int,int[],int,int,int[])} except that it partitions Object[] rather + * than int[] arrays. + */ + public static void partition(Object[] list, int from, int to, Object[] splitters, int splitFrom, int splitTo, + int[] splitIndexes, Comparator comp) { + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(list, l, l + s, l + 2 * s, comp); + m = med3(list, m - s, m, m + s, comp); + n = med3(list, n - 2 * s, n - s, n, comp); + } + m = med3(list, l, m, n, comp); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = Sorting.binarySearchFromTo(splitters, list[m], splitFrom, splitTo, comp); + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + Object splitter = splitters[medianIndex]; // int, double --> template type dependent + + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + int splitIndex = partition(list, from, to, splitter, comp); + splitIndexes[medianIndex] = splitIndex; + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(comp.compare(splitter, splitters[i]) < 0))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(comp.compare(splitter, splitters[i]) > 0))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + partition(list, from, splitIndex, splitters, splitFrom, medianIndex - 1, splitIndexes, comp); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + partition(list, splitIndex + 1, to, splitters, medianIndex + 1, splitTo, splitIndexes, comp); + } + } + + /** + * Same as {@link #partition(int[],int,int,int)} except that it synchronously partitions the objects of the + * given list by the order of the given comparator. + */ + public static int partition(Object[] list, int from, int to, Object splitter, Comparator comp) { + for (int i = from - 1; ++i <= to;) { + Object element = list[i]; // int, double --> template type dependent + if (comp.compare(element, splitter) < 0) { + // swap x[i] with x[from] + list[i] = list[from]; + list[from] = element; + from++; + } + } + return from - 1; + } + + /** + * Equivalent to partition(list.elements(), from, to, splitters.elements(), 0, splitters.size()-1, + * splitIndexes.elements()). + */ + public static void partition(DoubleArrayList list, int from, int to, DoubleArrayList splitters, + IntArrayList splitIndexes) { + partition(list.elements(), from, to, splitters.elements(), 0, splitters.size() - 1, splitIndexes.elements()); + } + + /** + * Equivalent to partition(list.elements(), from, to, splitters.elements(), 0, splitters.size()-1, + * splitIndexes.elements()). + */ + public static void partition(IntArrayList list, int from, int to, IntArrayList splitters, IntArrayList splitIndexes) { + partition(list.elements(), from, to, splitters.elements(), 0, splitters.size() - 1, splitIndexes.elements()); + } + + /** + * Same as {@link #triplePartition(int[],int[],int[],int,int,int[],int,int,int[])} except that it synchronously + * partitions double[] rather than int[] arrays. + */ + public static void triplePartition(double[] list, double[] secondary, double[] tertiary, int from, int to, + double[] splitters, int splitFrom, int splitTo, int[] splitIndexes) { + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(list, l, l + s, l + 2 * s); + m = med3(list, m - s, m, m + s); + n = med3(list, n - 2 * s, n - s, n); + } + m = med3(list, l, m, n); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = Sorting.binarySearchFromTo(splitters, list[m], splitFrom, splitTo); + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + double splitter = splitters[medianIndex]; // int, double --> template type dependent + + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + int splitIndex = triplePartition(list, secondary, tertiary, from, to, splitter); + splitIndexes[medianIndex] = splitIndex; + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(splitter < splitters[i]))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(splitter > splitters[i]))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + triplePartition(list, secondary, tertiary, from, splitIndex, splitters, splitFrom, medianIndex - 1, splitIndexes); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + triplePartition(list, secondary, tertiary, splitIndex + 1, to, splitters, medianIndex + 1, splitTo, splitIndexes); + } + } + + /** + * Same as {@link #triplePartition(int[],int[],int[],int,int,int)} except that it synchronously partitions + * double[] rather than int[] arrays. + */ + public static int triplePartition(double[] list, double[] secondary, double[] tertiary, int from, int to, + double splitter) { + for (int i = from - 1; ++i <= to;) { + double element = list[i]; // int, double --> template type dependent + if (element < splitter) { + // swap x[i] with x[from] + list[i] = list[from]; + list[from] = element; + + element = secondary[i]; + secondary[i] = secondary[from]; + secondary[from] = element; + + element = tertiary[i]; + tertiary[i] = tertiary[from]; + tertiary[from++] = element; + } + } + + return from - 1; + } + + /** + * Same as {@link #partition(int[],int,int,int[],int,int,int[])} except that this method synchronously + * partitions three arrays at the same time; all three arrays are partially sorted according to the elements of the + * primary array. In other words, each time an element in the primary array is moved from index A to B, the + * correspoding element within the secondary array as well as the corresponding element within the tertiary array are + * also moved from index A to B.

Use cases:

Image having a large list of 3-dimensional points. If + * memory consumption and performance matter, it is a good idea to physically lay them out as three 1-dimensional + * arrays (using something like Point3D objects would be prohibitively expensive, both in terms of time and + * space). Now imagine wanting to histogram the points. We may want to partially sort the points by x-coordinate into + * intervals. This method efficiently does the job.

Performance:

Same as for single-partition methods. + */ + public static void triplePartition(int[] list, int[] secondary, int[] tertiary, int from, int to, int[] splitters, + int splitFrom, int splitTo, int[] splitIndexes) { + + if (splitFrom > splitTo) { + return; + } // nothing to do + if (from > to) { // all bins are empty + from--; + for (int i = splitFrom; i <= splitTo;) { + splitIndexes[i++] = from; + } + return; + } + + // Choose a partition (pivot) index, m + // Ideally, the pivot should be the median, because a median splits a list into two equal sized sublists. + // However, computing the median is expensive, so we use an approximation. + int medianIndex; + if (splitFrom == splitTo) { // we don't really have a choice + medianIndex = splitFrom; + } else { // we do have a choice + int m = (from + to) / 2; // Small arrays, middle element + int len = to - from + 1; + if (len > SMALL) { + int l = from; + int n = to; + if (len > MEDIUM) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(list, l, l + s, l + 2 * s); + m = med3(list, m - s, m, m + s); + n = med3(list, n - 2 * s, n - s, n); + } + m = med3(list, l, m, n); // Mid-size, pseudomedian of 3 + } + + // Find the splitter closest to the pivot, i.e. the splitter that best splits the list into two equal sized sublists. + medianIndex = Sorting.binarySearchFromTo(splitters, list[m], splitFrom, splitTo); + if (medianIndex < 0) { + medianIndex = -medianIndex - 1; + } // not found + if (medianIndex > splitTo) { + medianIndex = splitTo; + } // not found, one past the end + + } + int splitter = splitters[medianIndex]; // int, double --> template type dependent + + // Partition the list according to the splitter, i.e. + // Establish invariant: list[i] < splitter <= list[j] for i=from..medianIndex and j=medianIndex+1 .. to + int splitIndex = triplePartition(list, secondary, tertiary, from, to, splitter); + splitIndexes[medianIndex] = splitIndex; + + // Optimization: Handle special cases to cut down recursions. + if (splitIndex < from) { // no element falls into this bin + // all bins with splitters[i] <= splitter are empty + int i = medianIndex - 1; + while (i >= splitFrom && (!(splitter < splitters[i]))) { + splitIndexes[i--] = splitIndex; + } + splitFrom = medianIndex + 1; + } else if (splitIndex >= to) { // all elements fall into this bin + // all bins with splitters[i] >= splitter are empty + int i = medianIndex + 1; + while (i <= splitTo && (!(splitter > splitters[i]))) { + splitIndexes[i++] = splitIndex; + } + splitTo = medianIndex - 1; + } + + // recursively partition left half + if (splitFrom <= medianIndex - 1) { + triplePartition(list, secondary, tertiary, from, splitIndex, splitters, splitFrom, medianIndex - 1, splitIndexes); + } + + // recursively partition right half + if (medianIndex + 1 <= splitTo) { + triplePartition(list, secondary, tertiary, splitIndex + 1, to, splitters, medianIndex + 1, splitTo, splitIndexes); + } + } + + /** + * Same as {@link #partition(int[],int,int,int)} except that this method synchronously partitions three arrays + * at the same time; all three arrays are partially sorted according to the elements of the primary array. In other + * words, each time an element in the primary array is moved from index A to B, the correspoding element within the + * secondary array as well as the corresponding element within the tertiary array are also moved from index A to B. + *

Performance:

Same as for single-partition methods. + */ + public static int triplePartition(int[] list, int[] secondary, int[] tertiary, int from, int to, int splitter) { + for (int i = from - 1; ++i <= to;) { + int element = list[i]; // int, double --> template type dependent + if (element < splitter) { + // swap x[i] with x[from] + list[i] = list[from]; + list[from] = element; + + element = secondary[i]; + secondary[i] = secondary[from]; + secondary[from] = element; + + element = tertiary[i]; + tertiary[i] = tertiary[from]; + tertiary[from++] = element; + } + } + + return from - 1; + } +} Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Partitioning.java ------------------------------------------------------------------------------ svn:eol-style = native