mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r943236 - in /lucene/mahout/trunk/core/src: main/java/org/apache/mahout/math/Varint.java main/java/org/apache/mahout/math/VectorWritable.java test/java/org/apache/mahout/math/VarintTest.java
Date Tue, 11 May 2010 19:22:23 GMT
Author: srowen
Date: Tue May 11 19:22:23 2010
New Revision: 943236

URL: http://svn.apache.org/viewvc?rev=943236&view=rev
Log:
MAHOUT-391

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java?rev=943236&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java Tue May 11 19:22:23
2010
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * <p>Encodes signed and unsigned values using a common variable-length
+ * scheme, found for example in
+ * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
+ * Google's Protocol Buffers</a>. It uses fewer bytes to encode smaller values,
+ * but will use slightly more bytes to encode large values.</p>
+ *
+ * <p>Signed values are further encoded using so-called zig-zag encoding
+ * in order to make them "compatible" with variable-length encoding.</p>
+ */
+public final class Varint {
+
+  public static final long MIN_SIGNED_VAR_LONG = -(1L << 62);
+  public static final long MAX_SIGNED_VAR_LONG = (1L << 62) - 1;
+  public static final int MIN_SIGNED_VAR_INT = -(1 << 30);
+  public static final int MAX_SIGNED_VAR_INT = (1 << 30) - 1;
+
+  private Varint() {
+  }
+
+  /**
+   * Encodes a value using the variable-length encoding from
+   * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
+   * Google Protocol Buffers</a>. It uses zig-zag encoding to efficiently
+   * encode signed values. If values are known to be nonnegative,
+   * {@link #writeUnsignedVarLong(long, DataOutput)} should be used.
+   *
+   * @param value value to encode
+   * @param out to write bytes to
+   * @throws IOException if {@link DataOutput} throws {@link IOException}
+   * @throws IllegalArgumentException if value is not between {@link #MIN_SIGNED_VAR_LONG}
+   *  and {@link #MAX_SIGNED_VAR_LONG} inclusive
+   */
+  public static void writeSignedVarLong(long value, DataOutput out) throws IOException {
+    if (value < MIN_SIGNED_VAR_LONG || value > MAX_SIGNED_VAR_LONG) {
+      throw new IllegalArgumentException("Can't encode value as signed: " + value);
+    }
+    // Great trick from http://code.google.com/apis/protocolbuffers/docs/encoding.html#types
+    writeUnsignedVarLong((value << 1) ^ (value >> 63), out);
+  }
+
+  /**
+   * Encodes a value using the variable-length encoding from
+   * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
+   * Google Protocol Buffers</a>. Zig-zag is not used, so input must not be negative.
+   * If values can be negative, use {@link #writeSignedVarLong(long, DataOutput)}
+   * instead.
+   *
+   * @param value value to encode
+   * @param out to write bytes to
+   * @throws IOException if {@link DataOutput} throws {@link IOException}
+   * @throws IllegalArgumentException if value is negative
+   */
+  public static void writeUnsignedVarLong(long value, DataOutput out) throws IOException
{
+    if (value < 0L) {
+      throw new IllegalArgumentException("Can't encode negative value: " + value);
+    }
+    while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) {
+      out.writeByte(((int) value & 0x7F) | 0x80);
+      value >>>= 7;
+    }
+    out.writeByte((int) value & 0x7F);
+  }
+
+  /**
+   * See {@link #writeSignedVarLong(long, DataOutput)}
+   */
+  public static void writeSignedVarInt(int value, DataOutput out) throws IOException {
+    if (value < MIN_SIGNED_VAR_INT || value > MAX_SIGNED_VAR_INT) {
+      throw new IllegalArgumentException("Can't encode value as signed: " + value);
+    }
+    // Great trick from http://code.google.com/apis/protocolbuffers/docs/encoding.html#types
+    writeUnsignedVarInt((value << 1) ^ (value >> 31), out);
+  }
+
+  /**
+   * See {@link #writeUnsignedVarLong(long, DataOutput)}
+   */
+  public static void writeUnsignedVarInt(int value, DataOutput out) throws IOException {
+    if (value < 0) {
+      throw new IllegalArgumentException("Can't encode negative value: " + value);
+    }
+    while ((value & 0xFFFFFF80) != 0L) {
+      out.writeByte((value & 0x7F) | 0x80);
+      value >>>= 7;
+    }
+    out.writeByte(value & 0x7F);
+  }
+
+  /**
+   * See {@link #writeSignedVarLong(long, DataOutput)}.
+   *
+   * @param in to read bytes from
+   * @return decode value
+   * @throws IOException if {@link DataInput} throws {@link IOException}
+   * @throws IllegalArgumentException if variable-length value does not terminate
+   *  after 8 bytes have been read
+   */
+  public static long readSignedVarLong(DataInput in) throws IOException {
+    long raw = readUnsignedVarLong(in);
+    // This undoes the trick in writeSignedVarLong()
+    return (((raw << 63) >> 63) ^ raw) >> 1;
+  }
+
+  /**
+   * See {@link #writeUnsignedVarLong(long, DataOutput)}.
+   *
+   * @param in to read bytes from
+   * @return decode value
+   * @throws IOException if {@link DataInput} throws {@link IOException}
+   * @throws IllegalArgumentException if variable-length value does not terminate
+   *  after 8 bytes have been read
+   */
+  public static long readUnsignedVarLong(DataInput in) throws IOException {
+    long value = 0L;
+    int i = 0;
+    long b;
+    while (((b = in.readByte()) & 0x80L) != 0) {
+      value |= (b & 0x7F) << i;
+      i += 7;
+      if (i > 56) {
+        throw new IllegalArgumentException("Variable length quantity is too long");
+      }
+    }
+    return value | (b << i);
+  }
+
+  /**
+   * See {@link #readSignedVarLong(DataInput)}
+   */
+  public static int readSignedVarInt(DataInput in) throws IOException {
+    int raw = readUnsignedVarInt(in);
+    // This undoes the trick in writeSignedVarInt()
+    return (((raw << 31) >> 31) ^ raw) >> 1;
+  }
+
+  /**
+   * See {@link #readUnsignedVarLong(DataInput)}
+   */
+  public static int readUnsignedVarInt(DataInput in) throws IOException {
+    int value = 0;
+    int i = 0;
+    int b;
+    while (((b = in.readByte()) & 0x80) != 0) {
+      value |= (b & 0x7F) << i;
+      i += 7;
+      if (i > 28) {
+        throw new IllegalArgumentException("Variable length quantity is too long");
+      }
+    }
+    return value | (b << i);
+  }
+
+}

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java?rev=943236&r1=943235&r2=943236&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java Tue
May 11 19:22:23 2010
@@ -81,8 +81,8 @@ public class VectorWritable extends Conf
                   (named ? FLAG_NAMED : 0) |
                   (writesLaxPrecision ? FLAG_LAX_PRECISION : 0));
 
+    Varint.writeUnsignedVarInt(vector.size(), out);
     if (dense) {
-      out.writeInt(vector.size());
       for (Vector.Element element : vector) {
         if (writesLaxPrecision) {
           out.writeFloat((float) element.get());
@@ -91,16 +91,31 @@ public class VectorWritable extends Conf
         }
       }
     } else {
-      out.writeInt(vector.size());
-      out.writeInt(vector.getNumNondefaultElements());
+      Varint.writeUnsignedVarInt(vector.getNumNondefaultElements(), out);
       Iterator<Vector.Element> iter = vector.iterateNonZero();
-      while (iter.hasNext()) {
-        Vector.Element element = iter.next();
-        out.writeInt(element.index());
-        if (writesLaxPrecision) {
-          out.writeFloat((float) element.get());
-        } else {
-          out.writeDouble(element.get());
+      if (sequential) {
+        int lastIndex = 0;
+        while (iter.hasNext()) {
+          Vector.Element element = iter.next();
+          int thisIndex = element.index();
+          // Delta-code indices:
+          Varint.writeUnsignedVarInt(thisIndex - lastIndex, out);
+          lastIndex = thisIndex;
+          if (writesLaxPrecision) {
+            out.writeFloat((float) element.get());
+          } else {
+            out.writeDouble(element.get());
+          }
+        }
+      } else {
+        while (iter.hasNext()) {
+          Vector.Element element = iter.next();
+          Varint.writeUnsignedVarInt(element.index(), out);
+          if (writesLaxPrecision) {
+            out.writeFloat((float) element.get());
+          } else {
+            out.writeDouble(element.get());
+          }
         }
       }
     }
@@ -120,26 +135,34 @@ public class VectorWritable extends Conf
     boolean named = (flags & FLAG_NAMED) != 0;
     boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0;
 
+    int size = Varint.readUnsignedVarInt(in);
     Vector v;
     if (dense) {
-      int size = in.readInt();
       double[] values = new double[size];
       for (int i = 0; i < size; i++) {
         values[i] = laxPrecision ? in.readFloat() : in.readDouble();
       }
       v = new DenseVector(values);
     } else {
-      int size = in.readInt();
-      int numNonDefaultElements = in.readInt();
+      int numNonDefaultElements = Varint.readUnsignedVarInt(in);
+      v = sequential ?
+          new SequentialAccessSparseVector(size, numNonDefaultElements) :
+          new RandomAccessSparseVector(size, numNonDefaultElements);
       if (sequential) {
-        v = new SequentialAccessSparseVector(size, numNonDefaultElements);
+        int lastIndex = 0;
+        for (int i = 0; i < numNonDefaultElements; i++) {
+          int delta = Varint.readUnsignedVarInt(in);
+          int index = lastIndex + delta;
+          lastIndex = index;
+          double value = laxPrecision ? in.readFloat() : in.readDouble();
+          v.setQuick(index, value);
+        }
       } else {
-        v = new RandomAccessSparseVector(size, numNonDefaultElements);
-      }
-      for (int i = 0; i < numNonDefaultElements; i++) {
-        int index = in.readInt();
-        double value = laxPrecision ? in.readFloat() : in.readDouble();
-        v.setQuick(index, value);
+        for (int i = 0; i < numNonDefaultElements; i++) {
+          int index = Varint.readUnsignedVarInt(in);
+          double value = laxPrecision ? in.readFloat() : in.readDouble();
+          v.setQuick(index, value);
+        }
       }
     }
     if (named) {

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java?rev=943236&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java Tue May
11 19:22:23 2010
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+/**
+ * Tests {@link Varint}.
+ */
+public final class VarintTest extends MahoutTestCase {
+
+  public void testUnsignedLong() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    Varint.writeUnsignedVarLong(0L, out);
+    for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) {
+      Varint.writeUnsignedVarLong(i-1, out);
+      Varint.writeUnsignedVarLong(i, out);
+    }
+    Varint.writeUnsignedVarLong(Long.MAX_VALUE, out);
+
+    DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
+    assertEquals(0L, Varint.readUnsignedVarLong(in));
+    for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) {
+      assertEquals(i-1, Varint.readUnsignedVarLong(in));
+      assertEquals(i, Varint.readUnsignedVarLong(in));
+    }
+    assertEquals(Long.MAX_VALUE, Varint.readUnsignedVarLong(in));
+  }
+
+  public void testSignedPositiveLong() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    Varint.writeSignedVarLong(0L, out);
+    for (long i = 1L; i <= (1L << 61); i <<= 1) {
+      Varint.writeSignedVarLong(i-1, out);
+      Varint.writeSignedVarLong(i, out);
+    }
+    Varint.writeSignedVarLong((1L << 62) - 1, out);
+
+    DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
+    assertEquals(0L, Varint.readSignedVarLong(in));
+    for (long i = 1L; i <= (1L << 61); i <<= 1) {
+      assertEquals(i-1, Varint.readSignedVarLong(in));
+      assertEquals(i, Varint.readSignedVarLong(in));
+    }
+    assertEquals((1L << 62) - 1, Varint.readSignedVarLong(in));
+  }
+
+  public void testSignedNegativeLong() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    for (long i = -1L; i >= -(1L << 62); i <<= 1) {
+      Varint.writeSignedVarLong(i, out);
+      Varint.writeSignedVarLong(i+1, out);
+    }
+    DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
+    for (long i = -1L; i >= -(1L << 62); i <<= 1) {
+      assertEquals(i, Varint.readSignedVarLong(in));
+      assertEquals(i+1, Varint.readSignedVarLong(in));
+    }
+  }
+
+  public void testUnsignedInt() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    Varint.writeUnsignedVarInt(0, out);
+    for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) {
+      Varint.writeUnsignedVarLong(i-1, out);
+      Varint.writeUnsignedVarLong(i, out);
+    }
+    Varint.writeUnsignedVarLong(Integer.MAX_VALUE, out);
+
+    DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
+    assertEquals(0, Varint.readUnsignedVarInt(in));
+    for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) {
+      assertEquals(i-1, Varint.readUnsignedVarInt(in));
+      assertEquals(i, Varint.readUnsignedVarInt(in));
+    }
+    assertEquals(Integer.MAX_VALUE, Varint.readUnsignedVarInt(in));
+  }
+
+  public void testSignedPositiveInt() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    Varint.writeSignedVarInt(0, out);
+    for (int i = 1; i <= (1 << 29); i <<= 1) {
+      Varint.writeSignedVarLong(i-1, out);
+      Varint.writeSignedVarLong(i, out);
+    }
+    Varint.writeSignedVarInt((1 << 30) - 1, out);
+
+    DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
+    assertEquals(0, Varint.readSignedVarInt(in));
+    for (int i = 1; i <= (1 << 29); i <<= 1) {
+      assertEquals(i-1, Varint.readSignedVarInt(in));
+      assertEquals(i, Varint.readSignedVarInt(in));
+    }
+    assertEquals((1 << 30) - 1, Varint.readSignedVarInt(in));
+  }
+
+  public void testSignedNegativeInt() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    for (int i = -1; i >= -(1 << 30); i <<= 1) {
+      Varint.writeSignedVarInt(i, out);
+      Varint.writeSignedVarInt(i+1, out);
+    }
+    DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray()));
+    for (int i = -1; i >= -(1 << 30); i <<= 1) {
+      assertEquals(i, Varint.readSignedVarInt(in));
+      assertEquals(i+1, Varint.readSignedVarInt(in));
+    }
+  }
+
+  public void testUnsignedSize() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    int expectedSize = 0;
+    for (int exponent = 0; exponent <= 62; exponent++) {
+      Varint.writeUnsignedVarLong(1L << exponent, out);
+      expectedSize += 1 + exponent / 7;
+      assertEquals(expectedSize, baos.size());
+    }
+  }
+
+  public void testSignedSize() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutput out = new DataOutputStream(baos);
+    int expectedSize = 0;
+    for (int exponent = 0; exponent <= 61; exponent++) {
+      Varint.writeSignedVarLong(1L << exponent, out);
+      expectedSize += 1 + ((exponent + 1) / 7);
+      assertEquals(expectedSize, baos.size());
+    }
+    for (int exponent = 0; exponent <= 61; exponent++) {
+      Varint.writeSignedVarLong(-(1L << exponent)-1, out);
+      expectedSize += 1 + ((exponent + 1) / 7);
+      assertEquals(expectedSize, baos.size());
+    }
+  }
+
+  public void testExceptions() throws IOException {
+    try {
+      Varint.writeUnsignedVarLong(-1L, null);
+      fail();
+    } catch (IllegalArgumentException iae) {
+      // good
+    }
+    try {
+      Varint.writeSignedVarLong(Varint.MIN_SIGNED_VAR_LONG - 1, null);
+      fail();
+    } catch (IllegalArgumentException iae) {
+      // good
+    }
+    try {
+      Varint.writeSignedVarLong(Varint.MAX_SIGNED_VAR_LONG + 1, null);
+      fail();
+    } catch (IllegalArgumentException iae) {
+      // good
+    }
+    try {
+      Varint.writeUnsignedVarInt(-1, null);
+      fail();
+    } catch (IllegalArgumentException iae) {
+      // good
+    }
+    try {
+      Varint.writeSignedVarInt(Varint.MIN_SIGNED_VAR_INT - 1, null);
+      fail();
+    } catch (IllegalArgumentException iae) {
+      // good
+    }
+    try {
+      Varint.writeSignedVarInt(Varint.MAX_SIGNED_VAR_INT + 1, null);
+      fail();
+    } catch (IllegalArgumentException iae) {
+      // good
+    }
+  }
+
+}



Mime
View raw message