avro-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cutt...@apache.org
Subject svn commit: r896985 - in /hadoop/avro/trunk: CHANGES.txt src/py/avro/io.py src/test/py/test_io.py
Date Thu, 07 Jan 2010 19:51:49 GMT
Author: cutting
Date: Thu Jan  7 19:51:49 2010
New Revision: 896985

URL: http://svn.apache.org/viewvc?rev=896985&view=rev
Log:
AVRO-292.  Fix Python skipping of ints and longs.  Contributed by Jeff Hammerbacher.

Modified:
    hadoop/avro/trunk/CHANGES.txt
    hadoop/avro/trunk/src/py/avro/io.py
    hadoop/avro/trunk/src/test/py/test_io.py

Modified: hadoop/avro/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/CHANGES.txt?rev=896985&r1=896984&r2=896985&view=diff
==============================================================================
--- hadoop/avro/trunk/CHANGES.txt (original)
+++ hadoop/avro/trunk/CHANGES.txt Thu Jan  7 19:51:49 2010
@@ -247,6 +247,10 @@
 
     AVRO-280. Fix file header schema in specification.  Also fix
     "forrestdoc" build target to work on clean checkout.
+    (Jeff Hammerbacher & cutting)	 
+
+    AVRO-292. Fix Python skipping of ints and longs.
+    (Jeff Hammerbacher via cutting)
 
 Avro 1.2.0 (14 October 2009)
 

Modified: hadoop/avro/trunk/src/py/avro/io.py
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/src/py/avro/io.py?rev=896985&r1=896984&r2=896985&view=diff
==============================================================================
--- hadoop/avro/trunk/src/py/avro/io.py (original)
+++ hadoop/avro/trunk/src/py/avro/io.py Thu Jan  7 19:51:49 2010
@@ -129,6 +129,12 @@
   # read-only properties
   reader = property(lambda self: self._reader)
 
+  def read(self, n):
+    """
+    Read n bytes.
+    """
+    return self.reader.read(n)
+
   def read_null(self):
     """
     null is written as zero bytes
@@ -140,7 +146,7 @@
     a boolean is written as a single byte 
     whose value is either 0 (false) or 1 (true).
     """
-    return ord(self.reader.read(1)) == 1
+    return ord(self.read(1)) == 1
 
   def read_int(self):
     """
@@ -152,11 +158,11 @@
     """
     int and long values are written using variable-length, zig-zag coding.
     """
-    b = ord(self.reader.read(1))
+    b = ord(self.read(1))
     n = b & 0x7F
     shift = 7
     while (b & 0x80) != 0:
-      b = ord(self.reader.read(1))
+      b = ord(self.read(1))
       n |= (b & 0x7F) << shift
       shift += 7
     datum = (n >> 1) ^ -(n & 1)
@@ -168,10 +174,10 @@
     The float is converted into a 32-bit integer using a method equivalent to
     Java's floatToIntBits and then encoded in little-endian format.
     """
-    bits = (((ord(self.reader.read(1)) & 0xffL)) |
-      ((ord(self.reader.read(1)) & 0xffL) <<  8) |
-      ((ord(self.reader.read(1)) & 0xffL) << 16) |
-      ((ord(self.reader.read(1)) & 0xffL) << 24))
+    bits = (((ord(self.read(1)) & 0xffL)) |
+      ((ord(self.read(1)) & 0xffL) <<  8) |
+      ((ord(self.read(1)) & 0xffL) << 16) |
+      ((ord(self.read(1)) & 0xffL) << 24))
     return STRUCT_FLOAT.unpack(STRUCT_INT.pack(bits))[0]
 
   def read_double(self):
@@ -180,14 +186,14 @@
     The double is converted into a 64-bit integer using a method equivalent to
     Java's doubleToLongBits and then encoded in little-endian format.
     """
-    bits = (((ord(self.reader.read(1)) & 0xffL)) |
-      ((ord(self.reader.read(1)) & 0xffL) <<  8) |
-      ((ord(self.reader.read(1)) & 0xffL) << 16) |
-      ((ord(self.reader.read(1)) & 0xffL) << 24) |
-      ((ord(self.reader.read(1)) & 0xffL) << 32) |
-      ((ord(self.reader.read(1)) & 0xffL) << 40) |
-      ((ord(self.reader.read(1)) & 0xffL) << 48) |
-      ((ord(self.reader.read(1)) & 0xffL) << 56))
+    bits = (((ord(self.read(1)) & 0xffL)) |
+      ((ord(self.read(1)) & 0xffL) <<  8) |
+      ((ord(self.read(1)) & 0xffL) << 16) |
+      ((ord(self.read(1)) & 0xffL) << 24) |
+      ((ord(self.read(1)) & 0xffL) << 32) |
+      ((ord(self.read(1)) & 0xffL) << 40) |
+      ((ord(self.read(1)) & 0xffL) << 48) |
+      ((ord(self.read(1)) & 0xffL) << 56))
     return STRUCT_DOUBLE.unpack(STRUCT_LONG.pack(bits))[0]
 
   def read_bytes(self):
@@ -203,25 +209,19 @@
     """
     return unicode(self.read_bytes(), "utf-8")
 
-  def read(self, n):
-    """
-    Read n bytes.
-    """
-    return struct.unpack('%ds' % n, self.reader.read(n))[0]
-
   def skip_null(self):
     pass
 
   def skip_boolean(self):
     self.skip(1)
 
-  # TODO(hammer): I thought ints were VLE?
   def skip_int(self):
-    self.skip(4)
+    self.skip_long()
 
-  # TODO(hammer): I thought longs were VLE?
   def skip_long(self):
-    self.skip(8)
+    b = ord(self.read(1))
+    while (b & 0x80) != 0:
+      b = ord(self.read(1))
 
   def skip_float(self):
     self.skip(4)

Modified: hadoop/avro/trunk/src/test/py/test_io.py
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/src/test/py/test_io.py?rev=896985&r1=896984&r2=896985&view=diff
==============================================================================
--- hadoop/avro/trunk/src/test/py/test_io.py (original)
+++ hadoop/avro/trunk/src/test/py/test_io.py Thu Jan  7 19:51:49 2010
@@ -15,6 +15,7 @@
 # limitations under the License.
 import unittest
 import cStringIO
+from binascii import hexlify
 from avro import schema
 from avro import io
 
@@ -49,6 +50,28 @@
    """, {'value': {'car': {'value': 'head'}, 'cdr': {'value': None}}}),
 )
 
+BINARY_INT_ENCODINGS = (
+  (0, '00'),
+  (-1, '01'),
+  (1, '02'),
+  (-2, '03'),
+  (2, '04'),
+  (-64, '7f'),
+  (64, '80 01'),
+  (8192, '80 80 01'),
+  (-8193, '81 80 01'),
+)
+
+def avro_hexlify(reader):
+  """Return the hex value, as a string, of a binary-encoded int or long."""
+  bytes = []
+  current_byte = reader.read(1)
+  bytes.append(hexlify(current_byte))
+  while (ord(current_byte) & 0x80) != 0:
+    current_byte = reader.read(1)
+    bytes.append(hexlify(current_byte))
+  return ' '.join(bytes)
+
 class TestIO(unittest.TestCase):
   def test_validate(self):
     print ''
@@ -150,5 +173,121 @@
       print ''
     self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE))
 
+  def test_binary_int_encoding(self):
+    print ''
+    print 'TEST BINARY INT ENCODING'
+    print '========================'
+    print ''
+    correct = 0
+    for value, hex_encoding in BINARY_INT_ENCODINGS:
+      print 'Value: %d' % value
+      print 'Correct Encoding: %s' % hex_encoding
+
+      # write datum in binary to string buffer
+      buffer = cStringIO.StringIO()
+      encoder = io.BinaryEncoder(buffer)
+      datum_writer = io.DatumWriter(schema.parse('"int"'))
+      datum_writer.write(value, encoder)
+
+      # read it out of the buffer and hexlify it
+      buffer.seek(0)
+      hex_val = avro_hexlify(buffer)
+
+      # check it
+      print 'Read Encoding: %s' % hex_val
+      if hex_encoding == hex_val: correct += 1
+      print ''
+    self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
+  def test_binary_long_encoding(self):
+    print ''
+    print 'TEST BINARY LONG ENCODING'
+    print '========================='
+    print ''
+    correct = 0
+    for value, hex_encoding in BINARY_INT_ENCODINGS:
+      print 'Value: %d' % value
+      print 'Correct Encoding: %s' % hex_encoding
+
+      # write datum in binary to string buffer
+      buffer = cStringIO.StringIO()
+      encoder = io.BinaryEncoder(buffer)
+      datum_writer = io.DatumWriter(schema.parse('"long"'))
+      datum_writer.write(value, encoder)
+
+      # read it out of the buffer and hexlify it
+      buffer.seek(0)
+      hex_val = avro_hexlify(buffer)
+
+      # check it
+      print 'Read Encoding: %s' % hex_val
+      if hex_encoding == hex_val: correct += 1
+      print ''
+    self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
+  def test_skip_long(self):
+    print ''
+    print 'TEST SKIP LONG'
+    print '=============='
+    print ''
+    correct = 0
+    for value_to_skip, hex_encoding in BINARY_INT_ENCODINGS:
+      VALUE_TO_READ = 6253
+      print 'Value to Skip: %d' % value_to_skip
+
+      # write some data in binary to string buffer
+      writer = cStringIO.StringIO()
+      encoder = io.BinaryEncoder(writer)
+      datum_writer = io.DatumWriter(schema.parse('"long"'))
+      datum_writer.write(value_to_skip, encoder)
+      datum_writer.write(VALUE_TO_READ, encoder)
+
+      # skip the value
+      reader = cStringIO.StringIO(writer.getvalue())
+      decoder = io.BinaryDecoder(reader)
+      decoder.skip_long()
+
+      # read data from string buffer
+      datum_reader = io.DatumReader(schema.parse('"long"'))
+      read_value = datum_reader.read(decoder)
+
+      # check it
+      print 'Read Value: %d' % read_value
+      if read_value == VALUE_TO_READ: correct += 1
+      print ''
+    self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
+  def test_skip_int(self):
+    print ''
+    print 'TEST SKIP INT'
+    print '============='
+    print ''
+    correct = 0
+    for value_to_skip, hex_encoding in BINARY_INT_ENCODINGS:
+      VALUE_TO_READ = 6253
+      print 'Value to Skip: %d' % value_to_skip
+
+      # write some data in binary to string buffer
+      writer = cStringIO.StringIO()
+      encoder = io.BinaryEncoder(writer)
+      datum_writer = io.DatumWriter(schema.parse('"int"'))
+      datum_writer.write(value_to_skip, encoder)
+      datum_writer.write(VALUE_TO_READ, encoder)
+
+      # skip the value
+      reader = cStringIO.StringIO(writer.getvalue())
+      decoder = io.BinaryDecoder(reader)
+      decoder.skip_int()
+
+      # read data from string buffer
+      datum_reader = io.DatumReader(schema.parse('"int"'))
+      read_value = datum_reader.read(decoder)
+
+      # check it
+      print 'Read Value: %d' % read_value
+      if read_value == VALUE_TO_READ: correct += 1
+      print ''
+    self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
 if __name__ == '__main__':
   unittest.main()



Mime
View raw message