hadoop-common-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Teppo Kurki <...@iki.fi>
Subject TextInputFormat performance improvements
Date Sat, 01 Apr 2006 22:29:15 GMT
Just a few patches that made a surprisingly big difference in my pet 
project:

As TextInputFormat parses the input, it first creates a String, which is 
then stored in a UTF8. If the Map operation does not directly use UTF8's 
internal byte array and instead accesses the contents via toString(), the 
encoding and decoding are just CPU overhead.

Using a separate StringHolder (for lack of a better name) interface 
would make it possible to swap in another, maybe more efficient, 
implementation via JobConf.setInputValueClass.

It seems that TextInputFormat.readLine does not have to be static, and 
the StringBuffer can be reused.  If it does need to be static, the 
StringBuffer's internal buffer can be preallocated.

Hope this helps someone...

Index: 
C:/data/eclipse_workspace/hadoop/src/java/org/apache/hadoop/io/UTF8.java
===================================================================
--- 
C:/data/eclipse_workspace/hadoop/src/java/org/apache/hadoop/io/UTF8.java    
(revision 389671)
+++ 
C:/data/eclipse_workspace/hadoop/src/java/org/apache/hadoop/io/UTF8.java    
(working copy)
@@ -21,8 +21,11 @@
 import java.io.DataOutput;
 
 import java.util.logging.Logger;
+
+import org.apache.hadoop.mapred.TextInputFormat;
 import org.apache.hadoop.util.LogFormatter;
 
+
 /** A WritableComparable for strings that uses the UTF8 encoding.
  *
  * <p>Also includes utilities for efficiently reading and writing UTF-8.
@@ -29,7 +32,7 @@
  *
  * @author Doug Cutting
  */
-public class UTF8 implements WritableComparable {
+public class UTF8 implements WritableComparable, 
TextInputFormat.StringHolder {
   private static final Logger LOG= 
LogFormatter.getLogger("org.apache.hadoop.io.UTF8");
   private static final DataOutputBuffer OBUF = new DataOutputBuffer();
   private static final DataInputBuffer IBUF = new DataInputBuffer();
Index: 
C:/data/eclipse_workspace/hadoop/src/java/org/apache/hadoop/mapred/TextInputFormat.java
===================================================================
--- 
C:/data/eclipse_workspace/hadoop/src/java/org/apache/hadoop/mapred/TextInputFormat.java  
 
(revision 389671)
+++ 
C:/data/eclipse_workspace/hadoop/src/java/org/apache/hadoop/mapred/TextInputFormat.java  
 
(working copy)
@@ -29,6 +29,10 @@
  * Either linefeed or carriage-return are used to signal end of line.  
Keys are
  * the position in the file, and values are the line of text.. */
 public class TextInputFormat extends InputFormatBase {
+  private StringBuffer buffer = new StringBuffer(24 * 80); //preallocate
+  public interface StringHolder extends Writable {
+    public void set(String s);
+  }
 
   public RecordReader getRecordReader(FileSystem fs, FileSplit split,
                                       JobConf job, Reporter reporter)
@@ -61,7 +65,7 @@
             return false;
 
           ((LongWritable)key).set(pos);           // key is position
-          ((UTF8)value).set(readLine(in));        // value is line
+          ((StringHolder)value).set(readLine(in));        // value is line
           return true;
         }
        
@@ -74,8 +78,8 @@
       };
   }
 
-  private static String readLine(FSDataInputStream in) throws IOException {
-    StringBuffer buffer = new StringBuffer();
+  private String readLine(FSDataInputStream in) throws IOException {
+    buffer.setLength(0);
     while (true) {
 
       int b = in.read();


Mime
View raw message