hbase-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From stack <st...@duboce.net>
Subject Re: Randomize your input during huge imports
Date Wed, 22 Apr 2009 16:41:32 GMT
Should we add this to examples, Ryan?  Or do you think it is general enough
that we should add it under src and expose it as an option when you do:

./bin/hadoop -jar hbase.jar

Currently the only option when you do the above is rowcounter.

Is the code below generic enough that we can add a 'randomizer' option to the above?

St.Ack

On Tue, Apr 21, 2009 at 9:53 PM, Ryan Rawson <ryanobjc@gmail.com> wrote:

> Here is the MapReduce I use to randomize the lines of a file.  I've omitted
> the imports for brevity - your IDE can fix that.
>
> Enjoy!
> -ryan
>
> /**
>  * MapReduce job (old {@code org.apache.hadoop.mapred} API) that rewrites a
>  * text input with its lines in random order.
>  *
>  * <p>Each input line is tagged with a random integer key by {@link Map}; the
>  * shuffle then sorts by that key, which scatters the lines.  {@link Combiner}
>  * packs many lines that share a key into one newline-joined record to cut
>  * shuffle overhead, and {@link Reduce} writes the values back out without a
>  * key so the output contains only the original lines.
>  */
> public class Randomize {
>
>   /** Random keys are drawn uniformly from {@code [0, KEY_SPACE)}. */
>   private static final int KEY_SPACE = 100000;
>
>   /**
>    * Tags every input line with a random integer key.
>    */
>   // technically text/text could be 'object'.
>   public static class Map extends MapReduceBase
>       implements Mapper<LongWritable, Text, IntWritable, Text> {
>
>     private final Random rnd = new Random();
>
>     // Reused for every record; collect() is handed the current contents
>     // before the next call to map() overwrites them.
>     private final IntWritable redKey = new IntWritable();
>
>     /**
>      * Emits {@code value} unchanged under a freshly drawn random key.
>      *
>      * @param key      input key supplied by the input format (unused)
>      * @param value    the input line
>      * @param output   receives the (random key, line) pair
>      * @param reporter used to publish the task status string
>      * @throws IOException if the collector fails
>      */
>     @Override
>     public void map(LongWritable key, Text value,
>         OutputCollector<IntWritable, Text> output, Reporter reporter)
>         throws IOException {
>       redKey.set(rnd.nextInt(KEY_SPACE));
>       output.collect(redKey, value);
>       reporter.setStatus("Map emitting cell for: " + redKey);
>     }
>   }
>
>   /**
>    * Joins lines that share a key into newline-separated batches so fewer,
>    * larger records go through the shuffle.
>    */
>   // This combiner reduces the time of a map-reduce from 1h18m -> 48m.
>   // That is a 38% improvement (!!).
>   public static class Combiner extends MapReduceBase
>       implements Reducer<IntWritable, Text, IntWritable, Text> {
>
>     /** A batch is emitted once it holds more than this many lines. */
>     private static final int FLUSH_THRESHOLD = 500;
>
>     private static final byte[] NEWLINE = {'\n'};
>
>     /**
>      * Concatenates the values for {@code key}, separated by '\n', and emits
>      * them in batches.
>      *
>      * @param key      the random key shared by all {@code values}
>      * @param values   lines (or already-combined batches of lines)
>      * @param output   receives one record per batch, under the same key
>      * @param reporter unused
>      * @throws IOException if the collector fails
>      */
>     @Override
>     public void reduce(IntWritable key, Iterator<Text> values,
>         OutputCollector<IntWritable, Text> output, Reporter reporter)
>         throws IOException {
>       Text batch = new Text();
>       int pending = 0;  // number of values appended to the current batch
>       while (values.hasNext()) {
>         Text line = values.next();
>         if (pending > 0) {
>           batch.append(NEWLINE, 0, NEWLINE.length);
>         }
>         // getBytes() may expose a backing array longer than the content,
>         // so the copy is bounded by getLength().
>         batch.append(line.getBytes(), 0, line.getLength());
>
>         if (++pending > FLUSH_THRESHOLD) {
>           output.collect(key, batch);
>           batch = new Text();
>           pending = 0;
>         }
>       }
>       // Emit the remainder only if something is buffered.  Count values
>       // rather than bytes so that a lone empty input line is preserved,
>       // while a batch flushed on the very last value does not produce an
>       // extra empty record (which would surface as a spurious blank line).
>       if (pending > 0) {
>         output.collect(key, batch);
>       }
>     }
>   }
>
>   /**
>    * Drops the random key and writes each value through to the output.
>    */
>   public static class Reduce extends MapReduceBase
>       implements Reducer<IntWritable, Text, NullWritable, Text> {
>
>     /**
>      * Emits every value for {@code key} with a {@link NullWritable} key.
>      *
>      * @param key      the random key (discarded)
>      * @param values   lines, or newline-joined batches from the combiner
>      * @param output   receives (NullWritable, value) pairs
>      * @param reporter unused
>      * @throws IOException if the collector fails
>      */
>     @Override
>     public void reduce(IntWritable key, Iterator<Text> values,
>         OutputCollector<NullWritable, Text> output, Reporter reporter)
>         throws IOException {
>       while (values.hasNext()) {
>         output.collect(NullWritable.get(), values.next());
>       }
>     }
>   }
>
>   /**
>    * Configures and runs the randomize job.
>    *
>    * @param argv {@code argv[0]} is the input path, {@code argv[1]} the
>    *             output path for the randomized lines
>    * @throws IOException if the job cannot be submitted or fails
>    */
>   public static void main(String [] argv) throws IOException {
>     if (argv.length < 2) {
>       System.err.println("Usage: <input> <randomized output>");
>       return;
>     }
>     JobConf job = new JobConf(Randomize.class);
>     job.setJobName("Randomize: " + argv[0]);
>
>     FileInputFormat.setInputPaths(job, new Path(argv[0]));
>     job.setInputFormat(TextInputFormat.class);
>     FileOutputFormat.setOutputPath(job, new Path(argv[1]));
>     job.setOutputFormat(TextOutputFormat.class);
>
>     job.setMapperClass(Map.class);
>     job.setCombinerClass(Combiner.class);
>     job.setReducerClass(Reduce.class);
>
>     // Map and combiner emit (IntWritable, Text); the reducer emits
>     // (NullWritable, Text).  Declare both so the configuration matches
>     // what the classes actually produce.
>     job.setMapOutputKeyClass(IntWritable.class);
>     job.setMapOutputValueClass(Text.class);
>     job.setOutputKeyClass(NullWritable.class);
>     job.setOutputValueClass(Text.class);
>
>     // Blocks until the job completes and throws if it fails, instead of
>     // returning as soon as the job has been handed to the cluster.
>     JobClient.runJob(job);
>   }
> }
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message