hadoop-mapreduce-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Brett Hoerner <bretthoer...@gmail.com>
Subject Multiple input files, no reducer, output is "stomped" by just one of the files
Date Thu, 06 Jan 2011 20:51:33 GMT
Hello,

I'm running a very simple job that returns the input with a null key
and uses no reducer (see below).  I'm using
MultipleSequenceFileOutputFormat to "split" the input into different
files, but for simplicity's sake right now I'm just returning "file1"
from generateFileNameForKeyValue so that all input goes into one file.

I'm running it against 64 input files that are all about 38MB in size.
 The job completes without error, having read 2,528,625,665 bytes
(correct for 64 * 38MB).

The problem is that the output is a single 38MB file, instead of one
about 2432MB.

What do I need to do so that the inputs/mappers don't "stomp" each
other?  My guess is that the last one wins right now, and I'm not sure
if this is something normal or something specific to using
MultipleOutputFormat.

Thanks!
Brett



public class MultiFileByURL extends Configured implements Tool {
   public static class MapClass extends MapReduceBase
       implements Mapper<LongWritable, Text, NullWritable, Text> {

       public void map(LongWritable key, Text value,
                       OutputCollector<NullWritable, Text> output,
                       Reporter reporter) throws IOException {
           output.collect(NullWritable.get(), value);
       }
   }

   public static class PartitionByURL
       extends MultipleSequenceFileOutputFormat<NullWritable,Text>
   {
       protected String generateFileNameForKeyValue(NullWritable key,
                                                    Text value,
                                                    String filename)
       {
           return "file1";
       }
   }

   public int run(String[] args) throws Exception {
       Configuration conf = getConf();
       JobConf job = new JobConf(conf, MultiFileByURL.class);
       Path in = new Path(args[0]);
       Path out = new Path(args[1]);
       FileInputFormat.setInputPaths(job, in);
       FileOutputFormat.setOutputPath(job, out);
       job.setJobName("MultiFileByURL");
       job.setMapperClass(MapClass.class);
       job.setInputFormat(TextInputFormat.class);
       job.setOutputFormat(PartitionByURL.class);
       job.setOutputKeyClass(NullWritable.class);
       job.setOutputValueClass(Text.class);
       SequenceFileOutputFormat.setOutputCompressionType(job,
CompressionType.BLOCK);
       FileOutputFormat.setCompressOutput(job, true);
       FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
       job.setNumReduceTasks(0);
       JobClient.runJob(job);
       return 0;
   }

   public static void main(String[] args) throws Exception {
       int res = ToolRunner.run(new Configuration(),
                                new MultiFileByURL(),
                                args);
       System.exit(res);
   }
}

Mime
View raw message