hadoop-mapreduce-user mailing list archives

From Brett Hoerner <bretthoer...@gmail.com>
Subject Re: Multiple input files, no reducer, output is "stomped" by just one of the files
Date Sat, 08 Jan 2011 23:47:31 GMT
I found my issue, for future readers:

I forgot to append the third argument of generateFileNameForKeyValue
(the String) to the filename I return.  That String is the default
part name ("part-00000", "part-00001", etc.), which gives each map
task a unique output file; without it, every task wrote to the same
name and only one file survived.



On Thu, Jan 6, 2011 at 2:51 PM, Brett Hoerner <bretthoerner@gmail.com> wrote:
> Hello,
>
> I'm running a very simple job that returns the input with a null key
> and uses no reducer (see below).  I'm using
> MultipleSequenceFileOutputFormat to "split" the input into different
> files, but for simplicity's sake right now I'm just returning "file1"
> from generateFileNameForKeyValue so that all input goes into one file.
>
> I'm running it against 64 input files that are all about 38MB in
> size.  The job completes without error, having read 2,528,625,665
> bytes (correct for 64 * 38MB).
>
> The problem is that the output is a single 38MB file, instead of one
> about 2432MB.
>
> What do I need to do so that the inputs/mappers don't "stomp" each
> other?  My guess is that the last one wins right now, and I'm not sure
> if this is something normal or something specific to using
> MultipleOutputFormat.
>
> Thanks!
> Brett
>
>
>
> import java.io.IOException;
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.conf.Configured;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.NullWritable;
> import org.apache.hadoop.io.SequenceFile.CompressionType;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.io.compress.GzipCodec;
> import org.apache.hadoop.mapred.FileInputFormat;
> import org.apache.hadoop.mapred.FileOutputFormat;
> import org.apache.hadoop.mapred.JobClient;
> import org.apache.hadoop.mapred.JobConf;
> import org.apache.hadoop.mapred.MapReduceBase;
> import org.apache.hadoop.mapred.Mapper;
> import org.apache.hadoop.mapred.OutputCollector;
> import org.apache.hadoop.mapred.Reporter;
> import org.apache.hadoop.mapred.SequenceFileOutputFormat;
> import org.apache.hadoop.mapred.TextInputFormat;
> import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
> import org.apache.hadoop.util.Tool;
> import org.apache.hadoop.util.ToolRunner;
>
> public class MultiFileByURL extends Configured implements Tool {
>   public static class MapClass extends MapReduceBase
>       implements Mapper<LongWritable, Text, NullWritable, Text> {
>
>       public void map(LongWritable key, Text value,
>                       OutputCollector<NullWritable, Text> output,
>                       Reporter reporter) throws IOException {
>           output.collect(NullWritable.get(), value);
>       }
>   }
>
>   public static class PartitionByURL
>       extends MultipleSequenceFileOutputFormat<NullWritable,Text>
>   {
>       @Override
>       protected String generateFileNameForKeyValue(NullWritable key,
>                                                    Text value,
>                                                    String filename)
>       {
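>           // NB: every record gets the same name here, so all the
>           // map tasks write to the same output file and "stomp"
>           // each other; see the fix at the top of this message.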
>           return "file1";
>       }
>   }
>
>   public int run(String[] args) throws Exception {
>       Configuration conf = getConf();
>       JobConf job = new JobConf(conf, MultiFileByURL.class);
>       Path in = new Path(args[0]);
>       Path out = new Path(args[1]);
>       FileInputFormat.setInputPaths(job, in);
>       FileOutputFormat.setOutputPath(job, out);
>       job.setJobName("MultiFileByURL");
>       job.setMapperClass(MapClass.class);
>       job.setInputFormat(TextInputFormat.class);
>       job.setOutputFormat(PartitionByURL.class);
>       job.setOutputKeyClass(NullWritable.class);
>       job.setOutputValueClass(Text.class);
>       SequenceFileOutputFormat.setOutputCompressionType(job,
>                                                         CompressionType.BLOCK);
>       FileOutputFormat.setCompressOutput(job, true);
>       FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
>       job.setNumReduceTasks(0);
>       JobClient.runJob(job);
>       return 0;
>   }
>
>   public static void main(String[] args) throws Exception {
>       int res = ToolRunner.run(new Configuration(),
>                                new MultiFileByURL(),
>                                args);
>       System.exit(res);
>   }
> }
>
