Return-Path: Delivered-To: apmail-hadoop-mapreduce-commits-archive@minotaur.apache.org Received: (qmail 83571 invoked from network); 8 Mar 2011 05:59:42 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 8 Mar 2011 05:59:42 -0000 Received: (qmail 68029 invoked by uid 500); 8 Mar 2011 05:59:42 -0000 Delivered-To: apmail-hadoop-mapreduce-commits-archive@hadoop.apache.org Received: (qmail 67919 invoked by uid 500); 8 Mar 2011 05:59:41 -0000 Mailing-List: contact mapreduce-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: mapreduce-dev@hadoop.apache.org Delivered-To: mailing list mapreduce-commits@hadoop.apache.org Received: (qmail 67910 invoked by uid 99); 8 Mar 2011 05:59:41 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 08 Mar 2011 05:59:41 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 08 Mar 2011 05:59:37 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 9719C2388A66; Tue, 8 Mar 2011 05:59:16 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1079239 - in /hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix: CompressionEmulationUtil.java GenerateData.java Gridmix.java RandomTextDataGenerator.java Date: Tue, 08 Mar 2011 05:59:16 -0000 To: mapreduce-commits@hadoop.apache.org From: omalley@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20110308055916.9719C2388A66@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: omalley Date: Tue Mar 8 05:59:16 2011 New Revision: 1079239 URL: http://svn.apache.org/viewvc?rev=1079239&view=rev Log: 
commit 8362e614ab0b7a829c8cf73bff1b8e4d24d23444 Author: Amar Ramesh Kamat Date: Sat Jan 8 11:17:19 2011 +0530 : Publish compression ratio in Gridmix (amarrk) +++ b/YAHOO-CHANGES.txt + : Publish compression ratio in Gridmix. Patch is + available at (amarrk) + Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java?rev=1079239&r1=1079238&r2=1079239&view=diff ============================================================================== --- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java (original) +++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java Tue Mar 8 05:59:16 2011 @@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFac import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; @@ -39,6 +40,7 @@ import org.apache.hadoop.io.compress.Com import org.apache.hadoop.io.compress.Decompressor; import 
org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Utils; import org.apache.hadoop.mapred.gridmix.GenerateData.GenDataFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.MRJobConfig; @@ -46,6 +48,7 @@ import org.apache.hadoop.mapreduce.Mappe import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.ReflectionUtils; +import org.apache.hadoop.util.StringUtils; /** * This is a utility class for all the compression related modules. @@ -78,13 +81,11 @@ class CompressionEmulationUtil { protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); - int size = - conf.getInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE, - 100); + int listSize = + RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf); int wordSize = - conf.getInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE, - 10); - rtg = new RandomTextDataGenerator(size, null, wordSize); + RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf); + rtg = new RandomTextDataGenerator(listSize, wordSize); } /** @@ -112,7 +113,7 @@ class CompressionEmulationUtil { */ static void configure(final Job job) throws IOException, InterruptedException, ClassNotFoundException { - LOG.info("Gridmix is configured to use compressed data."); + LOG.info("Gridmix is configured to generate compressed input data."); // set the random text mapper job.setMapperClass(RandomTextDataMapper.class); job.setNumReduceTasks(0); @@ -130,6 +131,60 @@ class CompressionEmulationUtil { } } + /** Publishes compression related data statistics. Following statistics are + * published + *
<ul> + *
<li>Total compressed input data size</li> + *
<li>Number of compressed input data files</li> + *
<li>Compression Ratio</li> + *
<li>Text data dictionary size</li> + *
<li>Random text word size</li> + *
</ul>
+ */ + static void publishCompressedDataStatistics(Path inputDir, Configuration conf, + long uncompressedDataSize) + throws IOException { + LOG.info("Generation of compressed data successful."); + FileSystem fs = inputDir.getFileSystem(conf); + CompressionCodecFactory compressionCodecs = + new CompressionCodecFactory(conf); + + // iterate over compressed files and sum up the compressed file sizes + long compressedDataSize = 0; + int numCompressedFiles = 0; + // obtain input data file statuses + FileStatus[] outFileStatuses = + fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter()); + for (FileStatus status : outFileStatuses) { + // check if the input file is compressed + if (compressionCodecs != null) { + CompressionCodec codec = compressionCodecs.getCodec(status.getPath()); + if (codec != null) { + ++numCompressedFiles; + compressedDataSize += status.getLen(); + } + } + } + + // publish the input data size + LOG.info("Total size of compressed input data (bytes) : " + + StringUtils.humanReadableInt(compressedDataSize)); + LOG.info("Total number of compressed input data files : " + + numCompressedFiles); + + // compute the compression ratio + double ratio = ((double)compressedDataSize) / uncompressedDataSize; + + // publish the compression ratio + LOG.info("Input Data Compression Ratio : " + ratio); + + // publish the random text data generator configuration parameters + LOG.info("Compressed data generator list size : " + + RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf)); + LOG.info("Compressed data generator word size : " + + RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf)); + } + /** * Enables/Disables compression emulation. 
* @param conf Target configuration where the parameter @@ -179,13 +234,15 @@ class CompressionEmulationUtil { CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); CompressionCodec codec = compressionCodecs.getCodec(file); - Decompressor decompressor = CodecPool.getDecompressor(codec); if (codec != null) { - CompressionInputStream in = - codec.createInputStream(fs.open(file), decompressor); - //TODO Seek doesnt work with compressed input stream. - // Use SplittableCompressionCodec? - return (InputStream)in; + Decompressor decompressor = CodecPool.getDecompressor(codec); + if (decompressor != null) { + CompressionInputStream in = + codec.createInputStream(fs.open(file), decompressor); + //TODO Seek doesnt work with compressed input stream. + // Use SplittableCompressionCodec? + return (InputStream)in; + } } } FSDataInputStream in = fs.open(file); Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java?rev=1079239&r1=1079238&r2=1079239&view=diff ============================================================================== --- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java (original) +++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java Tue Mar 8 05:59:16 2011 @@ -30,6 +30,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; @@ -41,6 +42,7 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.ClusterStatus; import 
org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Utils; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; @@ -52,6 +54,7 @@ import org.apache.hadoop.mapreduce.TaskA import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.util.StringUtils; // TODO can replace with form of GridmixJob class GenerateData extends GridmixJob { @@ -94,6 +97,40 @@ class GenerateData extends GridmixJob { FileOutputFormat.setOutputPath(job, outdir); } + /** + * Publish the data statistics. + */ + void publishDataStatistics(Path inputDir, long genBytes) throws IOException { + if (CompressionEmulationUtil + .isCompressionEmulationEnabled(job.getConfiguration())) { + CompressionEmulationUtil.publishCompressedDataStatistics(inputDir, + job.getConfiguration(), genBytes); + } else { + publishPlainDataStatistics(job.getConfiguration(), inputDir); + } + } + + static void publishPlainDataStatistics(Configuration conf, Path inputDir) + throws IOException { + LOG.info("Input data generation successful."); + FileSystem fs = inputDir.getFileSystem(conf); + + // obtain input data file statuses + FileStatus[] outFileStatuses = + fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter()); + long dataSize = 0; + + for (FileStatus status : outFileStatuses) { + // check if the input file is compressed + dataSize += status.getLen(); + } + + // publish the plain data statistics + LOG.info("Total size of input data : " + + StringUtils.humanReadableInt(dataSize)); + LOG.info("Total number of input data files : " + outFileStatuses.length); + } + @Override public Job call() throws IOException, InterruptedException, ClassNotFoundException { Modified: 
hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java?rev=1079239&r1=1079238&r2=1079239&view=diff ============================================================================== --- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java (original) +++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java Tue Mar 8 05:59:16 2011 @@ -111,7 +111,7 @@ public class Gridmix extends Configured throws IOException, InterruptedException { Path inputDir = new Path(ioPath, "input"); final Configuration conf = getConf(); - final GridmixJob genData = new GenerateData(conf, inputDir, genbytes); + final GenerateData genData = new GenerateData(conf, inputDir, genbytes); LOG.info("Generating " + StringUtils.humanReadableInt(genbytes) + " of test data..."); launchGridmixJob(genData); @@ -124,6 +124,9 @@ public class Gridmix extends Configured LOG.error("Couldnt change the file permissions " , e); throw new IOException(e); } + + // publish the data statistics + genData.publishDataStatistics(inputDir, genbytes); } /** Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java?rev=1079239&r1=1079238&r2=1079239&view=diff ============================================================================== --- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java (original) +++ 
hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java Tue Mar 8 05:59:16 2011 @@ -22,42 +22,64 @@ import java.util.List; import java.util.Random; import org.apache.commons.lang.RandomStringUtils; +import org.apache.hadoop.conf.Configuration; /** * A random text generator. The words are simply sequences of alphabets. */ class RandomTextDataGenerator { /** - * Random words list size. + * Configuration key for random text data generator's list size. */ static final String GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE = "gridmix.datagenerator.randomtext.listsize"; /** - * Random words size. + * Configuration key for random text data generator's word size. */ static final String GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE = "gridmix.datagenerator.randomtext.wordsize"; /** + * Default random text data generator's list size. + */ + static final int DEFAULT_LIST_SIZE = 100; + + /** + * Default random text data generator's word size. + */ + static final int DEFAULT_WORD_SIZE = 10; + + /** + * Default random text data generator's seed. + */ + static final long DEFAULT_SEED = 0L; + + /** * A list of random words */ private String[] words; private Random random; /** + * Constructor for {@link RandomTextDataGenerator} with default seed. + * @param size the total number of words to consider. + * @param wordSize Size of each word + */ + RandomTextDataGenerator(int size, int wordSize) { + this(size, DEFAULT_SEED , wordSize); + } + + /** * Constructor for {@link RandomTextDataGenerator}. * @param size the total number of words to consider. * @param seed Random number generator seed for repeatability * @param wordSize Size of each word */ RandomTextDataGenerator(int size, Long seed, int wordSize) { - if (seed == null) { - random = new Random(); - } else { - random = new Random(seed); - } + random = new Random(seed); words = new String[size]; + //TODO change the default with the actual stats //TODO do u need varied sized words? 
for (int i = 0; i < size; ++i) { @@ -67,6 +89,20 @@ class RandomTextDataGenerator { } /** + * Get the configured random text data generator list size. + */ + static int getRandomTextDataGeneratorListSize(Configuration conf) { + return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE, DEFAULT_LIST_SIZE); + } + + /** + * Get the configured random text data generator word size. + */ + static int getRandomTextDataGeneratorWordSize(Configuration conf) { + return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE, DEFAULT_WORD_SIZE); + } + + /** * Returns a randomly selected word from a list of random words. */ String getRandomWord() {