hadoop-common-user mailing list archives

From Ranjini Rathinam <ranjinibe...@gmail.com>
Subject Re: Need FileName with Content
Date Fri, 21 Mar 2014 11:08:59 GMT
Hi,

Thanks for the great support; I have fixed the issue and now get the
expected output.

But I have one query: is it possible to pass a runtime argument to the mapper
class,

for example,

giving the values C and JAVA at runtime?



if (sp[k].equalsIgnoreCase("C")) {
    while (itr.hasMoreTokens()) {
        word.set(pp.getName() + " " + itr.nextToken());
        context.write(word, one);
    }
}

if (sp[k].equalsIgnoreCase("JAVA")) {
    while (itr.hasMoreTokens()) {
        word.set(pp.getName() + " " + itr.nextToken());
        context.write(word, one);
    }
}
Thanks a lot.

Ranjini



On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam
<ranjinibecse@gmail.com> wrote:

> Hi,
>
>
> Thanks a lot for the great support. I am just learning hadoop and
> mapreduce.
>
> I have used the way you have guided me.
>
> But the output is coming out without aggregating:
>
> vinitha.txt C    1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
>
>
> I need the output as
>
> vinitha       C       1
> vinitha       Java    4
>
>
> I have a reduce class but am still not able to fix it; I am still trying.
>
> I have given my code below. Please let me know where I have gone wrong.
>
>
> my code
>
>
> import java.io.IOException;
> import java.util.*;
> import java.util.logging.Level;
> import java.util.logging.Logger;
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapreduce.*;
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>  public class FileCount {
>
>    public static class TokenizerMapper
>        extends Mapper<LongWritable, Text, Text, IntWritable> {
>
>      private final static IntWritable one = new IntWritable(1);
>      private Text word = new Text();
>
>      public void map(LongWritable key, Text value, Context context)
>          throws IOException, InterruptedException {
>
>        InputSplit is = context.getInputSplit();
>        FileSplit fileSplit = (FileSplit) is;
>        Path pp = fileSplit.getPath();
>        FileSystem fs = FileSystem.get(context.getConfiguration());
>
>        String line = value.toString();
>        String[] splited = line.split("\\s+");
>        for (int i = 0; i < splited.length; i++) {
>          String sp[] = splited[i].split(",");
>          for (int k = 0; k < sp.length; k++) {
>            if (!sp[k].isEmpty()) {
>              StringTokenizer itr = new StringTokenizer(sp[k]);
>              if (sp[k].equalsIgnoreCase("C")) {
>                while (itr.hasMoreTokens()) {
>                  word.set(pp.getName() + " " + itr.nextToken());
>                  context.write(word, one);
>                }
>              }
>              if (sp[k].equalsIgnoreCase("JAVA")) {
>                while (itr.hasMoreTokens()) {
>                  word.set(pp.getName() + " " + itr.nextToken());
>                  context.write(word, one);
>                }
>              }
>            }
>          }
>        }
>      }
>    }
>
>    public static class Reduce
>        extends Reducer<Text, IntWritable, Text, IntWritable> {
>
>      public void reduce(Text key, Iterator<IntWritable> values, Context context)
>          throws IOException, InterruptedException {
>        int sum = 0;
>        while (values.hasNext()) {
>          sum += values.next().get();
>        }
>        context.write(key, new IntWritable(sum));
>      }
>    }
>    public static void main(String[] args) throws Exception {
>      Configuration conf = new Configuration();
>      Job job = new Job(conf, "jobName");
>
>      String input = "/user/hduser/INPUT/";
>      String output = "/user/hduser/OUTPUT/";
>      FileInputFormat.setInputPaths(job, input);
>      job.setJarByClass(FileCount.class);
>      job.setMapperClass(TokenizerMapper.class);
>      job.setReducerClass(Reduce.class);
>      job.setCombinerClass(Reduce.class);
>      job.setInputFormatClass(TextInputFormat.class);
>      job.setOutputKeyClass(Text.class);
>      job.setOutputValueClass(IntWritable.class);
>
>      Path outPath = new Path(output);
>      FileOutputFormat.setOutputPath(job, outPath);
>      FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
>      if (dfs.exists(outPath)) {
>        dfs.delete(outPath, true);
>      }
>
>      try {
>        job.waitForCompletion(true);
>      } catch (InterruptedException ex) {
>        // Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
>      } catch (ClassNotFoundException ex) {
>        // Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
>      }
>    }
>  }
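>
> (For reference, in the new org.apache.hadoop.mapreduce API, Reducer.reduce
> is declared with an Iterable rather than an Iterator, so a sum-per-key
> reducer written against that signature would look roughly like this sketch:)
>
>    public static class Reduce
>        extends Reducer<Text, IntWritable, Text, IntWritable> {
>
>      @Override
>      public void reduce(Text key, Iterable<IntWritable> values, Context context)
>          throws IOException, InterruptedException {
>        // Sum all counts seen for this (filename, word) key.
>        int sum = 0;
>        for (IntWritable value : values) {
>          sum += value.get();
>        }
>        context.write(key, new IntWritable(sum));
>      }
>    }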
>
>
> Thanks in advance for the great help and support to fix the issue .
>
> Please help to fix it.
>
> Thanks a lot.
>
> Regards,
> Ranjini
>
>
>> Hi,
>>
>> I have a folder named INPUT.
>>
>> Inside INPUT there are 5 resumes.
>>
>> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
>> Found 5 items
>> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
>> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
>> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
>> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/vinitha.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/sony.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/ravi.txt
>> hduser@localhost:~/Ranjini$
>>
>> I have to process the folder and its content.
>>
>> I need the output as
>>
>> filename   word     occurrence
>> vinitha    java     4
>> sony       oracle   3
>>
>>
>>
>> But I am not getting the filename. As the input file contents are merged,
>> the file name is not coming out correctly.
>>
>>
>> Please help to fix this issue. I have given my code below.
>>
>>
>>  import java.io.BufferedReader;
>>  import java.io.File;
>>  import java.io.FileReader;
>>  import java.io.FileWriter;
>>  import java.io.IOException;
>>  import java.io.InputStreamReader;
>>  import java.util.*;
>>
>>  import org.apache.hadoop.conf.*;
>>  import org.apache.hadoop.fs.FSDataInputStream;
>>  import org.apache.hadoop.fs.FileStatus;
>>  import org.apache.hadoop.fs.FileSystem;
>>  import org.apache.hadoop.fs.Path;
>>  import org.apache.hadoop.io.*;
>>  import org.apache.hadoop.mapred.*;
>>  import org.apache.hadoop.mapred.lib.*;
>>  import org.apache.hadoop.util.*;
>>
>>  public class WordCount {
>>
>>    public static class Map extends MapReduceBase
>>        implements Mapper<LongWritable, Text, Text, IntWritable> {
>>
>>      private final static IntWritable one = new IntWritable(1);
>>      private Text word = new Text();
>>
>>      public void map(LongWritable key, Text value,
>>          OutputCollector<Text, IntWritable> output, Reporter reporter)
>>          throws IOException {
>>        FSDataInputStream fs = null;
>>        FileSystem hdfs = null;
>>        String line = value.toString();
>>        try {
>>          Configuration configuration = new Configuration();
>>          configuration.set("fs.default.name", "hdfs://localhost:4440/");
>>
>>          Path srcPath = new Path("/user/hduser/INPUT/");
>>
>>          hdfs = FileSystem.get(configuration);
>>          FileStatus[] status = hdfs.listStatus(srcPath);
>>          fs = hdfs.open(srcPath);
>>          BufferedReader br =
>>              new BufferedReader(new InputStreamReader(hdfs.open(srcPath)));
>>
>>          String[] splited = line.split("\\s+");
>>          for (int i = 0; i < splited.length; i++) {
>>            String sp[] = splited[i].split(",");
>>            for (int k = 0; k < sp.length; k++) {
>>              if (!sp[k].isEmpty()) {
>>                StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>>                if (sp[k].equalsIgnoreCase("C")) {
>>                  while (tokenizer.hasMoreTokens()) {
>>                    word.set(tokenizer.nextToken());
>>                    output.collect(word, one);
>>                  }
>>                }
>>                if (sp[k].equalsIgnoreCase("JAVA")) {
>>                  while (tokenizer.hasMoreTokens()) {
>>                    word.set(tokenizer.nextToken());
>>                    output.collect(word, one);
>>                  }
>>                }
>>              }
>>            }
>>          }
>>        } catch (IOException e) {
>>          e.printStackTrace();
>>        }
>>      }
>>    }
>>     public static class Reduce extends MapReduceBase implements
>> Reducer<Text, IntWritable, Text, IntWritable> {
>>       public void reduce(Text key, Iterator<IntWritable> values,
>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>> IOException {
>>         int sum = 0;
>>         while (values.hasNext()) {
>>           sum += values.next().get();
>>         }
>>         output.collect(key, new IntWritable(sum));
>>       }
>>     }
>>     public static void main(String[] args) throws Exception {
>>
>>
>>       JobConf conf = new JobConf(WordCount.class);
>>       conf.setJobName("wordcount");
>>       conf.setOutputKeyClass(Text.class);
>>       conf.setOutputValueClass(IntWritable.class);
>>       conf.setMapperClass(Map.class);
>>       conf.setCombinerClass(Reduce.class);
>>       conf.setReducerClass(Reduce.class);
>>       conf.setInputFormat(TextInputFormat.class);
>>       conf.setOutputFormat(TextOutputFormat.class);
>>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>>       JobClient.runJob(conf);
>>     }
>>  }
>>
>>
>>
>> Please help
>>
>> Thanks in advance.
>>
>> Ranjini
>>
>>
>>
>> ----------
>> From: Stanley Shi <sshi@gopivotal.com>
>> Date: Thu, Mar 20, 2014 at 7:39 AM
>> To: user@hadoop.apache.org
>>
>>
>> You want to do a word count for each file, but the code gives you a word
>> count for all the files, right?
>>
>> =====
>>  word.set(tokenizer.nextToken());
>>           output.collect(word, one);
>> ======
>> change it to:
>> word.set("filename"+"    "+tokenizer.nextToken());
>> output.collect(word,one);
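>>
>> (Here "filename" is presumably meant as a placeholder for the actual file
>> name; in the old org.apache.hadoop.mapred API it could be taken from the
>> map task's input split, roughly like this sketch, assuming the split is a
>> FileSplit:)
>>
>> // Rough sketch: derive the file name from the map task's input split.
>> String fileName = ((FileSplit) reporter.getInputSplit()).getPath().getName();
>> word.set(fileName + "    " + tokenizer.nextToken());
>> output.collect(word, one);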
>>
>>
>>
>>
>> Regards,
>> Stanley Shi
>>
>>
>> ----------
>> From: Ranjini Rathinam <ranjinibecse@gmail.com>
>> Date: Thu, Mar 20, 2014 at 11:20 AM
>> To: user@hadoop.apache.org, sshi@gopivotal.com
>>
>>
>> Hi,
>>
>> If we give the below code,
>> =======================
>>  word.set("filename"+"    "+tokenizer.nextToken());
>> output.collect(word,one);
>> ======================
>>
>> The output is wrong, because it shows
>>
>> filename   word     occurrence
>> vinitha    java     4
>> vinitha    oracle   3
>> sony       java     4
>> sony       oracle   3
>>
>>
>> Here vinitha does not have the word oracle. Similarly, sony does not have
>> the word java. The file name is being merged across all words.
>>
>> I need the output as given below
>>
>> filename   word     occurrence
>>
>> vinitha    java     4
>> vinitha    C++      3
>> sony       ETL      4
>> sony       oracle   3
>>
>>
>> I need the fileName along with the word in that particular file only. No
>> merging should happen.
>>
>> Please help me out for this issue.
>>
>> Please help.
>>
>> Thanks in advance.
>>
>> Ranjini
>>
>> ----------
>> From: Felix Chern <idryman@gmail.com>
>> Date: Thu, Mar 20, 2014 at 11:25 PM
>> To: user@hadoop.apache.org
>> Cc: sshi@gopivotal.com
>>
>>
>> I've written two blog posts on how to get directory context in a Hadoop
>> mapper:
>>
>>
>> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>>
>> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>>
>> Cheers,
>> Felix
>>
>> ----------
>> From: Stanley Shi <sshi@gopivotal.com>
>> Date: Fri, Mar 21, 2014 at 7:02 AM
>>
>> To: Ranjini Rathinam <ranjinibecse@gmail.com>
>> Cc: user@hadoop.apache.org
>>
>>
>> Just reviewed the code again: you are not really using map-reduce. You
>> are reading all the files in one map process; this is not how a normal
>> map-reduce job works.
>>
>>
>> Regards,
>> Stanley Shi
>>
>>
>> ----------
>> From: Stanley Shi <sshi@gopivotal.com>
>> Date: Fri, Mar 21, 2014 at 7:43 AM
>> To: Ranjini Rathinam <ranjinibecse@gmail.com>
>> Cc: user@hadoop.apache.org
>>
>>
>> Change your mapper to be something like this:
>>
>>  public static class TokenizerMapper extends
>>      Mapper<Object, Text, Text, IntWritable> {
>>
>>    private final static IntWritable one = new IntWritable(1);
>>    private Text word = new Text();
>>
>>    public void map(Object key, Text value, Context context)
>>        throws IOException, InterruptedException {
>>      Path pp = ((FileSplit) context.getInputSplit()).getPath();
>>      StringTokenizer itr = new StringTokenizer(value.toString());
>>      log.info("map on string: " + new String(value.getBytes()));
>>      while (itr.hasMoreTokens()) {
>>        word.set(pp.getName() + " " + itr.nextToken());
>>        context.write(word, one);
>>      }
>>    }
>>  }
>>
>> Note: add your filtering code here (see the sketch below), and then, when
>> running the command, use your input path as a parameter.
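>>
>> (For instance, the filtering could look roughly like this inside the
>> tokenizer loop, assuming only the categories "C" and "JAVA" should be
>> counted:)
>>
>>      // Rough sketch: emit a count only for the categories of interest.
>>      while (itr.hasMoreTokens()) {
>>        String token = itr.nextToken();
>>        if (token.equalsIgnoreCase("C") || token.equalsIgnoreCase("JAVA")) {
>>          word.set(pp.getName() + " " + token);
>>          context.write(word, one);
>>        }
>>      }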
>>
>> Regards,
>> Stanley Shi
>>
>>
>
