hadoop-user mailing list archives

From Ranjini Rathinam <ranjinibe...@gmail.com>
Subject Re: Need FileName with Content
Date Fri, 21 Mar 2014 06:15:37 GMT
Hi,


Thanks a lot for the great support. I am just learning Hadoop and MapReduce.

I have followed the way you guided me.

But the output is coming out without aggregating:

vinitha.txt C    1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1


I need the output as:

vinitha       C       1
vinitha       Java    4


I have a reduce class but am still not able to fix it; I am still trying.

I have given my code below. Please let me know where I have gone wrong.


My code:


import java.io.IOException;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class FileCount {

  public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {

      // The split tells us which input file this line came from.
      FileSplit fileSplit = (FileSplit) context.getInputSplit();
      Path pp = fileSplit.getPath();

      String line = value.toString();

      // Split the line on whitespace, then on commas.
      String[] splited = line.split("\\s+");
      for (int i = 0; i < splited.length; i++) {
        String[] sp = splited[i].split(",");
        for (int k = 0; k < sp.length; k++) {
          if (!sp[k].isEmpty()) {
            StringTokenizer itr = new StringTokenizer(sp[k]);
            // Emit (filename + " " + keyword, 1) for the keywords of interest.
            if (sp[k].equalsIgnoreCase("C")) {
              while (itr.hasMoreTokens()) {
                word.set(pp.getName() + " " + itr.nextToken());
                context.write(word, one);
              }
            }
            if (sp[k].equalsIgnoreCase("JAVA")) {
              while (itr.hasMoreTokens()) {
                word.set(pp.getName() + " " + itr.nextToken());
                context.write(word, one);
              }
            }
          }
        }
      }
    }
  }

  public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterator<IntWritable> values, Context context)
        throws IOException, InterruptedException {

      int sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      context.write(key, new IntWritable(sum));
    }
  }
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "jobName");

    String input = "/user/hduser/INPUT/";
    String output = "/user/hduser/OUTPUT/";

    FileInputFormat.setInputPaths(job, input);
    job.setJarByClass(FileCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);

    // Delete any previous output so the job can be re-run.
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }

    try {
      job.waitForCompletion(true);
    } catch (InterruptedException ex) {
      //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
    } catch (ClassNotFoundException ex) {
      //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
    }
  }

}
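
One thing I noticed while re-reading the Reducer API: my reduce method takes
Iterator<IntWritable>, but the new-API Reducer declares reduce(Text key,
Iterable<IntWritable> values, Context context). So perhaps my method never
overrides reduce() at all and the default identity reduce runs, which would
explain why every (filename word, 1) pair comes out unaggregated. This is a
minimal sketch of the reducer I am planning to try next, with only the
signature changed to Iterable and a for-each loop (not yet tested):

  public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      // Sum all the 1s emitted by the mapper for this "filename word" key.
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }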


Thanks in advance for the great help and support in fixing this issue.

Please help to fix it.

Thanks a lot.

Regards,
Ranjini


> Hi,
>
> I have a folder named INPUT.
>
> Inside INPUT there are 5 resumes.
>
> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
> Found 5 items
> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/vinitha.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/sony.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/ravi.txt
> hduser@localhost:~/Ranjini$
>
> I have to process the folder and its contents.
>
> I need the output as:
>
> filename   word   occurrence
> vinitha       java       4
> sony          oracle      3
>
>
>
> But I am not getting the filename. As the input file contents are merged,
> the file name is not coming out correct.
>
>
> Please help to fix this issue. I have given my code below.
>
>
>  import java.io.BufferedReader;
>  import java.io.IOException;
>  import java.io.InputStreamReader;
>  import java.util.*;
>  import org.apache.hadoop.conf.*;
>  import org.apache.hadoop.fs.FSDataInputStream;
>  import org.apache.hadoop.fs.FileStatus;
>  import org.apache.hadoop.fs.FileSystem;
>  import org.apache.hadoop.fs.Path;
>  import org.apache.hadoop.io.*;
>  import org.apache.hadoop.mapred.*;
>  import org.apache.hadoop.mapred.lib.*;
>  import org.apache.hadoop.util.*;
>
>  public class WordCount {
>     public static class Map extends MapReduceBase implements
> Mapper<LongWritable, Text, Text, IntWritable> {
>      private final static IntWritable one = new IntWritable(1);
>       private Text word = new Text();
>       public void map(LongWritable key, Text value, OutputCollector<Text,
> IntWritable> output, Reporter reporter) throws IOException {
>    FSDataInputStream fs=null;
>    FileSystem hdfs = null;
>    String line = value.toString();
>          int i=0,k=0;
>   try{
>    Configuration configuration = new Configuration();
>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>
>    Path srcPath = new Path("/user/hduser/INPUT/");
>
>    hdfs = FileSystem.get(configuration);
>    FileStatus[] status = hdfs.listStatus(srcPath);
>    fs=hdfs.open(srcPath);
>    BufferedReader br=new BufferedReader(new
> InputStreamReader(hdfs.open(srcPath)));
>
> String[] splited = line.split("\\s+");
>     for( i=0;i<splited.length;i++)
>  {
>      String sp[]=splited[i].split(",");
>      for( k=0;k<sp.length;k++)
>  {
>
>    if(!sp[k].isEmpty()){
> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
> if((sp[k].equalsIgnoreCase("C"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
> if((sp[k].equalsIgnoreCase("JAVA"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
>       }
>     }
> }
>  } catch (IOException e) {
>     e.printStackTrace();
>  }
> }
> }
>     public static class Reduce extends MapReduceBase implements
> Reducer<Text, IntWritable, Text, IntWritable> {
>       public void reduce(Text key, Iterator<IntWritable> values,
> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
> IOException {
>         int sum = 0;
>         while (values.hasNext()) {
>           sum += values.next().get();
>         }
>         output.collect(key, new IntWritable(sum));
>       }
>     }
>     public static void main(String[] args) throws Exception {
>
>
>       JobConf conf = new JobConf(WordCount.class);
>       conf.setJobName("wordcount");
>       conf.setOutputKeyClass(Text.class);
>       conf.setOutputValueClass(IntWritable.class);
>       conf.setMapperClass(Map.class);
>       conf.setCombinerClass(Reduce.class);
>       conf.setReducerClass(Reduce.class);
>       conf.setInputFormat(TextInputFormat.class);
>       conf.setOutputFormat(TextOutputFormat.class);
>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>       JobClient.runJob(conf);
>     }
>  }
>
>
>
> Please help
>
> Thanks in advance.
>
> Ranjini
>
>
>
> ----------
> From: *Stanley Shi* <sshi@gopivotal.com>
> Date: Thu, Mar 20, 2014 at 7:39 AM
> To: user@hadoop.apache.org
>
>
> You want to do a word count for each file, but the code gives you a word
> count for all the files, right?
>
> =====
>  word.set(tokenizer.nextToken());
>           output.collect(word, one);
> ======
> change it to:
> word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
>
>
>
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Ranjini Rathinam* <ranjinibecse@gmail.com>
> Date: Thu, Mar 20, 2014 at 10:56 AM
> To: ranjini.r@polarisft.com
>
>
>
> ----------
> From: *Ranjini Rathinam* <ranjinibecse@gmail.com>
> Date: Thu, Mar 20, 2014 at 11:20 AM
> To: user@hadoop.apache.org, sshi@gopivotal.com
>
>
> Hi,
>
> If we give the below code,
> =======================
>  word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
> ======================
>
> The output is wrong, because it shows:
>
>  filename   word   occurrence
> vinitha       java       4
> vinitha         oracle      3
> sony           java       4
> sony          oracle      3
>
>
> Here vinitha does not have the word oracle. Similarly, sony does not have
> the word java. The file name is being merged with all the words.
>
> I need the output as given below:
>
>  filename   word   occurrence
>
> vinitha       java       4
> vinitha         C++    3
> sony           ETL     4
> sony          oracle      3
>
>
>  I need the fileName along with the words found in that particular file only.
> No merging should happen.
>
> Please help me out for this issue.
>
> Please help.
>
> Thanks in advance.
>
> Ranjini
>
> ----------
> From: *Felix Chern* <idryman@gmail.com>
> Date: Thu, Mar 20, 2014 at 11:25 PM
> To: user@hadoop.apache.org
> Cc: sshi@gopivotal.com
>
>
>  I've written two blog posts on how to get directory context in a hadoop
> mapper.
>
>
> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>
> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>
> Cheers,
> Felix
>
> ----------
> From: *Stanley Shi* <sshi@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:02 AM
>
> To: Ranjini Rathinam <ranjinibecse@gmail.com>
> Cc: user@hadoop.apache.org
>
>
> Just reviewed the code again: you are not really using map-reduce. You are
> reading all files in one map process; this is not how a normal map-reduce
> job works.
>
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Stanley Shi* <sshi@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:43 AM
> To: Ranjini Rathinam <ranjinibecse@gmail.com>
> Cc: user@hadoop.apache.org
>
>
> Change your mapper to be something like this:
>
>  public static class TokenizerMapper extends
>       Mapper<Object, Text, Text, IntWritable> {
>
>     private final static IntWritable one = new IntWritable(1);
>     private Text word = new Text();
>
>     public void map(Object key, Text value, Context context)
>         throws IOException, InterruptedException {
>       Path pp = ((FileSplit) context.getInputSplit()).getPath();
>       StringTokenizer itr = new StringTokenizer(value.toString());
>       log.info("map on string: " + new String(value.getBytes()));
>       while (itr.hasMoreTokens()) {
>         word.set(pp.getName() + " " + itr.nextToken());
>         context.write(word, one);
>       }
>     }
>   }
>
> Note: add your filtering code here;
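>
> For example, the filtering inside that loop might look something like this
> (just a sketch, keeping the C/JAVA keywords from the original code):
>
>       while (itr.hasMoreTokens()) {
>         String token = itr.nextToken();
>         // keep only the keywords of interest
>         if (token.equalsIgnoreCase("C") || token.equalsIgnoreCase("JAVA")) {
>           word.set(pp.getName() + " " + token);
>           context.write(word, one);
>         }
>       }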
>
> and then when running the command, use your input path as the param;
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Ranjini Rathinam* <ranjinibecse@gmail.com>
> Date: Fri, Mar 21, 2014 at 9:57 AM
> To: ranjini.r@polarisft.com
>
>
>
>
>  ---------- Forwarded message ----------
> From: Stanley Shi <sshi@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:43 AM
> Subject: Re: Need FileName with Content
>
>
> ----------
> From: *Ranjini Rathinam* <ranjinibecse@gmail.com>
> Date: Fri, Mar 21, 2014 at 9:58 AM
> To: ranjini.r@polarisft.com
>
>
>
>
>
