From: Stanley Shi <sshi@gopivotal.com>
To: user@hadoop.apache.org
Subject: Re: Need FileName with Content
Date: Thu, 20 Mar 2014 10:09:34 +0800

You want to do a word count for each file, but the code gives you a word count across all the files, right?

=====
word.set(tokenizer.nextToken());
output.collect(word, one);
======

Change it to:

word.set("filename" + " " + tokenizer.nextToken());
output.collect(word, one);
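To make that concrete: the file name does not have to be hard-coded. With TextInputFormat, each map task's input split is a FileSplit, so the name of the file a record came from can be read from the split itself. Below is a minimal sketch (mine, not from the original message; the class name FileNameWordCountMapper is made up) using the same old org.apache.hadoop.mapred API as the code quoted below:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Sketch only: emits "<filename> <word>" as the key, so the existing
// reducer counts each word per file instead of across all files.
public class FileNameWordCountMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, IntWritable> {

  private static final IntWritable one = new IntWritable(1);
  private final Text word = new Text();

  public void map(LongWritable key, Text value,
      OutputCollector<Text, IntWritable> output, Reporter reporter)
      throws IOException {
    // With TextInputFormat the split handed to this task is a FileSplit,
    // so it identifies the file the current record came from; there is
    // no need to re-open the input directory inside map().
    String fileName = ((FileSplit) reporter.getInputSplit()).getPath().getName();
    StringTokenizer tokenizer = new StringTokenizer(value.toString());
    while (tokenizer.hasMoreTokens()) {
      word.set(fileName + " " + tokenizer.nextToken());
      output.collect(word, one);
    }
  }
}

With the reducer left unchanged, the output lines then look like "vinitha.txt java 4". The old API also exposes the input path as the map.input.file property of the JobConf, readable in configure(), if you prefer not to cast the split.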
Regards,
Stanley Shi

On Wed, Mar 19, 2014 at 8:50 PM, Ranjini Rathinam <ranjinibecse@gmail.com> wrote:

> Hi,
>
> I have a folder named INPUT.
>
> Inside INPUT there are 5 resumes.
>
> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
> Found 5 items
> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20 /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22 /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/vinitha.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/sony.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/ravi.txt
> hduser@localhost:~/Ranjini$
>
> I have to process the folder and its contents.
>
> I need output like:
>
> filename   word     occurrence
> vinitha    java     4
> sony       oracle   3
>
> But I am not getting the filename. Since the contents of the input files are merged, the file names do not come out correctly.
>
> Please help me fix this issue. I have given my code below.
>
> import java.io.BufferedReader;
> import java.io.IOException;
> import java.io.InputStreamReader;
> import java.util.*;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.fs.FSDataInputStream;
> import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapred.*;
> import org.apache.hadoop.mapred.lib.*;
> import org.apache.hadoop.util.*;
>
> public class WordCount {
>
>   public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
>     private final static IntWritable one = new IntWritable(1);
>     private Text word = new Text();
>
>     public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
>       FSDataInputStream fs = null;
>       FileSystem hdfs = null;
>       String line = value.toString();
>       int i = 0, k = 0;
>       try {
>         Configuration configuration = new Configuration();
>         configuration.set("fs.default.name", "hdfs://localhost:4440/");
>
>         Path srcPath = new Path("/user/hduser/INPUT/");
>
>         hdfs = FileSystem.get(configuration);
>         FileStatus[] status = hdfs.listStatus(srcPath);
>         fs = hdfs.open(srcPath);
>         BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(srcPath)));
>
>         String[] splited = line.split("\\s+");
>         for (i = 0; i < splited.length; i++) {
>           String sp[] = splited[i].split(",");
>           for (k = 0; k < sp.length; k++) {
>             if (!sp[k].isEmpty()) {
>               StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>               if ((sp[k].equalsIgnoreCase("C"))) {
>                 while (tokenizer.hasMoreTokens()) {
>                   word.set(tokenizer.nextToken());
>                   output.collect(word, one);
>                 }
>               }
>               if ((sp[k].equalsIgnoreCase("JAVA"))) {
>                 while (tokenizer.hasMoreTokens()) {
>                   word.set(tokenizer.nextToken());
>                   output.collect(word, one);
>                 }
>               }
>             }
>           }
>         }
>       } catch (IOException e) {
>         e.printStackTrace();
>       }
>     }
>   }
>
>   public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
>     public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
>       int sum = 0;
>       while (values.hasNext()) {
>         sum += values.next().get();
>       }
>       output.collect(key, new IntWritable(sum));
>     }
>   }
>
>   public static void main(String[] args) throws Exception {
>     JobConf conf = new JobConf(WordCount.class);
>     conf.setJobName("wordcount");
>     conf.setOutputKeyClass(Text.class);
>     conf.setOutputValueClass(IntWritable.class);
>     conf.setMapperClass(Map.class);
>     conf.setCombinerClass(Reduce.class);
>     conf.setReducerClass(Reduce.class);
>     conf.setInputFormat(TextInputFormat.class);
>     conf.setOutputFormat(TextOutputFormat.class);
>     FileInputFormat.setInputPaths(conf, new Path(args[0]));
>     FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>     JobClient.runJob(conf);
>   }
> }
>
> Please help.
>
> Thanks in advance.
>
> Ranjini