From: Stanley Shi <sshi@gopivotal.com>
To: user@hadoop.apache.org
Subject: Re: Need FileName with Content
Date: Thu, 20 Mar 2014 10:09:34 +0800

You want to do a word count for each file, but the code gives you a word count across all the files, right?

=====
word.set(tokenizer.nextToken());
output.collect(word, one);
======

Change it to:

word.set("filename" + " " + tokenizer.nextToken());
output.collect(word, one);
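To make that concrete: the file name does not have to be hard-coded. With TextInputFormat, each map task's input split is a FileSplit, so the name of the file a record came from can be read from the split itself. Below is a minimal sketch (mine, not from the original message; the class name FileNameWordCountMapper is made up) using the same old org.apache.hadoop.mapred API as the code quoted below:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Sketch only: emits "<filename> <word>" as the key, so the existing
// reducer counts each word per file instead of across all files.
public class FileNameWordCountMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, IntWritable> {

  private static final IntWritable one = new IntWritable(1);
  private final Text word = new Text();

  public void map(LongWritable key, Text value,
      OutputCollector<Text, IntWritable> output, Reporter reporter)
      throws IOException {
    // With TextInputFormat the split handed to this task is a FileSplit,
    // so it identifies the file the current record came from; there is
    // no need to re-open the input directory inside map().
    String fileName = ((FileSplit) reporter.getInputSplit()).getPath().getName();
    StringTokenizer tokenizer = new StringTokenizer(value.toString());
    while (tokenizer.hasMoreTokens()) {
      word.set(fileName + " " + tokenizer.nextToken());
      output.collect(word, one);
    }
  }
}

With the reducer left unchanged, the output lines then look like "vinitha.txt java 4". The old API also exposes the input path as the map.input.file property of the JobConf, readable in configure(), if you prefer not to cast the split.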
Regards,
Stanley Shi

On Wed, Mar 19, 2014 at 8:50 PM, Ranjini Rathinam <ranjinibecse@gmail.com> wrote:

> Hi,
>
> I have a folder named INPUT.
>
> Inside INPUT there are 5 resumes.
>
> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
> Found 5 items
> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20 /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22 /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/vinitha.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/sony.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/ravi.txt
> hduser@localhost:~/Ranjini$
>
> I have to process the folder and its contents.
>
> I need output like:
>
> filename   word     occurrence
> vinitha    java     4
> sony       oracle   3
>
> But I am not getting the filename. Since the contents of the input files are merged, the file names do not come out correctly.
>
> Please help me fix this issue. I have given my code below.
>
> import java.io.BufferedReader;
> import java.io.IOException;
> import java.io.InputStreamReader;
> import java.util.*;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.fs.FSDataInputStream;
> import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapred.*;
> import org.apache.hadoop.mapred.lib.*;
> import org.apache.hadoop.util.*;
>
> public class WordCount {
>
>   public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
>     private final static IntWritable one = new IntWritable(1);
>     private Text word = new Text();
>
>     public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
>       FSDataInputStream fs = null;
>       FileSystem hdfs = null;
>       String line = value.toString();
>       int i = 0, k = 0;
>       try {
>         Configuration configuration = new Configuration();
>         configuration.set("fs.default.name", "hdfs://localhost:4440/");
>
>         Path srcPath = new Path("/user/hduser/INPUT/");
>
>         hdfs = FileSystem.get(configuration);
>         FileStatus[] status = hdfs.listStatus(srcPath);
>         fs = hdfs.open(srcPath);
>         BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(srcPath)));
>
>         String[] splited = line.split("\\s+");
>         for (i = 0; i < splited.length; i++) {
>           String sp[] = splited[i].split(",");
>           for (k = 0; k < sp.length; k++) {
>             if (!sp[k].isEmpty()) {
>               StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>               if ((sp[k].equalsIgnoreCase("C"))) {
>                 while (tokenizer.hasMoreTokens()) {
>                   word.set(tokenizer.nextToken());
>                   output.collect(word, one);
>                 }
>               }
>               if ((sp[k].equalsIgnoreCase("JAVA"))) {
>                 while (tokenizer.hasMoreTokens()) {
>                   word.set(tokenizer.nextToken());
>                   output.collect(word, one);
>                 }
>               }
>             }
>           }
>         }
>       } catch (IOException e) {
>         e.printStackTrace();
>       }
>     }
>   }
>
>   public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
>     public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
>       int sum = 0;
>       while (values.hasNext()) {
>         sum += values.next().get();
>       }
>       output.collect(key, new IntWritable(sum));
>     }
>   }
>
>   public static void main(String[] args) throws Exception {
>     JobConf conf = new JobConf(WordCount.class);
>     conf.setJobName("wordcount");
>     conf.setOutputKeyClass(Text.class);
>     conf.setOutputValueClass(IntWritable.class);
>     conf.setMapperClass(Map.class);
>     conf.setCombinerClass(Reduce.class);
>     conf.setReducerClass(Reduce.class);
>     conf.setInputFormat(TextInputFormat.class);
>     conf.setOutputFormat(TextOutputFormat.class);
>     FileInputFormat.setInputPaths(conf, new Path(args[0]));
>     FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>     JobClient.runJob(conf);
>   }
> }
>
> Please help.
>
> Thanks in advance.
>
> Ranjini