hadoop-mapreduce-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From unmesha sreeveni <unmeshab...@gmail.com>
Subject Re: XML parsing in Hadoop
Date Thu, 28 Nov 2013 08:52:42 GMT
What is the size of your input file?


On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <
Chhaya.Vishwakarma@lntinfotech.com> wrote:

> Hi,
>
>
>
> Yes, I have run it without MR, and it takes a few seconds to run. So I think
> it's an MR issue only.
>
> I have a single-node cluster; it's launching 4 map tasks. I am trying with
> only one file.
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
>
>
>
> *From:* Mirko Kämpf [mailto:mirko.kaempf@gmail.com]
> *Sent:* Thursday, November 28, 2013 12:53 PM
> *To:* user@hadoop.apache.org
> *Subject:* Re: XML parsing in Hadoop
>
>
>
> Chhaya,
>
>
>
> did you run the same code in stand alone mode without MapReduce framework?
>
> How long does the code in your map() function take standalone?
>
> Compare those two different times (t_0 MR mode, t_1 standalone mode) to
> find out
>
> if it is a MR issue or something which comes from the xml-parser logic or
> the data ...
>
>
>
> Usually it should not be that slow. But what cluster do you have, how
> many mappers / reducers, and how many of such 2MB files do you have?
>
>
>
> Best wishes
>
> Mirko
>
>
>
>
>
> 2013/11/28 Chhaya Vishwakarma <Chhaya.Vishwakarma@lntinfotech.com>
>
> Hi,
>
>
>
>
>
> The code below parses an XML file. The output of the code is correct, but
> the job takes a long time to complete.
>
> It took 20 hours to parse a 2MB file.
>
> Kindly suggest what changes could be done to increase the performance.
>
>
>
>
>
>
>
> package xml;
>
>
>
> import java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
>
> import java.io.IOException;
>
> import java.util.*;
>
>
>
> import javax.xml.parsers.DocumentBuilder;
>
> import javax.xml.parsers.DocumentBuilderFactory;
>
> import javax.xml.parsers.ParserConfigurationException;
>
> import javax.xml.xpath.XPath;
>
> import javax.xml.xpath.XPathConstants;
>
> import javax.xml.xpath.XPathExpressionException;
>
> import javax.xml.xpath.XPathFactory;
>
>
>
> import org.apache.hadoop.fs.FSDataInputStream;
>
> import org.apache.hadoop.fs.FSInputStream;
>
> import org.apache.hadoop.fs.FileSystem;
>
> import org.apache.hadoop.fs.Path;
>
>
>
> import org.apache.hadoop.conf.*;
>
> import org.apache.hadoop.io.*;
>
>
>
> import org.apache.hadoop.mapred.JobConf;
>
> import org.apache.hadoop.mapreduce.*;
>
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>
>
>
>
> import org.apache.log4j.Logger;
>
> import org.w3c.dom.Document;
>
> import org.w3c.dom.Element;
>
> import org.w3c.dom.NodeList;
>
> import org.xml.sax.SAXException;
>
>
>
>
>
> public class ReadXmlMR
>
> {
>
>                 static Logger log =
> Logger.getLogger(ReadXmlMR.class.getName());
>
>                  public static String fileName = new String();
>
>                  public static Document dom;
>
>                  public void configure(JobConf job) {
>
>          fileName = job.get("map.input.file");
>
> }
>
>
>
>
>
>                 public static class Map extends
> Mapper<LongWritable,Text,Text,Text>
>
>                {
>
>
>
>                                 public void map(LongWritable key, Text
> value,Context context ) throws IOException, InterruptedException
>
>                                 {
>
>                                                 try {
>
>                                                                 FileSplit
> fileSplit = (FileSplit)context.getInputSplit();
>
>
> Configuration conf = context.getConfiguration();
>
>
>
>
> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>
>
>
> FSDataInputStream fstream1;
>
>                                                                 Path file
> = fileSplit.getPath();
>
>                                                 FileSystem fs =
> file.getFileSystem(conf);
>
>                                                 fstream1 =
> fs.open(fileSplit.getPath());
>
>
> DocumentBuilder db = dbf.newDocumentBuilder();
>
>                                                                 dom =
> db.parse(fstream1);
>
>                                                                 Element
> docEle = null;
>
>                                                                 docEle =
> dom.getDocumentElement();
>
>
>
>                                                                 XPath
> xpath = XPathFactory.newInstance().newXPath();
>
>
>
>                                                                 Object
> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>
>
>
>                                                                 NodeList
> nodes = (NodeList) result;
>
>
>
>
>
>                                                                 for (int n
> = 2; n < nodes.getLength(); n++)
>
>
>
>                                                                 {
>
>
> Text colvalue=new Text("");
>
>
> Text nodename= new Text("");
>
>
>
>
> nodename = new Text(nodes.item(n).getNodeName());
>
>
> try{colvalue = new
> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>
>
> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>
>
> context.write(nodename, colvalue);
>
>                                                                 }
>
>
>
>
>
>                                                                 } catch
> (ParserConfigurationException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 } catch
> (SAXException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>
>
>                                                                 } catch
> (XPathExpressionException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 }
>
>
>
>                                                                 }
>
>
>
>                                 }
>
>
>
>
>
>
>
>                 public static void main(String[] args) throws Exception
>
>
>
>                 {
>
>
>
>                 Configuration conf = new Configuration();
>
>
>
>         Job job = new Job(conf, "XmlParsing");
>
>         job.setJarByClass(ReadXmlMR.class);
>
>                 job.setOutputKeyClass(Text.class);
>
>                 job.setOutputValueClass(Text.class);
>
>
>
>
>
>                 job.setMapperClass(Map.class);
>
>
>
>
>
>                 job.setInputFormatClass(TextInputFormat.class);
>
>                 job.setOutputFormatClass(TextOutputFormat.class);
>
>
>
>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>
>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>
>
>
>
>                 job.submit();
>
>
>
>                 job.waitForCompletion(true);
>
>
>
>
>
>                 }
>
>
>
> }
>
>
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
> ------------------------------
>
> The contents of this e-mail and any attachment(s) may contain confidential
> or privileged information for the intended recipient(s). Unintended
> recipients are prohibited from taking action on the basis of information in
> this e-mail and using or disseminating the information, and must notify the
> sender and delete it from their system. L&T Infotech will not accept
> responsibility or liability for the accuracy or completeness of, or the
> presence of any virus or disabling code in this e-mail"
>
>
>



-- 
*Thanks & Regards*

Unmesha Sreeveni U.B

*Junior Developer*

Mime
View raw message