hadoop-common-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Devaraj K <deva...@apache.org>
Subject Re: XML parsing in Hadoop
Date Thu, 28 Nov 2013 08:09:01 GMT
Hi,

Here, the map() function is called once for every (key, value) pair, i.e. once
for every line of the input split, because the job uses TextInputFormat. The XML
parsing code you have written inside map() is therefore executed for every line
of your input, re-opening and re-parsing the whole file each time, which is what
is causing the problem.

You can write a custom InputFormat that reads the XML file, instead of parsing
it in map(). Alternatively, you could override Mapper.run(Context context) and
place the parsing code there, so that it runs only once per map task (i.e. once
per input split) rather than once per line.


On Thu, Nov 28, 2013 at 12:15 PM, Chhaya Vishwakarma <
Chhaya.Vishwakarma@lntinfotech.com> wrote:

>  Hi,
>
>
>
>
>
> The code below parses an XML file. The output of the code is correct, but
> the job takes a long time to complete.
>
> It took 20 hours to parse a 2 MB file.
>
> Kindly suggest what changes could be made to improve the performance.
>
>
>
>
>
>
>
> package xml;
>
>
>
> import java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
>
> import java.io.IOException;
>
> import java.util.*;
>
>
>
> import javax.xml.parsers.DocumentBuilder;
>
> import javax.xml.parsers.DocumentBuilderFactory;
>
> import javax.xml.parsers.ParserConfigurationException;
>
> import javax.xml.xpath.XPath;
>
> import javax.xml.xpath.XPathConstants;
>
> import javax.xml.xpath.XPathExpressionException;
>
> import javax.xml.xpath.XPathFactory;
>
>
>
> import org.apache.hadoop.fs.FSDataInputStream;
>
> import org.apache.hadoop.fs.FSInputStream;
>
> import org.apache.hadoop.fs.FileSystem;
>
> import org.apache.hadoop.fs.Path;
>
>
>
> import org.apache.hadoop.conf.*;
>
> import org.apache.hadoop.io.*;
>
>
>
> import org.apache.hadoop.mapred.JobConf;
>
> import org.apache.hadoop.mapreduce.*;
>
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>
>
>
>
> import org.apache.log4j.Logger;
>
> import org.w3c.dom.Document;
>
> import org.w3c.dom.Element;
>
> import org.w3c.dom.NodeList;
>
> import org.xml.sax.SAXException;
>
>
>
>
>
> public class ReadXmlMR
>
> {
>
>                 static Logger log =
> Logger.getLogger(ReadXmlMR.class.getName());
>
>                  public static String fileName = new String();
>
>                  public static Document dom;
>
>                  public void configure(JobConf job) {
>
>          fileName = job.get("map.input.file");
>
> }
>
>
>
>
>
>                 public static class Map extends
> Mapper<LongWritable,Text,Text,Text>
>
>                {
>
>
>
>                                 public void map(LongWritable key, Text
> value,Context context ) throws IOException, InterruptedException
>
>                                 {
>
>                                                 try {
>
>                                                                 FileSplit
> fileSplit = (FileSplit)context.getInputSplit();
>
>
> Configuration conf = context.getConfiguration();
>
>
>
>
> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>
>
>
> FSDataInputStream fstream1;
>
>                                                                 Path file
> = fileSplit.getPath();
>
>                                                 FileSystem fs =
> file.getFileSystem(conf);
>
>                                                 fstream1 =
> fs.open(fileSplit.getPath());
>
>
> DocumentBuilder db = dbf.newDocumentBuilder();
>
>                                                                 dom =
> db.parse(fstream1);
>
>                                                                 Element
> docEle = null;
>
>                                                                 docEle =
> dom.getDocumentElement();
>
>
>
>                                                                 XPath
> xpath = XPathFactory.newInstance().newXPath();
>
>
>
>                                                                 Object
> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>
>
>
>                                                                 NodeList
> nodes = (NodeList) result;
>
>
>
>
>
>                                                                 for (int n
> = 2; n < nodes.getLength(); n++)
>
>
>
>                                                                 {
>
>
> Text colvalue=new Text("");
>
>
> Text nodename= new Text("");
>
>
>
>
> nodename = new Text(nodes.item(n).getNodeName());
>
>
> try{colvalue = new
> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>
>
> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>
>
> context.write(nodename, colvalue);
>
>                                                                 }
>
>
>
>
>
>                                                                 } catch
> (ParserConfigurationException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 } catch
> (SAXException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>
>
>                                                                 } catch
> (XPathExpressionException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 }
>
>
>
>                                                                 }
>
>
>
>                                 }
>
>
>
>
>
>
>
>                 public static void main(String[] args) throws Exception
>
>
>
>                 {
>
>
>
>                 Configuration conf = new Configuration();
>
>
>
>         Job job = new Job(conf, "XmlParsing");
>
>         job.setJarByClass(ReadXmlMR.class);
>
>                 job.setOutputKeyClass(Text.class);
>
>                 job.setOutputValueClass(Text.class);
>
>
>
>
>
>                 job.setMapperClass(Map.class);
>
>
>
>
>
>                 job.setInputFormatClass(TextInputFormat.class);
>
>                 job.setOutputFormatClass(TextOutputFormat.class);
>
>
>
>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>
>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>
>
>
>
>                 job.submit();
>
>
>
>                 job.waitForCompletion(true);
>
>
>
>
>
>                 }
>
>
>
> }
>
>
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
> ------------------------------
> The contents of this e-mail and any attachment(s) may contain confidential
> or privileged information for the intended recipient(s). Unintended
> recipients are prohibited from taking action on the basis of information in
> this e-mail and using or disseminating the information, and must notify the
> sender and delete it from their system. L&T Infotech will not accept
> responsibility or liability for the accuracy or completeness of, or the
> presence of any virus or disabling code in this e-mail"
>



-- 


Thanks
Devaraj K

Mime
View raw message