hadoop-common-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Ranjini Rathinam <ranjinibe...@gmail.com>
Subject Re: XML to TEXT
Date Mon, 06 Jan 2014 09:14:50 GMT
Hi,

Thanks a lot .

Ranjini

On Fri, Jan 3, 2014 at 10:40 PM, Diego Gutierrez <
diego.gutierrez@ucsp.edu.pe> wrote:

>  Hi,
>
> I suggest using XPath, which Java supports natively for querying XML
> documents.
>
> For the main problem, as with the distcp command (
> http://hadoop.apache.org/docs/r0.19.0/distcp.pdf ), there is no need for a
> reduce function, because you can parse the XML input file and create the
> file you need in the map function. For example, the following code reads an
> XML file in HDFS, parses it, and creates a new file ("/result.txt") with the
> expected format:
> id,name
> 100,RR
>
>
> Mapper function:
>
> import java.io.ByteArrayInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.net.URI;
>
> import javax.xml.namespace.QName;
> import javax.xml.parsers.DocumentBuilder;
> import javax.xml.parsers.DocumentBuilderFactory;
> import javax.xml.parsers.ParserConfigurationException;
> import javax.xml.xpath.XPath;
> import javax.xml.xpath.XPathConstants;
> import javax.xml.xpath.XPathExpressionException;
> import javax.xml.xpath.XPathFactory;
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FSDataOutputStream;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.IOUtils;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.mapreduce.Mapper;
> import org.w3c.dom.Document;
> import org.w3c.dom.Node;
> import org.w3c.dom.NodeList;
> import org.xml.sax.SAXException;
>
> import com.sun.org.apache.xml.internal.dtm.ref.DTMNodeList;
>
> public class XmlToTextMapper extends Mapper<LongWritable, Text, Text,
> Text> {
>
>     private static final XPathFactory xpathFactory =
> XPathFactory.newInstance();
>
>     @Override
>     public void map(LongWritable key, Text value, Context context)
>             throws IOException, InterruptedException {
>
>         String resultFileName = "/result.txt";
>
>
>         Configuration conf = new Configuration();
>         FileSystem fs = FileSystem.get(URI.create(resultFileName), conf);
>         FSDataOutputStream out = fs.create(new Path(resultFileName));
>
>         InputStream resultIS = new ByteArrayInputStream(new byte[0]);
>
>         String header = "id,name\n";
>         out.write(header.getBytes());
>
>         String xmlContent = value.toString();
>         InputStream is = new ByteArrayInputStream(xmlContent.getBytes());
>         DocumentBuilderFactory factory =
> DocumentBuilderFactory.newInstance();
>         DocumentBuilder builder;
>         try {
>             builder = factory.newDocumentBuilder();
>             Document doc = builder.parse(is);
>             DTMNodeList list = (DTMNodeList) getNode("/main/data", doc,
>                     XPathConstants.NODESET);
>
>             int size = list.getLength();
>             for (int i = 0; i < size; i++) {
>                 Node node = list.item(i);
>                 String line = "";
>                 NodeList nodeList = node.getChildNodes();
>                 int childNumber = nodeList.getLength();
>                 for (int j = 0; j < childNumber; j++) {
>                     line += nodeList.item(j).getTextContent() + ",";
>                 }
>                 if (line.endsWith(","))
>                     line = line.substring(0, line.length() - 1);
>                 line += "\n";
>                 out.write(line.getBytes());
>
>             }
>
>         } catch (ParserConfigurationException e) {
>             MyLogguer.log("error: " + e.getMessage());
>             e.printStackTrace();
>         } catch (SAXException e) {
>             MyLogguer.log("error: " + e.getMessage());
>             e.printStackTrace();
>         } catch (XPathExpressionException e) {
>             MyLogguer.log("error: " + e.getMessage());
>             e.printStackTrace();
>         }
>
>         IOUtils.copyBytes(resultIS, out, 4096, true);
>         out.close();
>     }
>
>     public static Object getNode(String xpathStr, Node node, QName
> retunType)
>             throws XPathExpressionException {
>         XPath xpath = xpathFactory.newXPath();
>         return xpath.evaluate(xpathStr, node, retunType);
>     }
> }
>
>
>
> --------------------------------------
> Main class:
>
>
> /**
>  * Driver that configures and submits the map-only "XML to Text" job.
>  *
>  * <p>NOTE(review): the imports for Job, FileInputFormat, FileOutputFormat,
>  * Path and Text are not shown with this snippet and must be added before
>  * the class compiles.
>  */
> public class Main {
>
>     /**
>      * Submits the job, waits for it to finish, and exits with its status.
>      *
>      * @param args the input path followed by the output path
>      * @throws Exception if the job cannot be configured or run
>      */
>     public static void main(String[] args) throws Exception {
>
>         if (args.length != 2) {
>             // The literal is kept on one line: a Java string literal
>             // cannot span a line break.
>             System.err.println(
>                     "Usage: XMLtoText <input path> <output path>");
>             System.exit(-1);
>         }
>
>         Job job = new Job();
>         job.setJarByClass(Main.class);
>         job.setJobName("XML to Text");
>         FileInputFormat.addInputPath(job, new Path(args[0]));
>         FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>         job.setMapperClass(XmlToTextMapper.class);
>         // No reduce tasks: the job runs the mapper only.
>         job.setNumReduceTasks(0);
>         job.setMapOutputKeyClass(Text.class);
>         job.setMapOutputValueClass(Text.class);
>
>         // Exit with 0 when the job succeeds and 1 otherwise.
>         System.exit(job.waitForCompletion(true) ? 0 : 1);
>     }
> }
>
> To execute the job you can use :
>
>          bin/hadoop Main /data.xml /output.
>
>
> Then you can use this to see result.txt file:
>
>           hadoop fs -cat /result.txt
>
>
> I'm using this xml as input:
>
>
> <main><data><id>1</id><name>NameA</name></data><data><id>2</id><name>NameB</name></data></main>
>
> and the content in result.txt is like this:
>
> id,name
> 1,NameA
> 2,NameB
>
>
> Hope this helps.
>
>
> 2014/1/3 Ranjini Rathinam <ranjinibecse@gmail.com>
>
>> Hi,
>>
>> Need to convert XML into text using mapreduce.
>>
>> I have used DOM and SAX parser.
>>
>> After using the SAX builder in the mapper class, the child node acts as
>> the root element.
>>
>> Looking at the System.out output, I found that the root element is taking
>> the child element and printing it.
>>
>> For Eg,
>>
>> <Comp><Emp><id>100</id><name>RR</name></Emp></Comp>
>> When this XML is passed to the mapper and the root element is printed to
>> System.out,
>> I am getting the root element as
>>
>> <id>
>> <name>
>>
>> Please suggest and help to fix this.
>>
>> I need to convert the xml into text using mapreduce code. Please provide
>> with example.
>>
>> Required output is
>>
>> id,name
>> 100,RR
>>
>> Please help.
>>
>> Thanks in advance,
>> Ranjini R
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
>

Mime
View raw message