hadoop-common-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Wasim Bari <wasimb...@msn.com>
Subject Customized InputFormat
Date Tue, 18 Aug 2009 14:35:13 GMT



   I tried another way to implement the InputFormat, which returns <Key, MultipleLines>
as a record to the mapper.


I used this logic: I used a LineRecordReader to read the file line by line and kept storing these
lines in a buffer.

When I encounter an empty string, I set the buffer as the value and return the result. Please see
the attached code.



But I get a Java heap error. Apparently it is because of the buffer writing, but the data is not so big
and I am unable to find the solution.


Please have a look and guide me. 





package initial;

import java.io.IOException;

import org.apache.hadoop.io.DataOutputBuffer;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.FileSplit;

import org.apache.hadoop.mapred.InputSplit;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapred.RecordReader;

import org.apache.hadoop.mapred.Reporter;

import org.apache.hadoop.mapred.TextInputFormat;

import org.apache.log4j.Logger;


public class PTextInputFormat1 extends TextInputFormat {


public void configure(JobConf jobConf) {



public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit, JobConf

Reporter reporter) throws IOException {

return new PTextRecordReader((FileSplit) inputSplit, jobConf);


public static class PTextRecordReader implements RecordReader<LongWritable, Text> {

private static final Logger sLogger = Logger.getLogger(PTextRecordReader.class);

private DataOutputBuffer buffer = new DataOutputBuffer();

private JobConf job;

private FileSplit FSplit;

private long start;

private long end;

private int count;

org.apache.hadoop.mapred.LineRecordReader lineRecordReader;

public PTextRecordReader(FileSplit split, JobConf jobConf) throws IOException {


start = split.getStart();

job = jobConf;

lineRecordReader = new org.apache.hadoop.mapred.LineRecordReader(job,FSplit);

end = start + split.getLength();


public boolean next(LongWritable key, Text value) throws IOException { 

if (lineRecordReader.next(key, value)){

while (value.toString().length()!=0){





value.set(buffer.getData(), 0, buffer.getLength());


return true;



return false;


public LongWritable createKey() {

return new LongWritable();


public Text createValue() {

return new Text();


public long getStart() {

return start;


public long getEnd() {

return end;


public long getPos() throws IOException {

return lineRecordReader.getPos();


public float getProgress() throws IOException {

return lineRecordReader.getProgress();



public void close() throws IOException {





  • Unnamed multipart/mixed (inline, None, 0 bytes)
View raw message