hadoop-mapreduce-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Shaun Martinec <smarti...@gmail.com>
Subject Single Reducer Times Out During Shuffle
Date Fri, 17 Jun 2011 13:20:42 GMT
I have a MapReduce job that is failing occasionally at the reduce
phase. I believe it's during the shuffle phase, but am not positive. I
have copied the end of the job log below. As you can see, I have a
very large number of maps (2910) and only 1 reducer that is used to
cat the results together (/bin/cat) into a single output file. I have
tried increasing "mapred.reduce.parallel.copies" from 5 to 10, but it
still fails in the same manner. I thought of building a reducer (to
replace /bin/cat) that outputs a status every 10,000 rows, but wasn't
sure if that would have any effect on the shuffle phase. The short
task timeout (25 seconds) is necessary for the mappers and cannot be
changed, unfortunately. I've run out of knowledge at this point and
would appreciate any additional insight or solutions. Thanks.

-Shaun

We are running Hadoop 0.20 using Amazon Elastic MapReduce.

...
MapAttempt TASK_TYPE="MAP" TASKID="task_201106170304_0001_m_002847"
TASK_ATTEMPT_ID="attempt_201106170304_0001_m_002847_1"
START_TIME="1308295418235"
TRACKER_NAME="tracker_ip-10-245-135-148\.ec2\.internal:localhost\.localdomain/127\.0\.0\.1:50801"
HTTP_PORT="9103" .
MapAttempt TASK_TYPE="MAP" TASKID="task_201106170304_0001_m_002847"
TASK_ATTEMPT_ID="attempt_201106170304_0001_m_002847_1"
TASK_STATUS="SUCCESS" FINISH_TIME="1308295436505"
HOSTNAME="/default-rack/ip-10-245-135-148\.ec2\.internal"
STATE_STRING="OK: garage parking aid"
COUNTERS="{(SkippingTaskCounters)(SkippingTaskCounters)[(MapProcessedRecords)(MapProcessedRecords)(10)]}{(FileSystemCounters)(FileSystemCounters)[(S3N_BYTES_READ)(S3N_BYTES_READ)(6821)][(FILE_BYTES_WRITTEN)(FILE_BYTES_WRITTEN)(49594)]}{(Custom)(Custom)[(Terms
crawled \\(Google\\))(Terms crawled
\\(Google\\))(10)]}{(org\.apache\.hadoop\.mapred\.Task$Counter)(Map-Reduce
Framework)[(COMBINE_OUTPUT_RECORDS)(Combine output
records)(0)][(MAP_INPUT_RECORDS)(Map input
records)(10)][(SPILLED_RECORDS)(Spilled
Records)(370)][(MAP_OUTPUT_BYTES)(Map output
bytes)(127806)][(MAP_INPUT_BYTES)(Map input
bytes)(1339)][(COMBINE_INPUT_RECORDS)(Combine input
records)(0)][(MAP_OUTPUT_RECORDS)(Map output records)(370)]}" .
Task TASKID="task_201106170304_0001_m_002847" TASK_TYPE="MAP"
TASK_STATUS="SUCCESS" FINISH_TIME="1308295439373"
COUNTERS="{(SkippingTaskCounters)(SkippingTaskCounters)[(MapProcessedRecords)(MapProcessedRecords)(10)]}{(FileSystemCounters)(FileSystemCounters)[(S3N_BYTES_READ)(S3N_BYTES_READ)(6821)][(FILE_BYTES_WRITTEN)(FILE_BYTES_WRITTEN)(49594)]}{(Custom)(Custom)[(Terms
crawled \\(Google\\))(Terms crawled
\\(Google\\))(10)]}{(org\.apache\.hadoop\.mapred\.Task$Counter)(Map-Reduce
Framework)[(COMBINE_OUTPUT_RECORDS)(Combine output
records)(0)][(MAP_INPUT_RECORDS)(Map input
records)(10)][(SPILLED_RECORDS)(Spilled
Records)(370)][(MAP_OUTPUT_BYTES)(Map output
bytes)(127806)][(MAP_INPUT_BYTES)(Map input
bytes)(1339)][(COMBINE_INPUT_RECORDS)(Combine input
records)(0)][(MAP_OUTPUT_RECORDS)(Map output records)(370)]}" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_0"
START_TIME="1308294529811"
TRACKER_NAME="tracker_ip-10-112-62-154\.ec2\.internal:localhost\.localdomain/127\.0\.0\.1:34540"
HTTP_PORT="9103" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_0"
TASK_STATUS="FAILED" FINISH_TIME="1308295482933"
HOSTNAME="ip-10-112-62-154\.ec2\.internal" ERROR="Task
attempt_201106170304_0001_r_000000_0 failed to report status for 27
seconds\. Killing!" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_1"
START_TIME="1308295486373"
TRACKER_NAME="tracker_ip-10-85-67-66\.ec2\.internal:localhost\.localdomain/127\.0\.0\.1:36502"
HTTP_PORT="9103" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_1"
TASK_STATUS="FAILED" FINISH_TIME="1308295579207"
HOSTNAME="ip-10-85-67-66\.ec2\.internal" ERROR="Task
attempt_201106170304_0001_r_000000_1 failed to report status for 27
seconds\. Killing!" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_2"
START_TIME="1308295582660"
TRACKER_NAME="tracker_ip-10-204-29-58\.ec2\.internal:localhost\.localdomain/127\.0\.0\.1:43128"
HTTP_PORT="9103" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_2"
TASK_STATUS="FAILED" FINISH_TIME="1308295671393"
HOSTNAME="ip-10-204-29-58\.ec2\.internal" ERROR="Task
attempt_201106170304_0001_r_000000_2 failed to report status for 27
seconds\. Killing!" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_3"
START_TIME="1308295674391"
TRACKER_NAME="tracker_ip-10-203-67-191\.ec2\.internal:localhost\.localdomain/127\.0\.0\.1:49159"
HTTP_PORT="9103" .
ReduceAttempt TASK_TYPE="REDUCE"
TASKID="task_201106170304_0001_r_000000"
TASK_ATTEMPT_ID="attempt_201106170304_0001_r_000000_3"
TASK_STATUS="FAILED" FINISH_TIME="1308295756816"
HOSTNAME="ip-10-203-67-191\.ec2\.internal" ERROR="Task
attempt_201106170304_0001_r_000000_3 failed to report status for 26
seconds\. Killing!" .
Task TASKID="task_201106170304_0001_r_000000" TASK_TYPE="REDUCE"
TASK_STATUS="FAILED" FINISH_TIME="1308296335470" ERROR="Task
attempt_201106170304_0001_r_000000_9 failed to report status for 27
seconds\. Killing!" TASK_ATTEMPT_ID="" .
Task TASKID="task_201106170304_0001_m_002910" TASK_TYPE="CLEANUP"
START_TIME="1308296338495" SPLITS="" .
MapAttempt TASK_TYPE="CLEANUP"
TASKID="task_201106170304_0001_m_002910"
TASK_ATTEMPT_ID="attempt_201106170304_0001_m_002910_0"
START_TIME="1308296338358"
TRACKER_NAME="tracker_ip-10-196-15-116\.ec2\.internal:localhost\.localdomain/127\.0\.0\.1:37742"
HTTP_PORT="9103" .
MapAttempt TASK_TYPE="CLEANUP"
TASKID="task_201106170304_0001_m_002910"
TASK_ATTEMPT_ID="attempt_201106170304_0001_m_002910_0"
TASK_STATUS="SUCCESS" FINISH_TIME="1308296339573"
HOSTNAME="/default-rack/ip-10-196-15-116\.ec2\.internal"
STATE_STRING="cleanup"
COUNTERS="{(org\.apache\.hadoop\.mapred\.Task$Counter)(Map-Reduce
Framework)[(SPILLED_RECORDS)(Spilled Records)(0)]}" .
Task TASKID="task_201106170304_0001_m_002910" TASK_TYPE="CLEANUP"
TASK_STATUS="SUCCESS" FINISH_TIME="1308296341500"
COUNTERS="{(org\.apache\.hadoop\.mapred\.Task$Counter)(Map-Reduce
Framework)[(SPILLED_RECORDS)(Spilled Records)(0)]}" .
Job JOBID="job_201106170304_0001" FINISH_TIME="1308296341501"
JOB_STATUS="FAILED" FINISHED_MAPS="2910" FINISHED_REDUCES="0" .

Mime
View raw message