hawq-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Ruilong Huo (JIRA)" <j...@apache.org>
Subject [jira] [Created] (HAWQ-1487) hang process due to deadlock when it try to process interrupt in error handling
Date Fri, 16 Jun 2017 02:40:00 GMT
Ruilong Huo created HAWQ-1487:
---------------------------------

             Summary: hang process due to deadlock when it try to process interrupt in error
handling
                 Key: HAWQ-1487
                 URL: https://issues.apache.org/jira/browse/HAWQ-1487
             Project: Apache HAWQ
          Issue Type: Bug
          Components: Query Execution
            Reporter: Ruilong Huo
            Assignee: Lei Chang


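A process hangs when it tries to process an interrupt during error handling. Specifically, a
QE encounters a division-by-zero error and errors out. During the error processing, it
tries to handle a query-cancel interrupt, and a deadlock occurs: as the call stack below shows,
the cancel signal handler runs while errstart() is already inside backtrace(), and it calls
backtrace() again, which then waits forever on the mutex in _Unwind_Find_FDE().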

The hung process is:
{noformat}
$ hawq ssh -f hostfile -e "ps -ef | grep postgres | grep -v grep"
gpadmin   51246  51245  0 06:15 ?        00:00:01 postgres: port 20100, logger p
gpadmin   51249  51245  0 06:15 ?        00:00:00 postgres: port 20100, stats co
gpadmin   51250  51245  0 06:15 ?        00:00:07 postgres: port 20100, writer p
gpadmin   51251  51245  0 06:15 ?        00:00:01 postgres: port 20100, checkpoi
gpadmin   51252  51245  0 06:15 ?        00:00:11 postgres: port 20100, segment
gpadmin  182983  51245  0 07:00 ?        00:00:03 postgres: port 20100, hawqsupe

$ ps -ef | grep postgres | grep -v grep
gpadmin   51245      1  0 06:15 ?        00:01:01 /usr/local/hawq_2_2_0_0/bin/postgres -D
/data/pulse-agent-data/HAWQ-main-FeatureTest-opt-Multinode-parallel/product/segmentdd -i -M
segment -p 20100 --silent-mode=true
gpadmin   51246  51245  0 06:15 ?        00:00:01 postgres: port 20100, logger process
gpadmin   51249  51245  0 06:15 ?        00:00:00 postgres: port 20100, stats collector process
gpadmin   51250  51245  0 06:15 ?        00:00:07 postgres: port 20100, writer process
gpadmin   51251  51245  0 06:15 ?        00:00:01 postgres: port 20100, checkpoint process
gpadmin   51252  51245  0 06:15 ?        00:00:11 postgres: port 20100, segment resource manager
gpadmin  182983  51245  0 07:00 ?        00:00:03 postgres: port 20100, hawqsuperuser olap_winow...
10.32.34.225(45462) con4405 seg0 cmd2 slice7 MPPEXEC SELECT
gpadmin  194424 194402  0 23:50 pts/0    00:00:00 grep postgres
{noformat}

The call stack is:
{noformat}
$ sudo gdb -p 182983
(gdb) bt
#0  0x0000003ff060e2e4 in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x0000003ff0609588 in _L_lock_854 () from /lib64/libpthread.so.0
#2  0x0000003ff0609457 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x0000003ff221206a in _Unwind_Find_FDE () from /lib64/libgcc_s.so.1
#4  0x0000003ff220f603 in ?? () from /lib64/libgcc_s.so.1
#5  0x0000003ff220ff49 in ?? () from /lib64/libgcc_s.so.1
#6  0x0000003ff22100e7 in _Unwind_Backtrace () from /lib64/libgcc_s.so.1
#7  0x0000003ff02fe966 in backtrace () from /lib64/libc.so.6
#8  0x00000000009cda3f in errstart (elevel=20, filename=0xd309e0 "postgres.c", lineno=3618,
    funcname=0xd32fc0 "ProcessInterrupts", domain=0x0) at elog.c:492
#9  0x00000000008e8fcb in ProcessInterrupts () at postgres.c:3616
#10 0x00000000008e8c9e in StatementCancelHandler (postgres_signal_arg=2) at postgres.c:3463
#11 <signal handler called>
#12 0x0000003ff0609451 in pthread_mutex_lock () from /lib64/libpthread.so.0
#13 0x0000003ff221206a in _Unwind_Find_FDE () from /lib64/libgcc_s.so.1
#14 0x0000003ff220f603 in ?? () from /lib64/libgcc_s.so.1
#15 0x0000003ff2210119 in _Unwind_Backtrace () from /lib64/libgcc_s.so.1
#16 0x0000003ff02fe966 in backtrace () from /lib64/libc.so.6
#17 0x00000000009cda3f in errstart (elevel=20, filename=0xd3ba00 "float.c", lineno=839, funcname=0xd3bf3a
"float8div",
    domain=0x0) at elog.c:492
#18 0x0000000000921a84 in float8div (fcinfo=0x7ffd04d2b8b0) at float.c:836
#19 0x0000000000722fe5 in ExecMakeFunctionResult (fcache=0x324a088, econtext=0x32495d8, isNull=0x7ffd04d2c0e0
"\030",
    isDone=0x7ffd04d2bd04) at execQual.c:1762
#20 0x0000000000723d87 in ExecEvalOper (fcache=0x324a088, econtext=0x32495d8, isNull=0x7ffd04d2c0e0
"\030",
    isDone=0x7ffd04d2bd04) at execQual.c:2250
#21 0x0000000000722451 in ExecEvalFuncArgs (fcinfo=0x7ffd04d2bda0, argList=0x324b378, econtext=0x32495d8)
at execQual.c:1317
#22 0x0000000000722a68 in ExecMakeFunctionResult (fcache=0x3249850, econtext=0x32495d8,
    isNull=0x7ffd04d2c5c1 "\306\322\004\375\177", isDone=0x0) at execQual.c:1532
#23 0x0000000000723d1e in ExecEvalFunc (fcache=0x3249850, econtext=0x32495d8, isNull=0x7ffd04d2c5c1
"\306\322\004\375\177",
    isDone=0x0) at execQual.c:2228
#24 0x000000000076eed2 in initFcinfo (wrxstate=0x31b8fe0, fcinfo=0x7ffd04d2c280, funcstate=0x7f83c7412318,
econtext=0x32495d8,
    check_nulls=1 '\001') at nodeWindow.c:3201
#25 0x000000000076efa4 in add_tuple_to_trans (funcstate=0x7f83c7412318, wstate=0x3248ab8,
econtext=0x32495d8,
    check_nulls=1 '\001') at nodeWindow.c:3223
#26 0x0000000000772f72 in processTupleSlot (wstate=0x3248ab8, slot=0x31ac150, last_peer=0
'\000') at nodeWindow.c:5105
#27 0x0000000000772760 in ExecWindow (wstate=0x3248ab8) at nodeWindow.c:4821
---Type <return> to continue, or q <return> to quit---
#28 0x000000000071eda7 in ExecProcNode (node=0x3248ab8) at execProcnode.c:1007
#29 0x000000000075aded in NextInputSlot (node=0x31af928) at nodeResult.c:95
#30 0x000000000075afba in ExecResult (node=0x31af928) at nodeResult.c:194
#31 0x000000000071eb64 in ExecProcNode (node=0x31af928) at execProcnode.c:891
#32 0x000000000075aded in NextInputSlot (node=0x31ae608) at nodeResult.c:95
#33 0x000000000075afba in ExecResult (node=0x31ae608) at nodeResult.c:194
#34 0x000000000071eb64 in ExecProcNode (node=0x31ae608) at execProcnode.c:891
#35 0x00000000007574e1 in execMotionSender (node=0x31adf80) at nodeMotion.c:364
#36 0x00000000007573cf in ExecMotion (node=0x31adf80) at nodeMotion.c:331
#37 0x000000000071ed80 in ExecProcNode (node=0x31adf80) at execProcnode.c:999
#38 0x00000000007180fc in ExecutePlan (estate=0x31b70d8, planstate=0x31adf80, operation=CMD_SELECT,
numberTuples=0,
    direction=ForwardScanDirection, dest=0x7f83c74609f0) at execMain.c:3199
#39 0x0000000000714450 in ExecutorRun (queryDesc=0x31c0c40, direction=ForwardScanDirection,
count=0) at execMain.c:1197
#40 0x00000000008f0b5b in PortalRunSelect (portal=0x31ba158, forward=1 '\001', count=0, dest=0x7f83c74609f0)
at pquery.c:1730
#41 0x00000000008f073b in PortalRun (portal=0x31ba158, count=9223372036854775807, isTopLevel=1
'\001', dest=0x7f83c74609f0,
    altdest=0x7f83c74609f0, completionTag=0x7ffd04d2d000 "") at pquery.c:1552
#42 0x00000000008e5878 in exec_mpp_query (
    query_string=0x31edeca "SELECT sale.vn,sale.qty,sale.qty, TO_CHAR(COALESCE(REGR_AVGY(floor(sale.vn*sale.vn),floor(sale.prc+sale.pn))
OVER(win1),0),'99999999.9999999'),sale.dt,\nTO_CHAR(COALESCE(MIN(floor(sale.qty)) OVER(win1)"...,
    serializedQuerytree=0x0, serializedQuerytreelen=0, serializedPlantree=0x31ee32f "P{",
serializedPlantreelen=4076,
    serializedParams=0x0, serializedParamslen=0, serializedSliceInfo=0x31ef31b "a\020", serializedSliceInfolen=1057,
    serializedResource=0x31ef78a "\260", serializedResourceLen=50, seqServerHost=0x31ef7bc
"10.32.34.225", seqServerPort=19765,
    localSlice=7) at postgres.c:1487
#43 0x00000000008eb6b8 in PostgresMain (argc=270, argv=0x3060678, username=0x303a938 "hawqsuperuser")
at postgres.c:5080
#44 0x00000000008947f7 in BackendRun (port=0x300b5f0) at postmaster.c:5915
#45 0x0000000000893c16 in BackendStartup (port=0x300b5f0) at postmaster.c:5484
#46 0x000000000088dc92 in ServerLoop () at postmaster.c:2163
#47 0x000000000088cc9f in PostmasterMain (argc=9, argv=0x3013d10) at postmaster.c:1454
#48 0x00000000007aa05b in main (argc=9, argv=0x3013d10) at main.c:226
{noformat}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message