NotificationsYou must be signed in to change notification settings
Fork28
Star153

Commit97c667c

authored and

committed

Improve the deadlock detection algorithm by taking into account hidden dependencies between transactions caused by a lack of vacant workers in the apply pool

1 parent46a5c82 commit97c667cCopy full SHA for 97c667c

File tree

4 files changed

+69

-26

lines changed

4 files changed

+69

-26

lines changed

`‎bgwpool.c‎`

Lines changed: 20 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,9 @@ static void BgwPoolMainLoop(Datum arg)`
`35`	`35`	`work=malloc(size);`
`36`	`36`	`pool->pending-=1;`
`37`	`37`	`pool->active+=1;`
	`38`	`+if (pool->lastPeakTime==0&&pool->active==pool->nWorkers&&pool->pending!=0) {`
	`39`	`+pool->lastPeakTime=MtmGetSystemTime();`
	`40`	`+}`
`38`	`41`	`if (pool->head+size+4>pool->size) {`
`39`	`42`	`memcpy(work,pool->queue,size);`
`40`	`43`	`pool->head=INTALIGN(size);`
`@@ -48,17 +51,19 @@ static void BgwPoolMainLoop(Datum arg)`
`48`	`51`	`if (pool->producerBlocked) {`
`49`	`52`	`pool->producerBlocked= false;`
`50`	`53`	`PGSemaphoreUnlock(&pool->overflow);`
	`54`	`+pool->lastPeakTime=0;`
`51`	`55`	`}`
`52`	`56`	`SpinLockRelease(&pool->lock);`
`53`	`57`	`pool->executor(id,work,size);`
`54`	`58`	`free(work);`
`55`	`59`	`SpinLockAcquire(&pool->lock);`
`56`	`60`	`pool->active-=1;`
	`61`	`+pool->lastPeakTime=0;`
`57`	`62`	`SpinLockRelease(&pool->lock);`
`58`	`63`	`}`
`59`	`64`	`}`
`60`	`65`
`61`		`-void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize)`
	`66`	`+void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize, size_t nWorkers)`
`62`	`67`	`{`
`63`	`68`	`pool->queue= (char*)ShmemAlloc(queueSize);`
`64`	`69`	`pool->executor=executor;`
`@@ -73,8 +78,15 @@ void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, si`
`73`	`78`	`pool->size=queueSize;`
`74`	`79`	`pool->active=0;`
`75`	`80`	`pool->pending=0;`
	`81`	`+pool->nWorkers=nWorkers;`
	`82`	`+pool->lastPeakTime=0;`
`76`	`83`	`strcpy(pool->dbname,dbname);`
`77`	`84`	`}`
	`85`	`+`
	`86`	`+timestamp_tBgwGetLastPeekTime(BgwPool*pool)`
	`87`	`+{`
	`88`	`+returnpool->lastPeakTime;`
	`89`	`+}`
`78`	`90`
`79`	`91`	`voidBgwPoolStart(intnWorkers,BgwPoolConstructorconstructor)`
`80`	`92`	`{`
`@@ -123,12 +135,18 @@ void BgwPoolExecute(BgwPool* pool, void* work, size_t size)`
`123`	`135`	`if ((pool->head <=pool->tail&&pool->size-pool->tail<size+4&&pool->head<size)`
`124`	`136`	`\|\| (pool->head>pool->tail&&pool->head-pool->tail<size+4))`
`125`	`137`	`{`
`126`		`-pool->producerBlocked= true;`
	`138`	`+if (pool->lastPeakTime==0) {`
	`139`	`+pool->lastPeakTime=MtmGetSystemTime();`
	`140`	`+}`
	`141`	`+pool->producerBlocked= true;`
`127`	`142`	`SpinLockRelease(&pool->lock);`
`128`	`143`	`PGSemaphoreLock(&pool->overflow);`
`129`	`144`	`SpinLockAcquire(&pool->lock);`
`130`	`145`	`}else {`
`131`	`146`	`pool->pending+=1;`
	`147`	`+if (pool->lastPeakTime==0&&pool->active==pool->nWorkers&&pool->pending!=0) {`
	`148`	`+pool->lastPeakTime=MtmGetSystemTime();`
	`149`	`+}`
`132`	`150`	`*(int*)&pool->queue[pool->tail] = size;`
`133`	`151`	`if (pool->size-pool->tail >=size+4) {`
`134`	`152`	`memcpy(&pool->queue[pool->tail+4],work,size);`

`‎bgwpool.h‎`

Lines changed: 7 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,8 @@`
`7`	`7`
`8`	`8`	`typedef void(*BgwPoolExecutor)(int id, void* work, size_t size);`
`9`	`9`
	`10`	`+typedefuint64timestamp_t;`
	`11`	`+`
`10`	`12`	`#defineMAX_DBNAME_LEN 30`
`11`	`13`	`#defineMULTIMASTER_BGW_RESTART_TIMEOUT 1/* seconds */`
`12`	`14`
`@@ -21,6 +23,8 @@ typedef struct`
`21`	`23`	`size_tsize;`
`22`	`24`	`size_tactive;`
`23`	`25`	`size_tpending;`
	`26`	`+size_tnWorkers;`
	`27`	`+time_tlastPeakTime;`
`24`	`28`	`boolproducerBlocked;`
`25`	`29`	`chardbname[MAX_DBNAME_LEN];`
`26`	`30`	`char*queue;`
`@@ -30,10 +34,12 @@ typedef BgwPool(BgwPoolConstructor)(void);`
`30`	`34`
`31`	`35`	`externvoidBgwPoolStart(intnWorkers,BgwPoolConstructorconstructor);`
`32`	`36`
`33`		`-extern void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize);`
	`37`	`+extern void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize, size_t nWorkers);`
`34`	`38`
`35`	`39`	`extern void BgwPoolExecute(BgwPool* pool, void* work, size_t size);`
`36`	`40`
`37`	`41`	`externsize_tBgwPoolGetQueueSize(BgwPool*pool);`
`38`	`42`
	`43`	`+externtimestamp_tBgwGetLastPeekTime(BgwPool*pool);`
	`44`	`+`
`39`	`45`	`#endif`

`‎multimaster.c‎`

Lines changed: 33 additions & 16 deletions

Original file line number	Diff line number	Diff line change
`@@ -255,13 +255,18 @@ void MtmUnlockNode(int nodeId)`
`255`	`255`	`*/`
`256`	`256`
`257`	`257`
`258`		`-timestamp_tMtmGetCurrentTime(void)`
	`258`	`+timestamp_tMtmGetSystemTime(void)`
`259`	`259`	`{`
`260`	`260`	`structtimevaltv;`
`261`	`261`	`gettimeofday(&tv,NULL);`
`262`	`262`	`return (timestamp_t)tv.tv_sec*USEC+tv.tv_usec+Mtm->timeShift;`
`263`	`263`	`}`
`264`	`264`
	`265`	`+timestamp_tMtmGetCurrentTime(void)`
	`266`	`+{`
	`267`	`+returnMtmGetSystemTime()+Mtm->timeShift;`
	`268`	`+}`
	`269`	`+`
`265`	`270`	`voidMtmSleep(timestamp_tinterval)`
`266`	`271`	`{`
`267`	`272`	`structtimespects;`
`@@ -1045,7 +1050,7 @@ void MtmRecoveryCompleted(void)`
`1045`	`1050`	`MtmLock(LW_EXCLUSIVE);`
`1046`	`1051`	`Mtm->recoverySlot=0;`
`1047`	`1052`	`BIT_CLEAR(Mtm->disabledNodeMask,MtmNodeId-1);`
`1048`		`-Mtm->nodes[MtmNodeId-1].lastStatusChangeTime=time(NULL);`
	`1053`	`+Mtm->nodes[MtmNodeId-1].lastStatusChangeTime=MtmGetSystemTime();`
`1049`	`1054`	`/* Mode will be changed to online once all logical receivers are connected */`
`1050`	`1055`	`MtmSwitchClusterMode(MTM_CONNECTED);`
`1051`	`1056`	`MtmUnlock();`
`@@ -1134,7 +1139,7 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)`
`1134`	`1139`	`/* We are lucky: caught up without locking the cluster! */`
`1135`	`1140`	`}`
`1136`	`1141`	`BIT_CLEAR(Mtm->disabledNodeMask,nodeId-1);`
`1137`		`-Mtm->nodes[nodeId-1].lastStatusChangeTime=time(NULL);`
	`1142`	`+Mtm->nodes[nodeId-1].lastStatusChangeTime=MtmGetSystemTime();`
`1138`	`1143`	`Mtm->nNodes+=1;`
`1139`	`1144`	`caughtUp= true;`
`1140`	`1145`	`}elseif (!BIT_CHECK(Mtm->nodeLockerMask,nodeId-1)`
`@@ -1279,15 +1284,15 @@ bool MtmRefreshClusterStatus(bool nowait)`
`1279`	`1284`	`if (mask&1) {`
`1280`	`1285`	`Mtm->nNodes-=1;`
`1281`	`1286`	`BIT_SET(Mtm->disabledNodeMask,i);`
`1282`		`-Mtm->nodes[i].lastStatusChangeTime=time(NULL);`
	`1287`	`+Mtm->nodes[i].lastStatusChangeTime=MtmGetSystemTime();`
`1283`	`1288`	`}`
`1284`	`1289`	`}`
`1285`	`1290`	`mask=clique&Mtm->disabledNodeMask;/* new enabled nodes mask */`
`1286`	`1291`	`for (i=0;mask!=0;i++,mask >>=1) {`
`1287`	`1292`	`if (mask&1) {`
`1288`	`1293`	`Mtm->nNodes+=1;`
`1289`	`1294`	`BIT_CLEAR(Mtm->disabledNodeMask,i);`
`1290`		`-Mtm->nodes[i].lastStatusChangeTime=time(NULL);`
	`1295`	`+Mtm->nodes[i].lastStatusChangeTime=MtmGetSystemTime();`
`1291`	`1296`	`}`
`1292`	`1297`	`}`
`1293`	`1298`	`MtmCheckQuorum();`
`@@ -1327,7 +1332,7 @@ void MtmOnNodeDisconnect(int nodeId)`
`1327`	`1332`	`{`
`1328`	`1333`	`MtmTransState*ts;`
`1329`	`1334`
`1330`		`-if (Mtm->nodes[nodeId-1].lastStatusChangeTime+MtmNodeDisableDelay>time(NULL)) {`
	`1335`	`+if (Mtm->nodes[nodeId-1].lastStatusChangeTime+MSEC_TO_USEC(MtmNodeDisableDelay)>MtmGetSystemTime()) {`
`1331`	`1336`	`/* Avoid false detection of node failure and prevent node status blinking */`
`1332`	`1337`	`return;`
`1333`	`1338`	`}`
`@@ -1342,7 +1347,7 @@ void MtmOnNodeDisconnect(int nodeId)`
`1342`	`1347`	`if (!MtmRefreshClusterStatus(false)) {`
`1343`	`1348`	`MtmLock(LW_EXCLUSIVE);`
`1344`	`1349`	`if (!BIT_CHECK(Mtm->disabledNodeMask,nodeId-1)) {`
`1345`		`-Mtm->nodes[nodeId-1].lastStatusChangeTime=time(NULL);`
	`1350`	`+Mtm->nodes[nodeId-1].lastStatusChangeTime=MtmGetSystemTime();`
`1346`	`1351`	`BIT_SET(Mtm->disabledNodeMask,nodeId-1);`
`1347`	`1352`	`Mtm->nNodes-=1;`
`1348`	`1353`	`MtmCheckQuorum();`
`@@ -1510,14 +1515,14 @@ static void MtmInitialize()`
`1510`	`1515`	`for (i=0;i<MtmNodes;i++) {`
`1511`	`1516`	`Mtm->nodes[i].oldestSnapshot=0;`
`1512`	`1517`	`Mtm->nodes[i].transDelay=0;`
`1513`		`-Mtm->nodes[i].lastStatusChangeTime=time(NULL);`
	`1518`	`+Mtm->nodes[i].lastStatusChangeTime=MtmGetSystemTime();`
`1514`	`1519`	`Mtm->nodes[i].con=MtmConnections[i];`
`1515`	`1520`	`Mtm->nodes[i].flushPos=0;`
`1516`	`1521`	`}`
`1517`	`1522`	`PGSemaphoreCreate(&Mtm->votingSemaphore);`
`1518`	`1523`	`PGSemaphoreReset(&Mtm->votingSemaphore);`
`1519`	`1524`	`SpinLockInit(&Mtm->spinlock);`
`1520`		`-BgwPoolInit(&Mtm->pool,MtmExecutor,MtmDatabaseName,MtmQueueSize);`
	`1525`	`+BgwPoolInit(&Mtm->pool,MtmExecutor,MtmDatabaseName,MtmQueueSize,MtmWorkers);`
`1521`	`1526`	`RegisterXactCallback(MtmXactCallback,NULL);`
`1522`	`1527`	`MtmTx.snapshot=INVALID_CSN;`
`1523`	`1528`	`MtmTx.xid=InvalidTransactionId;`
`@@ -1681,10 +1686,10 @@ _PG_init(void)`
`1681`	`1686`
`1682`	`1687`	`DefineCustomIntVariable(`
`1683`	`1688`	`"multimaster.node_disable_delay",`
`1684`		`-"Minamal amount of time (sec) between node status change",`
	`1689`	`+"Minamal amount of time (msec) between node status change",`
`1685`	`1690`	`"This delay is used to avoid false detection of node failure and to prevent blinking of node status node",`
`1686`	`1691`	`&MtmNodeDisableDelay,`
`1687`		`-1,`
	`1692`	`+1000,`
`1688`	`1693`	`1,`
`1689`	`1694`	`INT_MAX,`
`1690`	`1695`	`PGC_BACKEND,`
`@@ -2032,7 +2037,7 @@ void MtmDropNode(int nodeId, bool dropSlot)`
`2032`	`2037`	`{`
`2033`	`2038`	`elog(ERROR,"NodeID %d is out of range [1,%d]",nodeId,Mtm->nNodes);`
`2034`	`2039`	`}`
`2035`		`-Mtm->nodes[nodeId-1].lastStatusChangeTime=time(NULL);`
	`2040`	`+Mtm->nodes[nodeId-1].lastStatusChangeTime=MtmGetSystemTime();`
`2036`	`2041`	`BIT_SET(Mtm->disabledNodeMask,nodeId-1);`
`2037`	`2042`	`Mtm->nNodes-=1;`
`2038`	`2043`	`MtmCheckQuorum();`
`@@ -2083,15 +2088,15 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)`
`2083`	`2088`	`if (MtmIsRecoverySession) {`
`2084`	`2089`	`MTM_LOG1("%d: Node %d start recovery of node %d",MyProcPid,MtmNodeId,MtmReplicationNodeId);`
`2085`	`2090`	`if (!BIT_CHECK(Mtm->disabledNodeMask,MtmReplicationNodeId-1)) {`
`2086`		`-Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime=time(NULL);`
	`2091`	`+Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime=MtmGetSystemTime();`
`2087`	`2092`	`BIT_SET(Mtm->disabledNodeMask,MtmReplicationNodeId-1);`
`2088`	`2093`	`Mtm->nNodes-=1;`
`2089`	`2094`	`MtmCheckQuorum();`
`2090`	`2095`	`}`
`2091`	`2096`	`}elseif (BIT_CHECK(Mtm->disabledNodeMask,MtmReplicationNodeId-1)) {`
`2092`	`2097`	`if (recoveryCompleted) {`
`2093`	`2098`	`MTM_LOG1("Node %d consider that recovery of node %d is completed: start normal replication",MtmNodeId,MtmReplicationNodeId);`
`2094`		`-Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime=time(NULL);`
	`2099`	`+Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime=MtmGetSystemTime();`
`2095`	`2100`	`BIT_CLEAR(Mtm->disabledNodeMask,MtmReplicationNodeId-1);`
`2096`	`2101`	`Mtm->nNodes+=1;`
`2097`	`2102`	`MtmCheckQuorum();`
`@@ -2238,7 +2243,7 @@ mtm_poll_node(PG_FUNCTION_ARGS)`
`2238`	`2243`	`}`
`2239`	`2244`	`if (!nowait) {`
`2240`	`2245`	`/* Just wait some time until logical repication channels will be reestablished */`
`2241`		`-MtmSleep(MtmNodeDisableDelay);`
	`2246`	`+MtmSleep(MSEC_TO_USEC(MtmNodeDisableDelay));`
`2242`	`2247`	`}`
`2243`	`2248`	`PG_RETURN_BOOL(online);`
`2244`	`2249`	`}`
`@@ -2297,7 +2302,7 @@ mtm_get_nodes_state(PG_FUNCTION_ARGS)`
`2297`	`2302`	`usrfctx->values[4]=Int64GetDatum(lag);`
`2298`	`2303`	`usrfctx->nulls[4]=lag<0;`
`2299`	`2304`	`usrfctx->values[5]=Int64GetDatum(Mtm->transCount ?Mtm->nodes[usrfctx->nodeId-1].transDelay/Mtm->transCount :0);`
`2300`		`-usrfctx->values[6]=TimestampTzGetDatum(time_t_to_timestamptz(Mtm->nodes[usrfctx->nodeId-1].lastStatusChangeTime));`
	`2305`	`+usrfctx->values[6]=TimestampTzGetDatum(time_t_to_timestamptz(Mtm->nodes[usrfctx->nodeId-1].lastStatusChangeTime/USEC));`
`2301`	`2306`	`usrfctx->values[7]=CStringGetTextDatum(Mtm->nodes[usrfctx->nodeId-1].con.connStr);`
`2302`	`2307`	`usrfctx->nodeId+=1;`
`2303`	`2308`
`@@ -3058,6 +3063,18 @@ MtmDetectGlobalDeadLock(PGPROC* proc)`
`3058`	`3063`	`MtmGetGtid(pgxact->xid,&gtid);`
`3059`	`3064`	`hasDeadlock=MtmGraphFindLoop(&graph,&gtid);`
`3060`	`3065`	`elog(WARNING,"Distributed deadlock check for %u:%u = %d",gtid.node,gtid.xid,hasDeadlock);`
	`3066`	`+if (!hasDeadlock) {`
	`3067`	`+/* There is no deadlock loop in the graph, but a deadlock can be caused by a lack of apply workers: if all of them are busy, then some transactions`
	`3068`	`+ * cannot be applied just because there are no vacant workers, and this causes an additional dependency between transactions which is not`
	`3069`	`+ * reflected in the lock graph`
	`3070`	`+ */`
	`3071`	`+timestamp_tlastPeekTime=BgwGetLastPeekTime(&Mtm->pool);`
	`3072`	`+if (lastPeekTime!=0&&MtmGetSystemTime()-lastPeekTime >=MSEC_TO_USEC(DeadlockTimeout)) {`
	`3073`	`+hasDeadlock= true;`
	`3074`	`+elog(WARNING,"Apply workers were blocked more than %d msec",`
	`3075`	`+ (int)USEC_TO_MSEC(MtmGetSystemTime()-lastPeekTime));`
	`3076`	`+}`
	`3077`	`+}`
`3061`	`3078`	`}`
`3062`	`3079`	`returnhasDeadlock;`
`3063`	`3080`	`}`

`‎multimaster.h‎`

Lines changed: 9 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,9 @@`
`48`	`48`
`49`	`49`	`#defineUSEC 1000000`
`50`	`50`
	`51`	`+#defineUSEC_TO_MSEC(t) ((t)/1000)`
	`52`	`+#defineMSEC_TO_USEC(t) ((t)*1000)`
	`53`	`+`
`51`	`54`	`#defineNatts_mtm_ddl_log 2`
`52`	`55`	`#defineAnum_mtm_ddl_log_issued1`
`53`	`56`	`#defineAnum_mtm_ddl_log_query2`
`@@ -72,8 +75,6 @@ typedef uint64 csn_t; /* commit serial number */`
`72`	`75`	`#definePGLOGICAL_CAUGHT_UP 0x04`
`73`	`76`
`74`	`77`
`75`		`-typedefuint64timestamp_t;`
`76`		`-`
`77`	`78`	`/* Identifier of global transaction */`
`78`	`79`	`typedefstruct`
`79`	`80`	`{`
`@@ -122,9 +123,9 @@ typedef struct`
`122`	`123`	`typedefstruct`
`123`	`124`	`{`
`124`	`125`	`MtmConnectionInfocon;`
`125`		`-time_ttransDelay;`
`126`		`-time_tlastStatusChangeTime;`
`127`		`-XLogRecPtrflushPos;`
	`126`	`+timestamp_ttransDelay;`
	`127`	`+timestamp_tlastStatusChangeTime;`
	`128`	`+XLogRecPtrflushPos;`
`128`	`129`	`csn_toldestSnapshot;/* Oldest snapshot used by active transactions at this node */`
`129`	`130`	`}MtmNodeInfo;`
`130`	`131`
`@@ -232,8 +233,9 @@ extern void MtmRecoverNode(int nodeId);`
`232`	`233`	`externvoidMtmOnNodeDisconnect(intnodeId);`
`233`	`234`	`externvoidMtmOnNodeConnect(intnodeId);`
`234`	`235`	`externvoidMtmWakeUpBackend(MtmTransState*ts);`
`235`		`-externtimestamp_tMtmGetCurrentTime(void);`
`236`		`-externvoidMtmSleep(timestamp_tinterval);`
	`236`	`+externtimestamp_tMtmGetSystemTime(void);/* non-adjusted current system time */`
	`237`	`+externtimestamp_tMtmGetCurrentTime(void);/* adjusted current system time */`
	`238`	`+externvoidMtmSleep(timestamp_tinterval);`
`237`	`239`	`externvoidMtmAbortTransaction(MtmTransState*ts);`
`238`	`240`	`externvoidMtmSetCurrentTransactionGID(charconst*gid);`
`239`	`241`	`externcsn_tMtmGetTransactionCSN(TransactionIdxid);`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit97c667c

File tree

4 files changed

4 files changed

`‎bgwpool.c‎`

`‎bgwpool.h‎`

`‎multimaster.c‎`

`‎multimaster.h‎`

0 commit comments