Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit4760ea1

Browse files
committed
Add status change time
1 parent406ab79 commit4760ea1

File tree

4 files changed

+40
-8
lines changed

4 files changed

+40
-8
lines changed

‎contrib/mmts/multimaster--1.0.sql‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ AS 'MODULE_PATHNAME','mtm_get_snapshot'
2424
LANGUAGE C;
2525

2626

27-
CREATETYPEmtm.node_stateAS (idinteger, disabled bool, disconnected bool, catchUp bool, slotLagbigint, avgTransDelaybigint, connStrtext);
27+
CREATETYPEmtm.node_stateAS (idinteger, disabled bool, disconnected bool, catchUp bool, slotLagbigint, avgTransDelaybigint,lastStatusChangetimestamp,connStrtext);
2828

2929
CREATEFUNCTIONmtm.get_nodes_state() RETURNS SETOFmtm.node_state
3030
AS'MODULE_PATHNAME','mtm_get_nodes_state'

‎contrib/mmts/multimaster.c‎

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ int MtmConnectAttempts;
179179
intMtmConnectTimeout;
180180
intMtmKeepaliveTimeout;
181181
intMtmReconnectAttempts;
182+
intMtmNodeDisableDelay;
182183
boolMtmUseRaftable;
183184
MtmConnectionInfo*MtmConnections;
184185

@@ -993,6 +994,7 @@ void MtmRecoveryCompleted(void)
993994
MtmLock(LW_EXCLUSIVE);
994995
Mtm->recoverySlot=0;
995996
BIT_CLEAR(Mtm->disabledNodeMask,MtmNodeId-1);
997+
Mtm->nodes[MtmNodeId-1].lastStatusChangeTime=time(NULL);
996998
/* Mode will be changed to online once all locagical reciever are connected */
997999
MtmSwitchClusterMode(MTM_CONNECTED);
9981000
MtmUnlock();
@@ -1081,6 +1083,7 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
10811083
/* We are lucky: caugth-up without locking cluster! */
10821084
}
10831085
BIT_CLEAR(Mtm->disabledNodeMask,nodeId-1);
1086+
Mtm->nodes[nodeId-1].lastStatusChangeTime=time(NULL);
10841087
Mtm->nNodes+=1;
10851088
caughtUp= true;
10861089
}elseif (!BIT_CHECK(Mtm->nodeLockerMask,nodeId-1)
@@ -1223,13 +1226,15 @@ bool MtmRefreshClusterStatus(bool nowait)
12231226
if (mask&1) {
12241227
Mtm->nNodes-=1;
12251228
BIT_SET(Mtm->disabledNodeMask,i);
1229+
Mtm->nodes[i].lastStatusChangeTime=time(NULL);
12261230
}
12271231
}
12281232
mask=clique&Mtm->disabledNodeMask;/* new enabled nodes mask */
12291233
for (i=0;mask!=0;i++,mask >>=1) {
12301234
if (mask&1) {
12311235
Mtm->nNodes+=1;
12321236
BIT_CLEAR(Mtm->disabledNodeMask,i);
1237+
Mtm->nodes[i].lastStatusChangeTime=time(NULL);
12331238
}
12341239
}
12351240
MtmCheckQuorum();
@@ -1269,6 +1274,11 @@ void MtmOnNodeDisconnect(int nodeId)
12691274
{
12701275
MtmTransState*ts;
12711276

1277+
if (Mtm->nodes[nodeId-1].lastStatusChangeTime+MtmNodeDisableDelay>time(NULL)) {
1278+
/* Avoid false detection of node failure and prevent node status blinking */
1279+
return;
1280+
}
1281+
12721282
BIT_SET(Mtm->connectivityMask,nodeId-1);
12731283
BIT_SET(Mtm->reconnectMask,nodeId-1);
12741284
RaftableSet(psprintf("node-mask-%d",MtmNodeId),&Mtm->connectivityMask,sizeofMtm->connectivityMask, false);
@@ -1279,6 +1289,7 @@ void MtmOnNodeDisconnect(int nodeId)
12791289
if (!MtmRefreshClusterStatus(false)) {
12801290
MtmLock(LW_EXCLUSIVE);
12811291
if (!BIT_CHECK(Mtm->disabledNodeMask,nodeId-1)) {
1292+
Mtm->nodes[nodeId-1].lastStatusChangeTime=time(NULL);
12821293
BIT_SET(Mtm->disabledNodeMask,nodeId-1);
12831294
Mtm->nNodes-=1;
12841295
MtmCheckQuorum();
@@ -1446,6 +1457,7 @@ static void MtmInitialize()
14461457
for (i=0;i<MtmNodes;i++) {
14471458
Mtm->nodes[i].oldestSnapshot=0;
14481459
Mtm->nodes[i].transDelay=0;
1460+
Mtm->nodes[i].lastStatusChangeTime=time(NULL);
14491461
Mtm->nodes[i].con=MtmConnections[i];
14501462
}
14511463
PGSemaphoreCreate(&Mtm->votingSemaphore);
@@ -1566,10 +1578,25 @@ _PG_init(void)
15661578
if (!process_shared_preload_libraries_in_progress)
15671579
return;
15681580

1581+
DefineCustomIntVariable(
1582+
"multimaster.node_disable_delay",
1583+
"Minamal amount of time (sec) between node status change",
1584+
"This delay is used to avoid false detection of node failure and to prevent blinking of node status node",
1585+
&MtmNodeDisableDelay,
1586+
1,
1587+
1,
1588+
INT_MAX,
1589+
PGC_BACKEND,
1590+
0,
1591+
NULL,
1592+
NULL,
1593+
NULL
1594+
);
1595+
15691596
DefineCustomIntVariable(
15701597
"multimaster.min_recovery_lag",
15711598
"Minamal lag of WAL-sender performing recovery after which cluster is locked until recovery is completed",
1572-
"When wal-sender almost catch-up WAL current position we need to stop 'Achilles tortilecompeition' and "
1599+
"When wal-sender almost catch-up WAL current position we need to stop 'Achilles tortilecompetition' and "
15731600
"temporary stop commit of new transactions until node will be completely repared",
15741601
&MtmMinRecoveryLag,
15751602
100000,
@@ -1891,6 +1918,7 @@ void MtmDropNode(int nodeId, bool dropSlot)
18911918
{
18921919
elog(ERROR,"NodeID %d is out of range [1,%d]",nodeId,Mtm->nNodes);
18931920
}
1921+
Mtm->nodes[nodeId-1].lastStatusChangeTime=time(NULL);
18941922
BIT_SET(Mtm->disabledNodeMask,nodeId-1);
18951923
Mtm->nNodes-=1;
18961924
MtmCheckQuorum();
@@ -1941,13 +1969,15 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
19411969
if (MtmIsRecoverySession) {
19421970
MTM_LOG1("%d: Node %d start recovery of node %d",MyProcPid,MtmNodeId,MtmReplicationNodeId);
19431971
if (!BIT_CHECK(Mtm->disabledNodeMask,MtmReplicationNodeId-1)) {
1972+
Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime=time(NULL);
19441973
BIT_SET(Mtm->disabledNodeMask,MtmReplicationNodeId-1);
19451974
Mtm->nNodes-=1;
19461975
MtmCheckQuorum();
19471976
}
19481977
}elseif (BIT_CHECK(Mtm->disabledNodeMask,MtmReplicationNodeId-1)) {
19491978
if (recoveryCompleted) {
19501979
MTM_LOG1("Node %d consider that recovery of node %d is completed: start normal replication",MtmNodeId,MtmReplicationNodeId);
1980+
Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime=time(NULL);
19511981
BIT_CLEAR(Mtm->disabledNodeMask,MtmReplicationNodeId-1);
19521982
Mtm->nNodes+=1;
19531983
MtmCheckQuorum();
@@ -2058,8 +2088,8 @@ typedef struct
20582088
intnodeId;
20592089
char*connStrPtr;
20602090
TupleDescdesc;
2061-
Datumvalues[7];
2062-
boolnulls[7];
2091+
Datumvalues[8];
2092+
boolnulls[8];
20632093
}MtmGetNodeStateCtx;
20642094

20652095
Datum
@@ -2096,11 +2126,12 @@ mtm_get_nodes_state(PG_FUNCTION_ARGS)
20962126
usrfctx->values[4]=Int64GetDatum(lag);
20972127
usrfctx->nulls[4]=lag<0;
20982128
usrfctx->values[5]=Int64GetDatum(Mtm->transCount ?Mtm->nodes[usrfctx->nodeId-1].transDelay/Mtm->transCount :0);
2129+
usrfctx->values[6]=TimestampTzGetDatum(time_t_to_timestamptz(Mtm->nodes[usrfctx->nodeId-1].lastStatusChangeTime));
20992130
p=strchr(usrfctx->connStrPtr,',');
21002131
if (p!=NULL) {
21012132
*p++='\0';
21022133
}
2103-
usrfctx->values[6]=CStringGetTextDatum(usrfctx->connStrPtr);
2134+
usrfctx->values[7]=CStringGetTextDatum(usrfctx->connStrPtr);
21042135
usrfctx->connStrPtr=p;
21052136
usrfctx->nodeId+=1;
21062137

‎contrib/mmts/multimaster.h‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ typedef struct
119119
{
120120
MtmConnectionInfocon;
121121
time_ttransDelay;
122+
time_tlastStatusChangeTime;
122123
csn_toldestSnapshot;/* Oldest snapshot used by active transactions at this node */
123124
}MtmNodeInfo;
124125

@@ -170,7 +171,6 @@ typedef struct
170171
MtmTransState**transListTail;/* Tail of L1 list of all finished transactionds, used to append new elements.
171172
This list is expected to be in CSN ascending order, by strict order may be violated */
172173
uint64transCount;/* Counter of transactions perfromed by this node */
173-
time_tnodeTransDelay[MAX_NODES];/* Time of waiting transaction acknowledgment from node */
174174
BgwPoolpool;/* Pool of background workers for applying logical replication patches */
175175
MtmNodeInfonodes[1];/* [MtmNodes]: per-node data */
176176
}MtmState;
@@ -190,6 +190,7 @@ extern int MtmConnectAttempts;
190190
externintMtmConnectTimeout;
191191
externintMtmReconnectAttempts;
192192
externintMtmKeepaliveTimeout;
193+
externintMtmNodeDisableDelay;
193194
externHTAB*MtmXid2State;
194195

195196
externMtmConnectionInfo*MtmConnections;

‎contrib/mmts/t/001_basic_recovery.pl‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ sub PostgresNode::inet_connstr {
115115

116116
diag("starting node 2");
117117
$nodes[2]->start;
118-
diag("sleeping30");
119-
sleep(30);# XXX: here we can poll
118+
diag("sleeping10");
119+
sleep(10);# XXX: here we can poll
120120
diag("inserting 3");
121121
$nodes[0]->psql('postgres',"insert into t values(3, 30);");
122122
diag("selecting");

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp