Commit 44ca402

Partition the freelist for shared dynahash tables.
Without this, contention on the freelist can become a pretty serious
problem on large servers.

Aleksander Alekseev, reviewed by Anastasia Lubennikova, Dilip Kumar,
and me.

1 parent ea4b8bd · commit 44ca402
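Before the diff itself, a minimal standalone sketch may help show the idea the commit message describes: replace one globally locked freelist with an array of independently locked freelists, selected by hash value, so that concurrent backends usually contend on different locks. This is not the dynahash code below; pthread_mutex_t stands in for PostgreSQL's slock_t, and Element, freelists_init, get_element and put_element are hypothetical names used only for illustration.

/* Sketch only: a partitioned freelist in plain C with POSIX threads. */
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

#define NUM_FREELISTS 32

typedef struct Element
{
    struct Element *link;        /* next free element */
    /* key and payload would follow in a real hash table */
} Element;

typedef struct
{
    pthread_mutex_t mutex;       /* guards nentries and freeList */
    long            nentries;    /* live entries charged to this partition */
    Element        *freeList;    /* spare elements owned by this partition */
} FreeList;

static FreeList freelists[NUM_FREELISTS];

/* Initialize every partition's lock, as init_htab does in the patch. */
static void
freelists_init(void)
{
    int i;

    for (i = 0; i < NUM_FREELISTS; i++)
        pthread_mutex_init(&freelists[i].mutex, NULL);
}

/* Map a hash value to a partition, like the patch's FREELIST_IDX macro. */
static int
freelist_idx(uint32_t hashvalue)
{
    return (int) (hashvalue % NUM_FREELISTS);
}

/* Pop a spare element from the partition owning this hash value. */
static Element *
get_element(uint32_t hashvalue)
{
    FreeList *fl = &freelists[freelist_idx(hashvalue)];
    Element  *e;

    pthread_mutex_lock(&fl->mutex);
    e = fl->freeList;
    if (e != NULL)
    {
        fl->freeList = e->link;
        fl->nentries++;
    }
    pthread_mutex_unlock(&fl->mutex);
    return e;                    /* NULL: this partition has no spares */
}

/* Push an element back onto the partition owning this hash value. */
static void
put_element(uint32_t hashvalue, Element *e)
{
    FreeList *fl = &freelists[freelist_idx(hashvalue)];

    pthread_mutex_lock(&fl->mutex);
    e->link = fl->freeList;
    fl->freeList = e;
    fl->nentries--;
    pthread_mutex_unlock(&fl->mutex);
}

The patch does the same thing inside HASHHDR, and additionally lets get_hash_entry() borrow from a neighboring partition when its own freelist runs dry.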

File tree

1 file changed: +137 −47 lines changed


src/backend/utils/hash/dynahash.c

Lines changed: 137 additions & 47 deletions
@@ -15,7 +15,7 @@
  * to hash_create. This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition. We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -111,13 +111,27 @@
 #define DEF_DIRSIZE            256
 #define DEF_FFACTOR            1    /* default fill factor */
 
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS          32
 
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
 
 /* A hash segment is an array of bucket headers */
 typedef HASHBUCKET *HASHSEGMENT;
 
+/*
+ * Using array of FreeListData instead of separate arrays of mutexes, nentries
+ * and freeLists prevents, at least partially, sharing one cache line between
+ * different mutexes (see below).
+ */
+typedef struct
+{
+    slock_t     mutex;          /* spinlock */
+    long        nentries;       /* number of entries */
+    HASHELEMENT *freeList;      /* list of free elements */
+} FreeListData;
+
 /*
  * Header structure for a hash table --- contains all changeable info
  *
@@ -128,12 +142,15 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-    /* In a partitioned table, take this lock to touch nentries or freeList */
-    slock_t     mutex;          /* unused if not partitioned table */
-
-    /* These fields change during entry addition/deletion */
-    long        nentries;       /* number of entries in hash table */
-    HASHELEMENT *freeList;      /* linked list of free elements */
+    /*
+     * The freelist can become a point of contention on high-concurrency hash
+     * tables, so we use an array of freelist, each with its own mutex and
+     * nentries count, instead of just a single one.
+     *
+     * If hash table is not partitioned only freeList[0] is used and spinlocks
+     * are not used at all.
+     */
+    FreeListData freeList[NUM_FREELISTS];
 
     /* These fields can change, but not in a partitioned table */
     /* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +183,9 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) \
+    (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +239,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -482,10 +502,40 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
     if ((flags & HASH_SHARED_MEM) ||
         nelem < hctl->nelem_alloc)
     {
-        if (!element_alloc(hashp, (int) nelem))
-            ereport(ERROR,
-                    (errcode(ERRCODE_OUT_OF_MEMORY),
-                     errmsg("out of memory")));
+        int         i,
+                    freelist_partitions,
+                    nelem_alloc,
+                    nelem_alloc_first;
+
+        /*
+         * If hash table is partitioned all freeLists have equal number of
+         * elements. Otherwise only freeList[0] is used.
+         */
+        if (IS_PARTITIONED(hashp->hctl))
+            freelist_partitions = NUM_FREELISTS;
+        else
+            freelist_partitions = 1;
+
+        nelem_alloc = nelem / freelist_partitions;
+        if (nelem_alloc == 0)
+            nelem_alloc = 1;
+
+        /* Make sure all memory will be used */
+        if (nelem_alloc * freelist_partitions < nelem)
+            nelem_alloc_first =
+                nelem - nelem_alloc * (freelist_partitions - 1);
+        else
+            nelem_alloc_first = nelem_alloc;
+
+        for (i = 0; i < freelist_partitions; i++)
+        {
+            int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+            if (!element_alloc(hashp, temp, i))
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+        }
     }
 
     if (flags & HASH_FIXED_SIZE)
@@ -503,9 +553,6 @@ hdefault(HTAB *hashp)
 
     MemSet(hctl, 0, sizeof(HASHHDR));
 
-    hctl->nentries = 0;
-    hctl->freeList = NULL;
-
     hctl->dsize = DEF_DIRSIZE;
     hctl->nsegs = 0;
 
@@ -572,12 +619,14 @@ init_htab(HTAB *hashp, long nelem)
     HASHSEGMENT *segp;
     int         nbuckets;
    int         nsegs;
+    int         i;
 
     /*
      * initialize mutex if it's a partitioned table
      */
     if (IS_PARTITIONED(hctl))
-        SpinLockInit(&hctl->mutex);
+        for (i = 0; i < NUM_FREELISTS; i++)
+            SpinLockInit(&(hctl->freeList[i].mutex));
 
     /*
      * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +697,7 @@ init_htab(HTAB *hashp, long nelem)
             "HIGH MASK       ", hctl->high_mask,
             "LOW  MASK       ", hctl->low_mask,
             "NSEGS           ", hctl->nsegs,
-            "NENTRIES        ", hctl->nentries);
+            "NENTRIES        ", hash_get_num_entries(hctl));
 #endif
     return true;
 }
@@ -769,7 +818,7 @@ hash_stats(const char *where, HTAB *hashp)
             where, hashp->hctl->accesses, hashp->hctl->collisions);
 
     fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-            hashp->hctl->nentries, (long) hashp->hctl->keysize,
+            hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
             hashp->hctl->max_bucket, hashp->hctl->nsegs);
     fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
             where, hash_accesses, hash_collisions);
@@ -863,6 +912,7 @@ hash_search_with_hash_value(HTAB *hashp,
     HASHBUCKET  currBucket;
     HASHBUCKET *prevBucketPtr;
     HashCompareFunc match;
+    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
     hash_accesses++;
@@ -885,7 +935,7 @@ hash_search_with_hash_value(HTAB *hashp,
      * order of these tests is to try to check cheaper conditions first.
      */
     if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-        hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+        hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
         !has_seq_scans(hashp))
         (void) expand_table(hashp);
 }
@@ -943,20 +993,20 @@ hash_search_with_hash_value(HTAB *hashp,
         {
             /* if partitioned, must lock to touch nentries and freeList */
             if (IS_PARTITIONED(hctl))
-                SpinLockAcquire(&hctl->mutex);
+                SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
-            Assert(hctl->nentries > 0);
-            hctl->nentries--;
+            Assert(hctl->freeList[freelist_idx].nentries > 0);
+            hctl->freeList[freelist_idx].nentries--;
 
             /* remove record from hash bucket's chain. */
             *prevBucketPtr = currBucket->link;
 
             /* add the record to the freelist for this table. */
-            currBucket->link = hctl->freeList;
-            hctl->freeList = currBucket;
+            currBucket->link = hctl->freeList[freelist_idx].freeList;
+            hctl->freeList[freelist_idx].freeList = currBucket;
 
             if (IS_PARTITIONED(hctl))
-                SpinLockRelease(&hctl->mutex);
+                SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
             /*
              * better hope the caller is synchronizing access to this
@@ -982,7 +1032,7 @@ hash_search_with_hash_value(HTAB *hashp,
             elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                  hashp->tabname);
 
-            currBucket = get_hash_entry(hashp);
+            currBucket = get_hash_entry(hashp, freelist_idx);
             if (currBucket == NULL)
             {
                 /* out of memory */
@@ -1175,39 +1225,69 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
-    HASHHDR    *hctl = hashp->hctl;
+    HASHHDR    *hctl = hashp->hctl;
     HASHBUCKET  newElement;
+    int         borrow_from_idx;
 
     for (;;)
     {
         /* if partitioned, must lock to touch nentries and freeList */
         if (IS_PARTITIONED(hctl))
-            SpinLockAcquire(&hctl->mutex);
+            SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
         /* try to get an entry from the freelist */
-        newElement = hctl->freeList;
+        newElement = hctl->freeList[freelist_idx].freeList;
+
         if (newElement != NULL)
             break;
 
-        /* no free elements.  allocate another chunk of buckets */
         if (IS_PARTITIONED(hctl))
-            SpinLockRelease(&hctl->mutex);
+            SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-        if (!element_alloc(hashp, hctl->nelem_alloc))
+        /* no free elements.  allocate another chunk of buckets */
+        if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
         {
-            /* out of memory */
-            return NULL;
+            if (!IS_PARTITIONED(hctl))
+                return NULL;    /* out of memory */
+
+            /* try to borrow element from another partition */
+            borrow_from_idx = freelist_idx;
+            for (;;)
+            {
+                borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+                if (borrow_from_idx == freelist_idx)
+                    break;
+
+                SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
+                newElement = hctl->freeList[borrow_from_idx].freeList;
+
+                if (newElement != NULL)
+                {
+                    hctl->freeList[borrow_from_idx].freeList = newElement->link;
+                    SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+
+                    SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+                    hctl->freeList[freelist_idx].nentries++;
+                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+                    break;
+                }
+
+                SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+            }
+
+            return newElement;
        }
    }
 
     /* remove entry from freelist, bump nentries */
-    hctl->freeList = newElement->link;
-    hctl->nentries++;
+    hctl->freeList[freelist_idx].freeList = newElement->link;
+    hctl->freeList[freelist_idx].nentries++;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return newElement;
 }
@@ -1218,11 +1298,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+    int         i;
+    long        sum = hashp->hctl->freeList[0].nentries;
+
     /*
      * We currently don't bother with the mutex; it's only sensible to call
      * this function if you've got lock on all partitions of the table.
      */
-    return hashp->hctl->nentries;
+
+    if (!IS_PARTITIONED(hashp->hctl))
+        return sum;
+
+    for (i = 1; i < NUM_FREELISTS; i++)
+        sum += hashp->hctl->freeList[i].nentries;
+
+    return sum;
 }
 
 /*
@@ -1527,12 +1617,12 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the free list
+ * allocate some new elements and link them into the indicated free list
 */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
-    HASHHDR    *hctl = hashp->hctl;
+    HASHHDR    *hctl = hashp->hctl;
     Size        elementSize;
     HASHELEMENT *firstElement;
     HASHELEMENT *tmpElement;
@@ -1563,14 +1653,14 @@ element_alloc(HTAB *hashp, int nelem)
 
     /* if partitioned, must lock to touch freeList */
     if (IS_PARTITIONED(hctl))
-        SpinLockAcquire(&hctl->mutex);
+        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
     /* freelist could be nonempty if two backends did this concurrently */
-    firstElement->link = hctl->freeList;
-    hctl->freeList = prevElement;
+    firstElement->link = hctl->freeList[freelist_idx].freeList;
+    hctl->freeList[freelist_idx].freeList = prevElement;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return true;
 }
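The allocation split in hash_create() above is easier to follow with concrete numbers. The sketch below repeats the patch's arithmetic for a hypothetical request of nelem = 100 elements spread across the 32 freelists; the figures are illustrative, not taken from the commit.

/* Worked example of the hash_create() element split (hypothetical nelem). */
#include <stdio.h>

#define NUM_FREELISTS 32

int
main(void)
{
    long nelem = 100;                   /* hypothetical request */
    int  freelist_partitions = NUM_FREELISTS;
    long nelem_alloc = nelem / freelist_partitions;    /* 100 / 32 = 3 */
    long nelem_alloc_first;

    if (nelem_alloc == 0)
        nelem_alloc = 1;

    /* 3 * 32 = 96 < 100, so the first freelist absorbs the remainder */
    if (nelem_alloc * freelist_partitions < nelem)
        nelem_alloc_first = nelem - nelem_alloc * (freelist_partitions - 1);
    else
        nelem_alloc_first = nelem_alloc;

    /* prints: first=7 others=3 total=100 */
    printf("first=%ld others=%ld total=%ld\n",
           nelem_alloc_first, nelem_alloc,
           nelem_alloc_first + nelem_alloc * (freelist_partitions - 1));
    return 0;
}

With these numbers, freeList[0] starts with 7 elements and the other 31 freelists with 3 each, accounting for all 100 requested elements; at runtime the FREELIST_IDX macro then steers each hash value to one of those partitions.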

