Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit ab2324f

Browse files
committed
Improve comments about partitioned hash table freelists.
While I couldn't find any live bugs in commit 44ca402, the comments seemed pretty far from adequate; in particular it was not made plain that "borrowing" entries from other freelists is critical for correctness. Try to improve the commentary. A couple of very minor code style tweaks, as well. Discussion: https://postgr.es/m/10593.1500670709@sss.pgh.pa.us
1 parent 991c8b0, commit ab2324f

File tree

1 file changed

+70
-35
lines changed

1 file changed

+70
-35
lines changed

‎src/backend/utils/hash/dynahash.c

Lines changed: 70 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
* to hash_create. This prevents any attempt to split buckets on-the-fly.
1616
* Therefore, each hash bucket chain operates independently, and no fields
1717
* of the hash header change after init except nentries and freeList.
18-
* A partitioned table uses spinlocks to guard changes of those fields.
18+
* (A partitioned table uses multiple copies of those fields, guarded by
19+
* spinlocks, for additional concurrency.)
1920
* This lets any subset of the hash buckets be treated as a separately
2021
* lockable partition. We expect callers to use the low-order bits of a
2122
* lookup key's hash value as a partition number --- this will work because
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
121122
typedefHASHBUCKET*HASHSEGMENT;
122123

123124
/*
124-
* Using array of FreeListData instead of separate arrays of mutexes, nentries
125-
* and freeLists prevents, at least partially, sharing one cache line between
126-
* different mutexes (see below).
125+
* Per-freelist data.
126+
*
127+
* In a partitioned hash table, each freelist is associated with a specific
128+
* set of hashcodes, as determined by the FREELIST_IDX() macro below.
129+
* nentries tracks the number of live hashtable entries having those hashcodes
130+
* (NOT the number of entries in the freelist, as you might expect).
131+
*
132+
* The coverage of a freelist might be more or less than one partition, so it
133+
* needs its own lock rather than relying on caller locking. Relying on that
134+
* wouldn't work even if the coverage was the same, because of the occasional
135+
* need to "borrow" entries from another freelist; see get_hash_entry().
136+
*
137+
* Using an array of FreeListData instead of separate arrays of mutexes,
138+
* nentries and freeLists helps to reduce sharing of cache lines between
139+
* different mutexes.
127140
*/
128141
typedefstruct
129142
{
130-
slock_tmutex;/* spinlock */
131-
longnentries;/* number of entries */
132-
HASHELEMENT*freeList;/*list of free elements */
143+
slock_tmutex;/* spinlockfor this freelist*/
144+
longnentries;/* number of entriesin associated buckets*/
145+
HASHELEMENT*freeList;/*chain of free elements */
133146
}FreeListData;
134147

135148
/*
@@ -143,12 +156,14 @@ typedef struct
143156
structHASHHDR
144157
{
145158
/*
146-
* The freelist can become a point of contention on high-concurrency hash
147-
* tables, so we use an array of freelist, each with its own mutex and
148-
* nentries count, instead of just a single one.
159+
* The freelist can become a point of contention in high-concurrency hash
160+
* tables, so we use an array of freelists, each with its own mutex and
161+
* nentries count, instead of just a single one. Although the freelists
162+
* normally operate independently, we will scavenge entries from freelists
163+
* other than a hashcode's default freelist when necessary.
149164
*
150-
* If hash table is not partitioned only freeList[0] is used andspinlocks
151-
*arenot used at all.
165+
* Ifthehash table is not partitioned, only freeList[0] is used andits
166+
*spinlock isnot used at all; callers' locking is assumed sufficient.
152167
*/
153168
FreeListDatafreeList[NUM_FREELISTS];
154169

@@ -184,7 +199,7 @@ struct HASHHDR
184199
#defineIS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
185200

186201
#defineFREELIST_IDX(hctl,hashcode) \
187-
(IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
202+
(IS_PARTITIONED(hctl) ?(hashcode) % NUM_FREELISTS : 0)
188203

189204
/*
190205
* Top control structure for a hashtable --- in a shared table, each backend
@@ -506,19 +521,22 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
506521
nelem_alloc_first;
507522

508523
/*
509-
* If hash table is partitioned all freeLists haveequalnumber of
510-
*elements. Otherwise only freeList[0] is used.
524+
* If hash table is partitioned, give each freelist anequalshare of
525+
*the initial allocation. Otherwise only freeList[0] is used.
511526
*/
512527
if (IS_PARTITIONED(hashp->hctl))
513528
freelist_partitions=NUM_FREELISTS;
514529
else
515530
freelist_partitions=1;
516531

517532
nelem_alloc=nelem /freelist_partitions;
518-
if (nelem_alloc==0)
533+
if (nelem_alloc<=0)
519534
nelem_alloc=1;
520535

521-
/* Make sure all memory will be used */
536+
/*
537+
* Make sure we'll allocate all the requested elements; freeList[0]
538+
* gets the excess if the request isn't divisible by NUM_FREELISTS.
539+
*/
522540
if (nelem_alloc*freelist_partitions<nelem)
523541
nelem_alloc_first=
524542
nelem-nelem_alloc* (freelist_partitions-1);
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
620638
inti;
621639

622640
/*
623-
* initializemutex if it's a partitioned table
641+
* initializemutexes if it's a partitioned table
624642
*/
625643
if (IS_PARTITIONED(hctl))
626644
for (i=0;i<NUM_FREELISTS;i++)
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
902920
bool*foundPtr)
903921
{
904922
HASHHDR*hctl=hashp->hctl;
923+
intfreelist_idx=FREELIST_IDX(hctl,hashvalue);
905924
Sizekeysize;
906925
uint32bucket;
907926
longsegment_num;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
910929
HASHBUCKETcurrBucket;
911930
HASHBUCKET*prevBucketPtr;
912931
HashCompareFuncmatch;
913-
intfreelist_idx=FREELIST_IDX(hctl,hashvalue);
914932

915933
#ifHASH_STATISTICS
916934
hash_accesses++;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
9931011
if (IS_PARTITIONED(hctl))
9941012
SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
9951013

1014+
/* delete the record from the appropriate nentries counter. */
9961015
Assert(hctl->freeList[freelist_idx].nentries>0);
9971016
hctl->freeList[freelist_idx].nentries--;
9981017

9991018
/* remove record from hash bucket's chain. */
10001019
*prevBucketPtr=currBucket->link;
10011020

1002-
/* add the record to thefreelist for this table. */
1021+
/* add the record to theappropriate freelist. */
10031022
currBucket->link=hctl->freeList[freelist_idx].freeList;
10041023
hctl->freeList[freelist_idx].freeList=currBucket;
10051024

@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
12201239
}
12211240

12221241
/*
1223-
* create a new entry if possible
1242+
* Allocate a new hashtable entry if possible; return NULL if out of memory.
1243+
* (Or, if the underlying space allocator throws error for out-of-memory,
1244+
* we won't return at all.)
12241245
*/
12251246
staticHASHBUCKET
12261247
get_hash_entry(HTAB*hashp,intfreelist_idx)
12271248
{
12281249
HASHHDR*hctl=hashp->hctl;
12291250
HASHBUCKETnewElement;
1230-
intborrow_from_idx;
12311251

12321252
for (;;)
12331253
{
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12441264
if (IS_PARTITIONED(hctl))
12451265
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
12461266

1247-
/* no free elements. allocate another chunk of buckets */
1267+
/*
1268+
* No free elements in this freelist. In a partitioned table, there
1269+
* might be entries in other freelists, but to reduce contention we
1270+
* prefer to first try to get another chunk of buckets from the main
1271+
* shmem allocator. If that fails, though, we *MUST* root through all
1272+
* the other freelists before giving up. There are multiple callers
1273+
* that assume that they can allocate every element in the initially
1274+
* requested table size, or that deleting an element guarantees they
1275+
* can insert a new element, even if shared memory is entirely full.
1276+
* Failing because the needed element is in a different freelist is
1277+
* not acceptable.
1278+
*/
12481279
if (!element_alloc(hashp,hctl->nelem_alloc,freelist_idx))
12491280
{
1281+
intborrow_from_idx;
1282+
12501283
if (!IS_PARTITIONED(hctl))
12511284
returnNULL;/* out of memory */
12521285

1253-
/* try to borrow element from anotherpartition */
1286+
/* try to borrow element from anotherfreelist */
12541287
borrow_from_idx=freelist_idx;
12551288
for (;;)
12561289
{
12571290
borrow_from_idx= (borrow_from_idx+1) %NUM_FREELISTS;
12581291
if (borrow_from_idx==freelist_idx)
1259-
break;
1292+
break;/* examined all freelists, fail */
12601293

12611294
SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
12621295
newElement=hctl->freeList[borrow_from_idx].freeList;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12661299
hctl->freeList[borrow_from_idx].freeList=newElement->link;
12671300
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
12681301

1302+
/* careful: count the new element in its proper freelist */
12691303
SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
12701304
hctl->freeList[freelist_idx].nentries++;
12711305
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
12721306

1273-
break;
1307+
returnnewElement;
12741308
}
12751309

12761310
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
12771311
}
12781312

1279-
returnnewElement;
1313+
/* no elements available to borrow either, so out of memory */
1314+
returnNULL;
12801315
}
12811316
}
12821317

@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
13001335
longsum=hashp->hctl->freeList[0].nentries;
13011336

13021337
/*
1303-
* We currently don't bother with the mutex; it's only sensible to call
1304-
* this function if you've got lock on all partitions of the table.
1338+
* We currently don't bother with acquiring the mutexes; it's only
1339+
* sensible to call this function if you've got lock on all partitions of
1340+
* the table.
13051341
*/
1306-
1307-
if (!IS_PARTITIONED(hashp->hctl))
1308-
returnsum;
1309-
1310-
for (i=1;i<NUM_FREELISTS;i++)
1311-
sum+=hashp->hctl->freeList[i].nentries;
1342+
if (IS_PARTITIONED(hashp->hctl))
1343+
{
1344+
for (i=1;i<NUM_FREELISTS;i++)
1345+
sum+=hashp->hctl->freeList[i].nentries;
1346+
}
13121347

13131348
returnsum;
13141349
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp