1515 * to hash_create. This prevents any attempt to split buckets on-the-fly.
1616 * Therefore, each hash bucket chain operates independently, and no fields
1717 * of the hash header change after init except nentries and freeList.
18- * A partitioned table uses spinlocks to guard changes of those fields.
18+ * (A partitioned table uses multiple copies of those fields, guarded by
19+ * spinlocks, for additional concurrency.)
1920 * This lets any subset of the hash buckets be treated as a separately
2021 * lockable partition. We expect callers to use the low-order bits of a
2122 * lookup key's hash value as a partition number --- this will work because
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
121122typedef HASHBUCKET * HASHSEGMENT ;
122123
123124/*
124- * Using array of FreeListData instead of separate arrays of mutexes, nentries
125- * and freeLists prevents, at least partially, sharing one cache line between
126- * different mutexes (see below).
125+ * Per-freelist data.
126+ *
127+ * In a partitioned hash table, each freelist is associated with a specific
128+ * set of hashcodes, as determined by the FREELIST_IDX() macro below.
129+ * nentries tracks the number of live hashtable entries having those hashcodes
130+ * (NOT the number of entries in the freelist, as you might expect).
131+ *
132+ * The coverage of a freelist might be more or less than one partition, so it
133+ * needs its own lock rather than relying on caller locking. Relying on that
134+ * wouldn't work even if the coverage was the same, because of the occasional
135+ * need to "borrow" entries from another freelist; see get_hash_entry().
136+ *
137+ * Using an array of FreeListData instead of separate arrays of mutexes,
138+ * nentries and freeLists helps to reduce sharing of cache lines between
139+ * different mutexes.
127140 */
128141typedef struct
129142{
130- slock_t mutex ;/* spinlock */
131- long nentries ;/* number of entries */
132- HASHELEMENT * freeList ;/*list of free elements */
143+ slock_t mutex ;/* spinlockfor this freelist */
144+ long nentries ;/* number of entriesin associated buckets */
145+ HASHELEMENT * freeList ;/*chain of free elements */
133146}FreeListData ;
134147
135148/*
@@ -143,12 +156,14 @@ typedef struct
143156struct HASHHDR
144157{
145158/*
146- * The freelist can become a point of contention on high-concurrency hash
147- * tables, so we use an array of freelist, each with its own mutex and
148- * nentries count, instead of just a single one.
159+ * The freelist can become a point of contention in high-concurrency hash
160+ * tables, so we use an array of freelists, each with its own mutex and
161+ * nentries count, instead of just a single one. Although the freelists
162+ * normally operate independently, we will scavenge entries from freelists
163+ * other than a hashcode's default freelist when necessary.
149164 *
150- * If hash table is not partitioned only freeList[0] is used andspinlocks
151- *are not used at all.
165+ * Ifthe hash table is not partitioned, only freeList[0] is used andits
166+ *spinlock is not used at all; callers' locking is assumed sufficient .
152167 */
153168FreeListData freeList [NUM_FREELISTS ];
154169
@@ -184,7 +199,7 @@ struct HASHHDR
184199#define IS_PARTITIONED (hctl ) ((hctl)->num_partitions != 0)
185200
186201#define FREELIST_IDX (hctl ,hashcode ) \
187- (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
202+ (IS_PARTITIONED(hctl) ?( hashcode) % NUM_FREELISTS : 0)
188203
189204/*
190205 * Top control structure for a hashtable --- in a shared table, each backend
@@ -506,19 +521,22 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
506521nelem_alloc_first ;
507522
508523/*
509- * If hash table is partitioned all freeLists have equalnumber of
510- *elements. Otherwise only freeList[0] is used.
524+ * If hash table is partitioned, give each freelist an equalshare of
525+ *the initial allocation. Otherwise only freeList[0] is used.
511526 */
512527if (IS_PARTITIONED (hashp -> hctl ))
513528freelist_partitions = NUM_FREELISTS ;
514529else
515530freelist_partitions = 1 ;
516531
517532nelem_alloc = nelem /freelist_partitions ;
518- if (nelem_alloc = =0 )
533+ if (nelem_alloc < =0 )
519534nelem_alloc = 1 ;
520535
521- /* Make sure all memory will be used */
536+ /*
537+ * Make sure we'll allocate all the requested elements; freeList[0]
538+ * gets the excess if the request isn't divisible by NUM_FREELISTS.
539+ */
522540if (nelem_alloc * freelist_partitions < nelem )
523541nelem_alloc_first =
524542nelem - nelem_alloc * (freelist_partitions - 1 );
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
620638int i ;
621639
622640/*
623- * initializemutex if it's a partitioned table
641+ * initializemutexes if it's a partitioned table
624642 */
625643if (IS_PARTITIONED (hctl ))
626644for (i = 0 ;i < NUM_FREELISTS ;i ++ )
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
902920bool * foundPtr )
903921{
904922HASHHDR * hctl = hashp -> hctl ;
923+ int freelist_idx = FREELIST_IDX (hctl ,hashvalue );
905924Size keysize ;
906925uint32 bucket ;
907926long segment_num ;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
910929HASHBUCKET currBucket ;
911930HASHBUCKET * prevBucketPtr ;
912931HashCompareFunc match ;
913- int freelist_idx = FREELIST_IDX (hctl ,hashvalue );
914932
915933#if HASH_STATISTICS
916934hash_accesses ++ ;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
9931011if (IS_PARTITIONED (hctl ))
9941012SpinLockAcquire (& (hctl -> freeList [freelist_idx ].mutex ));
9951013
1014+ /* delete the record from the appropriate nentries counter. */
9961015Assert (hctl -> freeList [freelist_idx ].nentries > 0 );
9971016hctl -> freeList [freelist_idx ].nentries -- ;
9981017
9991018/* remove record from hash bucket's chain. */
10001019* prevBucketPtr = currBucket -> link ;
10011020
1002- /* add the record to thefreelist for this table. */
1021+ /* add the record to theappropriate freelist. */
10031022currBucket -> link = hctl -> freeList [freelist_idx ].freeList ;
10041023hctl -> freeList [freelist_idx ].freeList = currBucket ;
10051024
@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
12201239}
12211240
12221241/*
1223- * create a new entry if possible
1242+ * Allocate a new hashtable entry if possible; return NULL if out of memory.
1243+ * (Or, if the underlying space allocator throws error for out-of-memory,
1244+ * we won't return at all.)
12241245 */
12251246static HASHBUCKET
12261247get_hash_entry (HTAB * hashp ,int freelist_idx )
12271248{
12281249HASHHDR * hctl = hashp -> hctl ;
12291250HASHBUCKET newElement ;
1230- int borrow_from_idx ;
12311251
12321252for (;;)
12331253{
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12441264if (IS_PARTITIONED (hctl ))
12451265SpinLockRelease (& hctl -> freeList [freelist_idx ].mutex );
12461266
1247- /* no free elements. allocate another chunk of buckets */
1267+ /*
1268+ * No free elements in this freelist. In a partitioned table, there
1269+ * might be entries in other freelists, but to reduce contention we
1270+ * prefer to first try to get another chunk of buckets from the main
1271+ * shmem allocator. If that fails, though, we *MUST* root through all
1272+ * the other freelists before giving up. There are multiple callers
1273+ * that assume that they can allocate every element in the initially
1274+ * requested table size, or that deleting an element guarantees they
1275+ * can insert a new element, even if shared memory is entirely full.
1276+ * Failing because the needed element is in a different freelist is
1277+ * not acceptable.
1278+ */
12481279if (!element_alloc (hashp ,hctl -> nelem_alloc ,freelist_idx ))
12491280{
1281+ int borrow_from_idx ;
1282+
12501283if (!IS_PARTITIONED (hctl ))
12511284return NULL ;/* out of memory */
12521285
1253- /* try to borrow element from anotherpartition */
1286+ /* try to borrow element from anotherfreelist */
12541287borrow_from_idx = freelist_idx ;
12551288for (;;)
12561289{
12571290borrow_from_idx = (borrow_from_idx + 1 ) %NUM_FREELISTS ;
12581291if (borrow_from_idx == freelist_idx )
1259- break ;
1292+ break ;/* examined all freelists, fail */
12601293
12611294SpinLockAcquire (& (hctl -> freeList [borrow_from_idx ].mutex ));
12621295newElement = hctl -> freeList [borrow_from_idx ].freeList ;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12661299hctl -> freeList [borrow_from_idx ].freeList = newElement -> link ;
12671300SpinLockRelease (& (hctl -> freeList [borrow_from_idx ].mutex ));
12681301
1302+ /* careful: count the new element in its proper freelist */
12691303SpinLockAcquire (& hctl -> freeList [freelist_idx ].mutex );
12701304hctl -> freeList [freelist_idx ].nentries ++ ;
12711305SpinLockRelease (& hctl -> freeList [freelist_idx ].mutex );
12721306
1273- break ;
1307+ return newElement ;
12741308}
12751309
12761310SpinLockRelease (& (hctl -> freeList [borrow_from_idx ].mutex ));
12771311}
12781312
1279- return newElement ;
1313+ /* no elements available to borrow either, so out of memory */
1314+ return NULL ;
12801315}
12811316}
12821317
@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
13001335long sum = hashp -> hctl -> freeList [0 ].nentries ;
13011336
13021337/*
1303- * We currently don't bother with the mutex; it's only sensible to call
1304- * this function if you've got lock on all partitions of the table.
1338+ * We currently don't bother with acquiring the mutexes; it's only
1339+ * sensible to call this function if you've got lock on all partitions of
1340+ * the table.
13051341 */
1306-
1307- if (!IS_PARTITIONED (hashp -> hctl ))
1308- return sum ;
1309-
1310- for (i = 1 ;i < NUM_FREELISTS ;i ++ )
1311- sum += hashp -> hctl -> freeList [i ].nentries ;
1342+ if (IS_PARTITIONED (hashp -> hctl ))
1343+ {
1344+ for (i = 1 ;i < NUM_FREELISTS ;i ++ )
1345+ sum += hashp -> hctl -> freeList [i ].nentries ;
1346+ }
13121347
13131348return sum ;
13141349}