  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
 #define DEF_DIRSIZE            256
 #define DEF_FFACTOR            1    /* default fill factor */
 
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS          32
 
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
 
 /* A hash segment is an array of bucket headers */
 typedef HASHBUCKET *HASHSEGMENT;
 
+/*
+ * Using an array of FreeListData structs, instead of separate arrays of
+ * mutexes, nentries and freeLists, at least partially prevents sharing one
+ * cache line between different mutexes (see below).
+ */
+typedef struct
+{
+    slock_t     mutex;          /* spinlock */
+    long        nentries;       /* number of entries */
+    HASHELEMENT *freeList;      /* list of free elements */
+} FreeListData;
+
 /*
  * Header structure for a hash table --- contains all changeable info
  *
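The "at least partially" wording in the FreeListData comment above is the key caveat: each slot is only about two dozen bytes, so several neighbouring slots (and their mutexes) can still land on one cache line. As a purely hypothetical, standalone sketch of how to rule that out completely, one could pad every slot to a full line; the stand-in field types and the 64-byte line size below are assumptions for illustration, not part of this patch:

    /*
     * Hypothetical padded variant, for illustration only.  slock_t and
     * HASHELEMENT are stubbed out so the snippet is self-contained.
     */
    #include <stdio.h>

    #define ASSUMED_CACHE_LINE_SIZE 64

    typedef struct
    {
        int         mutex;          /* stand-in for slock_t */
        long        nentries;       /* number of entries */
        void       *freeList;       /* stand-in for HASHELEMENT * */
    } FreeListDataSketch;

    typedef union
    {
        FreeListDataSketch data;
        char        pad[ASSUMED_CACHE_LINE_SIZE];
    } PaddedFreeListSketch;

    int
    main(void)
    {
        /* adjacent array slots now start on distinct cache lines */
        printf("unpadded=%zu padded=%zu\n",
               sizeof(FreeListDataSketch), sizeof(PaddedFreeListSketch));
        return 0;
    }

The patch itself stops at grouping each mutex with the nentries and freeList it protects, which already keeps unrelated mutexes from being packed back to back the way a bare slock_t array would.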
@@ -128,12 +142,15 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-    /* In a partitioned table, take this lock to touch nentries or freeList */
-    slock_t     mutex;          /* unused if not partitioned table */
-
-    /* These fields change during entry addition/deletion */
-    long        nentries;       /* number of entries in hash table */
-    HASHELEMENT *freeList;      /* linked list of free elements */
+    /*
+     * The freelist can become a point of contention in high-concurrency hash
+     * tables, so we use an array of freelists, each with its own mutex and
+     * nentries count, instead of just a single one.
+     *
+     * If the hash table is not partitioned, only freeList[0] is used and the
+     * spinlocks are not used at all.
+     */
+    FreeListData freeList[NUM_FREELISTS];
 
     /* These fields can change, but not in a partitioned table */
     /* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +183,9 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)    ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) \
+    (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
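FREELIST_IDX above is the only place where a hash value is tied to a freelist: a partitioned table spreads entries over all NUM_FREELISTS freelists using the low-order bits of the hash, while an unpartitioned table keeps everything on freeList[0] and never takes the spinlocks. A minimal standalone sketch of that mapping (the function and the main() driver are illustrative, not part of dynahash.c):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NUM_FREELISTS 32

    /* mirrors FREELIST_IDX(): non-partitioned tables always use freelist 0 */
    static int
    freelist_idx(bool partitioned, uint32_t hashvalue)
    {
        return partitioned ? (int) (hashvalue % NUM_FREELISTS) : 0;
    }

    int
    main(void)
    {
        /* hash values differing only in their high bits share a freelist */
        printf("%d %d %d\n",
               freelist_idx(true, 0x12345678),     /* 0x78 = 120, 120 % 32 = 24 */
               freelist_idx(true, 0xAB345678),     /* same low bits, also 24 */
               freelist_idx(false, 0x12345678));   /* always 0 */
        return 0;
    }

Since the partition count of a partitioned table is required to be a power of two, the entries protected by one partition lock map onto a fixed subset of the freelists.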
@@ -219,10 +239,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int  choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -482,10 +502,40 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
     if ((flags & HASH_SHARED_MEM) ||
         nelem < hctl->nelem_alloc)
     {
-        if (!element_alloc(hashp, (int) nelem))
-            ereport(ERROR,
-                    (errcode(ERRCODE_OUT_OF_MEMORY),
-                     errmsg("out of memory")));
+        int         i,
+                    freelist_partitions,
+                    nelem_alloc,
+                    nelem_alloc_first;
+
+        /*
+         * If the hash table is partitioned, all freelists get an equal
+         * number of elements.  Otherwise only freeList[0] is used.
+         */
+        if (IS_PARTITIONED(hashp->hctl))
+            freelist_partitions = NUM_FREELISTS;
+        else
+            freelist_partitions = 1;
+
+        nelem_alloc = nelem / freelist_partitions;
+        if (nelem_alloc == 0)
+            nelem_alloc = 1;
+
+        /* Make sure all the requested elements are accounted for */
+        if (nelem_alloc * freelist_partitions < nelem)
+            nelem_alloc_first =
+                nelem - nelem_alloc * (freelist_partitions - 1);
+        else
+            nelem_alloc_first = nelem_alloc;
+
+        for (i = 0; i < freelist_partitions; i++)
+        {
+            int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+            if (!element_alloc(hashp, temp, i))
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+        }
     }
 
     if (flags & HASH_FIXED_SIZE)
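The split above is just integer division with the remainder pushed onto freeList[0], so no part of the requested nelem is lost. A standalone sketch of that arithmetic (the helper name and the main() driver are mine, for illustration only):

    #include <assert.h>
    #include <stdio.h>

    #define NUM_FREELISTS 32

    /* mirror of the nelem_alloc / nelem_alloc_first computation above */
    static void
    split_nelem(long nelem, int freelist_partitions,
                long *nelem_alloc, long *nelem_alloc_first)
    {
        *nelem_alloc = nelem / freelist_partitions;
        if (*nelem_alloc == 0)
            *nelem_alloc = 1;

        if (*nelem_alloc * freelist_partitions < nelem)
            *nelem_alloc_first = nelem - *nelem_alloc * (freelist_partitions - 1);
        else
            *nelem_alloc_first = *nelem_alloc;
    }

    int
    main(void)
    {
        long        nelem_alloc,
                    nelem_alloc_first;

        /* 1000 / 32 = 31; 31 * 32 = 992 < 1000, so freelist 0 gets 39 */
        split_nelem(1000, NUM_FREELISTS, &nelem_alloc, &nelem_alloc_first);
        printf("first=%ld rest=%ld total=%ld\n",
               nelem_alloc_first, nelem_alloc,
               nelem_alloc_first + nelem_alloc * (NUM_FREELISTS - 1));
        assert(nelem_alloc_first + nelem_alloc * (NUM_FREELISTS - 1) >= 1000);
        return 0;
    }

For very small nelem the rounding goes the other way: each freelist still gets at least one element, so a partitioned table may pre-allocate slightly more than requested.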
@@ -503,9 +553,6 @@ hdefault(HTAB *hashp)
 
     MemSet(hctl, 0, sizeof(HASHHDR));
 
-    hctl->nentries = 0;
-    hctl->freeList = NULL;
-
     hctl->dsize = DEF_DIRSIZE;
     hctl->nsegs = 0;
 
@@ -572,12 +619,14 @@ init_htab(HTAB *hashp, long nelem)
     HASHSEGMENT *segp;
     int         nbuckets;
     int         nsegs;
+    int         i;
 
     /*
      * initialize mutex if it's a partitioned table
      */
     if (IS_PARTITIONED(hctl))
-        SpinLockInit(&hctl->mutex);
+        for (i = 0; i < NUM_FREELISTS; i++)
+            SpinLockInit(&(hctl->freeList[i].mutex));
 
     /*
      * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +697,7 @@ init_htab(HTAB *hashp, long nelem)
             "HIGH MASK       ", hctl->high_mask,
             "LOW  MASK       ", hctl->low_mask,
             "NSEGS           ", hctl->nsegs,
-            "NENTRIES        ", hctl->nentries);
+            "NENTRIES        ", hash_get_num_entries(hashp));
 #endif
     return true;
 }
@@ -769,7 +818,7 @@ hash_stats(const char *where, HTAB *hashp)
             where, hashp->hctl->accesses, hashp->hctl->collisions);
 
     fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-            hashp->hctl->nentries, (long) hashp->hctl->keysize,
+            hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
             hashp->hctl->max_bucket, hashp->hctl->nsegs);
     fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
             where, hash_accesses, hash_collisions);
@@ -863,6 +912,7 @@ hash_search_with_hash_value(HTAB *hashp,
     HASHBUCKET  currBucket;
     HASHBUCKET *prevBucketPtr;
     HashCompareFunc match;
+    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
     hash_accesses++;
@@ -885,7 +935,7 @@ hash_search_with_hash_value(HTAB *hashp,
          * order of these tests is to try to check cheaper conditions first.
          */
         if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-            hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+            hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
             !has_seq_scans(hashp))
             (void) expand_table(hashp);
     }
@@ -943,20 +993,20 @@ hash_search_with_hash_value(HTAB *hashp,
             {
                 /* if partitioned, must lock to touch nentries and freeList */
                 if (IS_PARTITIONED(hctl))
-                    SpinLockAcquire(&hctl->mutex);
+                    SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
-                Assert(hctl->nentries > 0);
-                hctl->nentries--;
+                Assert(hctl->freeList[freelist_idx].nentries > 0);
+                hctl->freeList[freelist_idx].nentries--;
 
                 /* remove record from hash bucket's chain. */
                 *prevBucketPtr = currBucket->link;
 
                 /* add the record to the freelist for this table. */
-                currBucket->link = hctl->freeList;
-                hctl->freeList = currBucket;
+                currBucket->link = hctl->freeList[freelist_idx].freeList;
+                hctl->freeList[freelist_idx].freeList = currBucket;
 
                 if (IS_PARTITIONED(hctl))
-                    SpinLockRelease(&hctl->mutex);
+                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
                 /*
                  * better hope the caller is synchronizing access to this
@@ -982,7 +1032,7 @@ hash_search_with_hash_value(HTAB *hashp,
                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                      hashp->tabname);
 
-            currBucket = get_hash_entry(hashp);
+            currBucket = get_hash_entry(hashp, freelist_idx);
             if (currBucket == NULL)
             {
                 /* out of memory */
@@ -1175,39 +1225,69 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
-    HASHHDR    *hctl = hashp->hctl;
+    HASHHDR    *hctl = hashp->hctl;
     HASHBUCKET  newElement;
+    int         borrow_from_idx;
 
     for (;;)
     {
         /* if partitioned, must lock to touch nentries and freeList */
         if (IS_PARTITIONED(hctl))
-            SpinLockAcquire(&hctl->mutex);
+            SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
         /* try to get an entry from the freelist */
-        newElement = hctl->freeList;
+        newElement = hctl->freeList[freelist_idx].freeList;
+
         if (newElement != NULL)
             break;
 
-        /* no free elements.  allocate another chunk of buckets */
         if (IS_PARTITIONED(hctl))
-            SpinLockRelease(&hctl->mutex);
+            SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-        if (!element_alloc(hashp, hctl->nelem_alloc))
+        /* no free elements.  allocate another chunk of buckets */
+        if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
         {
-            /* out of memory */
-            return NULL;
+            if (!IS_PARTITIONED(hctl))
+                return NULL;    /* out of memory */
+
+            /* try to borrow an element from another freelist */
+            borrow_from_idx = freelist_idx;
+            for (;;)
+            {
+                borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+                if (borrow_from_idx == freelist_idx)
+                    break;
+
+                SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
+                newElement = hctl->freeList[borrow_from_idx].freeList;
+
+                if (newElement != NULL)
+                {
+                    hctl->freeList[borrow_from_idx].freeList = newElement->link;
+                    SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+
+                    SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+                    hctl->freeList[freelist_idx].nentries++;
+                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+                    break;
+                }
+
+                SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+            }
+
+            return newElement;
         }
     }
 
     /* remove entry from freelist, bump nentries */
-    hctl->freeList = newElement->link;
-    hctl->nentries++;
+    hctl->freeList[freelist_idx].freeList = newElement->link;
+    hctl->freeList[freelist_idx].nentries++;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return newElement;
 }
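Two details of the borrowing path above are worth spelling out. First, the scan is a plain round-robin over the other freelists, starting just past freelist_idx and giving up only after wrapping all the way around. Second, a borrowed element is charged to freelist_idx's nentries rather than the donor's, so the count stays with the freelist that will receive the element back when it is later freed via HASH_REMOVE (which uses the same FREELIST_IDX mapping). A tiny standalone sketch of just the scan order (illustrative only, not patch code):

    #include <stdio.h>

    #define NUM_FREELISTS 32

    int
    main(void)
    {
        int         freelist_idx = 30;      /* the exhausted freelist */
        int         borrow_from_idx = freelist_idx;

        /* visits 31, 0, 1, ..., 29 and then stops */
        for (;;)
        {
            borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
            if (borrow_from_idx == freelist_idx)
                break;
            printf("%d ", borrow_from_idx);
        }
        printf("\n");
        return 0;
    }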
@@ -1218,11 +1298,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+    int         i;
+    long        sum = hashp->hctl->freeList[0].nentries;
+
     /*
      * We currently don't bother with the mutex; it's only sensible to call
      * this function if you've got lock on all partitions of the table.
      */
-    return hashp->hctl->nentries;
+
+    if (!IS_PARTITIONED(hashp->hctl))
+        return sum;
+
+    for (i = 1; i < NUM_FREELISTS; i++)
+        sum += hashp->hctl->freeList[i].nentries;
+
+    return sum;
 }
 
 /*
@@ -1527,12 +1617,12 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the free list
+ * allocate some new elements and link them into the indicated free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
-    HASHHDR    *hctl = hashp->hctl;
+    HASHHDR    *hctl = hashp->hctl;
     Size        elementSize;
     HASHELEMENT *firstElement;
     HASHELEMENT *tmpElement;
@@ -1563,14 +1653,14 @@ element_alloc(HTAB *hashp, int nelem)
 
     /* if partitioned, must lock to touch freeList */
     if (IS_PARTITIONED(hctl))
-        SpinLockAcquire(&hctl->mutex);
+        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
     /* freelist could be nonempty if two backends did this concurrently */
-    firstElement->link = hctl->freeList;
-    hctl->freeList = prevElement;
+    firstElement->link = hctl->freeList[freelist_idx].freeList;
+    hctl->freeList[freelist_idx].freeList = prevElement;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return true;
 }