4343#include "access/htup_details.h"
4444#include "nodes/bitmapset.h"
4545#include "nodes/tidbitmap.h"
46- #include "utils/hsearch.h"
4746
4847/*
4948 * The maximum number of tuples per page is not large (typically 256 with
6160 * for that page in the page table.
6261 *
6362 * We actually store both exact pages and lossy chunks in the same hash
64- * table, using identical data structures. (This is becausedynahash.c's
65- *memory management doesn't allow space to be transferred easily from one
66- * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
67- * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
68- * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
69- * remainder operations. So, define it like this:
63+ * table, using identical data structures. (This is becausethe memory
64+ * managementfor hashtables doesn'teasily/efficiently allow space to be
65+ *transferred easily from one hashtable to another.) Therefore it's best
66+ *if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
67+ *too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
68+ *avoid expensive integer remainder operations. So, define it like this:
7069 */
7170#define PAGES_PER_CHUNK (BLCKSZ / 32)
7271
9796typedef struct PagetableEntry
9897{
9998BlockNumber blockno ;/* page number (hashtable key) */
99+ char status ;/* hash entry status */
100100bool ischunk ;/* T = lossy storage, F = exact */
101101bool recheck ;/* should the tuples be rechecked? */
102102bitmapword words [Max (WORDS_PER_PAGE ,WORDS_PER_CHUNK )];
103103}PagetableEntry ;
104104
105105/*
106- *dynahash.c is optimized for relatively large, long-lived hash tables.
107- *This is notideal for TIDBitMap, particularly when we are using a bitmap
108- * scan on the inside of a nestloop join: a bitmap may well live only long
109- * enough to accumulate one entry in such cases. We therefore avoid creating
110- * an actual hashtable until we need two pagetable entries. When just one
111- * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
112- * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
113- * to zero or one page again. So, status can be TBM_HASH even when nentries
114- * is zero or one.)
106+ *We want to avoid the overhead of creating the hashtable, which is
107+ *comparatively large, when notnecessary. Particularly when we are using a
108+ *bitmap scan on the inside of a nestloop join: a bitmap may well live only
109+ *long enough to accumulate one entry in such cases. We therefore avoid
110+ *creating an actual hashtable until we need two pagetable entries. When
111+ *just one pagetable entry is needed, we store it in a fixed field of
112+ *TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
113+ *shrinks down to zero or one page again. So, status can be TBM_HASH even
114+ *when nentries is zero or one.)
115115 */
116116typedef enum
117117{
@@ -128,12 +128,13 @@ struct TIDBitmap
128128NodeTag type ;/* to make it a valid Node */
129129MemoryContext mcxt ;/* memory context containing me */
130130TBMStatus status ;/* see codes above */
131- HTAB * pagetable ;/* hash table of PagetableEntry's */
131+ struct pagetable_hash * pagetable ;/* hash table of PagetableEntry's */
132132int nentries ;/* number of entries in pagetable */
133133int maxentries ;/* limit on same to meet maxbytes */
134134int npages ;/* number of exact entries in pagetable */
135135int nchunks ;/* number of lossy entries in pagetable */
136136bool iterating ;/* tbm_begin_iterate called? */
137+ uint32 lossify_start ;/* offset to start lossifying hashtable at */
137138PagetableEntry entry1 ;/* used when status == TBM_ONE_PAGE */
138139/* these are valid when iterating is true: */
139140PagetableEntry * * spages ;/* sorted exact-page list, or NULL */
@@ -168,6 +169,35 @@ static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
168169static void tbm_lossify (TIDBitmap * tbm );
169170static int tbm_comparator (const void * left ,const void * right );
170171
172+ /*
173+ * Simple inline murmur hash implementation for the exact width required, for
174+ * performance.
175+ */
176+ static inline uint32
177+ hash_blockno (BlockNumber b )
178+ {
179+ uint32 h = b ;
180+
181+ h ^=h >>16 ;
182+ h *=0x85ebca6b ;
183+ h ^=h >>13 ;
184+ h *=0xc2b2ae35 ;
185+ h ^=h >>16 ;
186+ return h ;
187+ }
188+
189+ /* define hashtable mapping block numbers to PagetableEntry's */
190+ #define SH_PREFIX pagetable
191+ #define SH_ELEMENT_TYPE PagetableEntry
192+ #define SH_KEY_TYPE BlockNumber
193+ #define SH_KEY blockno
194+ #define SH_HASH_KEY (tb ,key ) hash_blockno(key)
195+ #define SH_EQUAL (tb ,a ,b ) a == b
196+ #define SH_SCOPE static inline
197+ #define SH_DEFINE
198+ #define SH_DECLARE
199+ #include "lib/simplehash.h"
200+
171201
172202/*
173203 * tbm_create - create an initially-empty bitmap
@@ -190,17 +220,16 @@ tbm_create(long maxbytes)
190220
191221/*
192222 * Estimate number of hashtable entries we can have within maxbytes. This
193- * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
194- * pointer per hash entry, which is crude but good enough for our purpose.
195- * Also count an extra Pointer per entry for the arrays created during
196- * iteration readout.
223+ * estimates the hash cost as sizeof(PagetableEntry), which is good enough
224+ * for our purpose. Also count an extra Pointer per entry for the arrays
225+ * created during iteration readout.
197226 */
198227nbuckets = maxbytes /
199- (MAXALIGN (sizeof (HASHELEMENT ))+ MAXALIGN (sizeof (PagetableEntry ))
200- + sizeof (Pointer )+ sizeof (Pointer ));
228+ (sizeof (PagetableEntry )+ sizeof (Pointer )+ sizeof (Pointer ));
201229nbuckets = Min (nbuckets ,INT_MAX - 1 );/* safety limit */
202230nbuckets = Max (nbuckets ,16 );/* sanity limit */
203231tbm -> maxentries = (int )nbuckets ;
232+ tbm -> lossify_start = 0 ;
204233
205234return tbm ;
206235}
@@ -212,32 +241,25 @@ tbm_create(long maxbytes)
212241static void
213242tbm_create_pagetable (TIDBitmap * tbm )
214243{
215- HASHCTL hash_ctl ;
216-
217244Assert (tbm -> status != TBM_HASH );
218245Assert (tbm -> pagetable == NULL );
219246
220- /* Create the hashtable proper */
221- MemSet (& hash_ctl ,0 ,sizeof (hash_ctl ));
222- hash_ctl .keysize = sizeof (BlockNumber );
223- hash_ctl .entrysize = sizeof (PagetableEntry );
224- hash_ctl .hcxt = tbm -> mcxt ;
225- tbm -> pagetable = hash_create ("TIDBitmap" ,
226- 128 ,/* start small and extend */
227- & hash_ctl ,
228- HASH_ELEM |HASH_BLOBS |HASH_CONTEXT );
247+ tbm -> pagetable = pagetable_create (tbm -> mcxt ,128 );
229248
230249/* If entry1 is valid, push it into the hashtable */
231250if (tbm -> status == TBM_ONE_PAGE )
232251{
233252PagetableEntry * page ;
234253bool found ;
254+ char oldstatus ;
235255
236- page = ( PagetableEntry * ) hash_search (tbm -> pagetable ,
237- ( void * ) & tbm -> entry1 .blockno ,
238- HASH_ENTER , & found );
256+ page = pagetable_insert (tbm -> pagetable ,
257+ tbm -> entry1 .blockno ,
258+ & found );
239259Assert (!found );
260+ oldstatus = page -> status ;
240261memcpy (page ,& tbm -> entry1 ,sizeof (PagetableEntry ));
262+ page -> status = oldstatus ;
241263}
242264
243265tbm -> status = TBM_HASH ;
250272tbm_free (TIDBitmap * tbm )
251273{
252274if (tbm -> pagetable )
253- hash_destroy (tbm -> pagetable );
275+ pagetable_destroy (tbm -> pagetable );
254276if (tbm -> spages )
255277pfree (tbm -> spages );
256278if (tbm -> schunks )
@@ -357,12 +379,12 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b)
357379tbm_union_page (a ,& b -> entry1 );
358380else
359381{
360- HASH_SEQ_STATUS status ;
382+ pagetable_iterator i ;
361383PagetableEntry * bpage ;
362384
363385Assert (b -> status == TBM_HASH );
364- hash_seq_init ( & status , b -> pagetable );
365- while ((bpage = ( PagetableEntry * ) hash_seq_search ( & status ))!= NULL )
386+ pagetable_start_iterate ( b -> pagetable , & i );
387+ while ((bpage = pagetable_iterate ( b -> pagetable , & i ))!= NULL )
366388tbm_union_page (a ,bpage );
367389}
368390}
@@ -449,12 +471,12 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
449471}
450472else
451473{
452- HASH_SEQ_STATUS status ;
474+ pagetable_iterator i ;
453475PagetableEntry * apage ;
454476
455477Assert (a -> status == TBM_HASH );
456- hash_seq_init ( & status , a -> pagetable );
457- while ((apage = ( PagetableEntry * ) hash_seq_search ( & status ))!= NULL )
478+ pagetable_start_iterate ( a -> pagetable , & i );
479+ while ((apage = pagetable_iterate ( a -> pagetable , & i ))!= NULL )
458480{
459481if (tbm_intersect_page (a ,apage ,b ))
460482{
@@ -464,9 +486,7 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
464486else
465487a -> npages -- ;
466488a -> nentries -- ;
467- if (hash_search (a -> pagetable ,
468- (void * )& apage -> blockno ,
469- HASH_REMOVE ,NULL )== NULL )
489+ if (!pagetable_delete (a -> pagetable ,apage -> blockno ))
470490elog (ERROR ,"hash table corrupted" );
471491}
472492}
@@ -606,7 +626,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
606626 */
607627if (tbm -> status == TBM_HASH && !tbm -> iterating )
608628{
609- HASH_SEQ_STATUS status ;
629+ pagetable_iterator i ;
610630PagetableEntry * page ;
611631int npages ;
612632int nchunks ;
@@ -620,9 +640,9 @@ tbm_begin_iterate(TIDBitmap *tbm)
620640MemoryContextAlloc (tbm -> mcxt ,
621641tbm -> nchunks * sizeof (PagetableEntry * ));
622642
623- hash_seq_init (& status ,tbm -> pagetable );
624643npages = nchunks = 0 ;
625- while ((page = (PagetableEntry * )hash_seq_search (& status ))!= NULL )
644+ pagetable_start_iterate (tbm -> pagetable ,& i );
645+ while ((page = pagetable_iterate (tbm -> pagetable ,& i ))!= NULL )
626646{
627647if (page -> ischunk )
628648tbm -> schunks [nchunks ++ ]= page ;
@@ -791,9 +811,7 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
791811return page ;
792812}
793813
794- page = (PagetableEntry * )hash_search (tbm -> pagetable ,
795- (void * )& pageno ,
796- HASH_FIND ,NULL );
814+ page = pagetable_lookup (tbm -> pagetable ,pageno );
797815if (page == NULL )
798816return NULL ;
799817if (page -> ischunk )
@@ -834,15 +852,16 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
834852}
835853
836854/* Look up or create an entry */
837- page = (PagetableEntry * )hash_search (tbm -> pagetable ,
838- (void * )& pageno ,
839- HASH_ENTER ,& found );
855+ page = pagetable_insert (tbm -> pagetable ,pageno ,& found );
840856}
841857
842858/* Initialize it if not present before */
843859if (!found )
844860{
861+ char oldstatus = page -> status ;
862+
845863MemSet (page ,0 ,sizeof (PagetableEntry ));
864+ page -> status = oldstatus ;
846865page -> blockno = pageno ;
847866/* must count it too */
848867tbm -> nentries ++ ;
@@ -869,9 +888,9 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
869888
870889bitno = pageno %PAGES_PER_CHUNK ;
871890chunk_pageno = pageno - bitno ;
872- page = ( PagetableEntry * ) hash_search ( tbm -> pagetable ,
873- ( void * ) & chunk_pageno ,
874- HASH_FIND , NULL );
891+
892+ page = pagetable_lookup ( tbm -> pagetable , chunk_pageno );
893+
875894if (page != NULL && page -> ischunk )
876895{
877896int wordnum = WORDNUM (bitno );
@@ -912,9 +931,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
912931 */
913932if (bitno != 0 )
914933{
915- if (hash_search (tbm -> pagetable ,
916- (void * )& pageno ,
917- HASH_REMOVE ,NULL )!= NULL )
934+ if (pagetable_delete (tbm -> pagetable ,pageno ))
918935{
919936/* It was present, so adjust counts */
920937tbm -> nentries -- ;
@@ -923,14 +940,15 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
923940}
924941
925942/* Look up or create entry for chunk-header page */
926- page = (PagetableEntry * )hash_search (tbm -> pagetable ,
927- (void * )& chunk_pageno ,
928- HASH_ENTER ,& found );
943+ page = pagetable_insert (tbm -> pagetable ,chunk_pageno ,& found );
929944
930945/* Initialize it if not present before */
931946if (!found )
932947{
948+ char oldstatus = page -> status ;
949+
933950MemSet (page ,0 ,sizeof (PagetableEntry ));
951+ page -> status = oldstatus ;
934952page -> blockno = chunk_pageno ;
935953page -> ischunk = true;
936954/* must count it too */
@@ -939,8 +957,11 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
939957}
940958else if (!page -> ischunk )
941959{
960+ char oldstatus = page -> status ;
961+
942962/* chunk header page was formerly non-lossy, make it lossy */
943963MemSet (page ,0 ,sizeof (PagetableEntry ));
964+ page -> status = oldstatus ;
944965page -> blockno = chunk_pageno ;
945966page -> ischunk = true;
946967/* we assume it had some tuple bit(s) set, so mark it lossy */
@@ -962,7 +983,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
962983static void
963984tbm_lossify (TIDBitmap * tbm )
964985{
965- HASH_SEQ_STATUS status ;
986+ pagetable_iterator i ;
966987PagetableEntry * page ;
967988
968989/*
@@ -977,8 +998,8 @@ tbm_lossify(TIDBitmap *tbm)
977998Assert (!tbm -> iterating );
978999Assert (tbm -> status == TBM_HASH );
9791000
980- hash_seq_init ( & status ,tbm -> pagetable );
981- while ((page = ( PagetableEntry * ) hash_seq_search ( & status ))!= NULL )
1001+ pagetable_start_iterate_at ( tbm -> pagetable , & i ,tbm -> lossify_start );
1002+ while ((page = pagetable_iterate ( tbm -> pagetable , & i ))!= NULL )
9821003{
9831004if (page -> ischunk )
9841005continue ;/* already a chunk header */
@@ -995,15 +1016,19 @@ tbm_lossify(TIDBitmap *tbm)
9951016
9961017if (tbm -> nentries <=tbm -> maxentries /2 )
9971018{
998- /* we have done enough */
999- hash_seq_term (& status );
1019+ /*
1020+ * We have made enough room. Remember where to start lossifying
1021+ * next round, so we evenly iterate over the hashtable.
1022+ */
1023+ tbm -> lossify_start = i .cur ;
10001024break ;
10011025}
10021026
10031027/*
10041028 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1005- * hashtable. We can continue the same seq_search scan since we do
1006- * not care whether we visit lossy chunks or not.
1029+ * hashtable and may have deleted the non-lossy chunk. We can
1030+ * continue the same hash table scan, since failure to visit one
1031+ * element or visiting the newly inserted element, isn't fatal.
10071032 */
10081033}
10091034