Commit b154ee6

Get rid of artificial restriction on hash table sizes on Windows.
The point of introducing the hash_mem_multiplier GUC was to let users reproduce the old behavior of hash aggregation, i.e. that it could use more than work_mem at need. However, the implementation failed to get the job done on Win64, where work_mem is clamped to 2GB to protect various places that calculate memory sizes using "long int". As written, the same clamp was applied to hash_mem. This resulted in severe performance regressions for queries requiring a bit more than 2GB for hash aggregation, as they now spill to disk and there's no way to stop that.

Getting rid of the work_mem restriction seems like a good idea, but it's a big job and could not conceivably be back-patched. However, there's only a fairly small number of places that are concerned with the hash_mem value, and it turns out to be possible to remove the restriction there without too much code churn or any ABI breaks. So, let's do that for now to fix the regression, and leave the larger task for another day.

This patch does introduce a bit more infrastructure that should help with the larger task, namely pg_bitutils.h support for working with size_t values.

Per gripe from Laurent Hasson. Back-patch to v13 where the behavior change came in.

Discussion: https://postgr.es/m/997817.1627074924@sss.pgh.pa.us
Discussion: https://postgr.es/m/MN2PR15MB25601E80A9B6D1BA6F592B1985E39@MN2PR15MB2560.namprd15.prod.outlook.com
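
As a quick illustration of the approach taken in the diffs below (a minimal sketch in plain C with stand-in variables, not the actual PostgreSQL source; the work_mem and hash_mem_multiplier values here are only illustrative), the limit is now computed in bytes using double arithmetic and clamped to SIZE_MAX, rather than being forced through a work_mem-style kilobyte value that must fit a 32-bit "long" on Win64:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the GUC variables (not the real definitions) */
    static int    work_mem = 2097151;          /* KB: roughly the Win64 work_mem ceiling */
    static double hash_mem_multiplier = 2.0;

    static size_t
    hash_memory_limit_bytes(void)
    {
        /* Do the calculation in double so it cannot overflow ... */
        double limit = (double) work_mem * hash_mem_multiplier * 1024.0;

        /* ... then clamp in case it doesn't fit in size_t */
        if (limit > (double) SIZE_MAX)
            limit = (double) SIZE_MAX;
        return (size_t) limit;
    }

    int
    main(void)
    {
        /* Prints ~4GB; under the old KB-based clamp on Win64 this was limited to ~2GB */
        printf("%zu bytes\n", hash_memory_limit_bytes());
        return 0;
    }
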
1 parent 3d0a463 commit b154ee6

11 files changed: +172 −103 lines changed

src/backend/executor/execGrouping.c

Lines changed: 4 additions & 2 deletions

@@ -165,14 +165,16 @@ BuildTupleHashTableExt(PlanState *parent,
 {
 	TupleHashTable hashtable;
 	Size		entrysize = sizeof(TupleHashEntryData) + additionalsize;
-	int			hash_mem = get_hash_mem();
+	Size		hash_mem_limit;
 	MemoryContext oldcontext;
 	bool		allow_jit;
 
 	Assert(nbuckets > 0);
 
 	/* Limit initial table size request to not more than hash_mem */
-	nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
+	hash_mem_limit = get_hash_memory_limit() / entrysize;
+	if (nbuckets > hash_mem_limit)
+		nbuckets = hash_mem_limit;
 
 	oldcontext = MemoryContextSwitchTo(metacxt);
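
To see why clamping by division matters here, a rough worked example (values illustrative, not from the patch): the old form multiplied a kilobyte count by 1024 into a "long", which on Win64 cannot represent more than about 2GB, while the new form divides the size_t byte limit by the entry size and never multiplies at all. A minimal standalone sketch:

    #include <stddef.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Illustrative numbers only: ~4GB budget, 48-byte hash entries */
        size_t hash_mem_limit = (size_t) 4294965248u;   /* bytes */
        size_t entrysize = 48;
        size_t nbuckets = 200000000;                    /* caller's requested bucket count */
        size_t max_buckets = hash_mem_limit / entrysize;

        if (nbuckets > max_buckets)
            nbuckets = max_buckets;                     /* clamp without any multiplication */
        printf("nbuckets = %zu (cap %zu)\n", nbuckets, max_buckets);
        return 0;
    }
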

src/backend/executor/nodeAgg.c

Lines changed: 23 additions & 19 deletions

@@ -1801,15 +1801,15 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 {
 	int			npartitions;
 	Size		partition_mem;
-	int			hash_mem = get_hash_mem();
+	Size		hash_mem_limit = get_hash_memory_limit();
 
 	/* if not expected to spill, use all of hash_mem */
-	if (input_groups * hashentrysize < hash_mem * 1024L)
+	if (input_groups * hashentrysize <= hash_mem_limit)
 	{
 		if (num_partitions != NULL)
 			*num_partitions = 0;
-		*mem_limit = hash_mem * 1024L;
-		*ngroups_limit = *mem_limit / hashentrysize;
+		*mem_limit = hash_mem_limit;
+		*ngroups_limit = hash_mem_limit / hashentrysize;
 		return;
 	}
 
@@ -1834,10 +1834,10 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 	 * minimum number of partitions, so we aren't going to dramatically exceed
 	 * work mem anyway.
 	 */
-	if (hash_mem * 1024L > 4 * partition_mem)
-		*mem_limit = hash_mem * 1024L - partition_mem;
+	if (hash_mem_limit > 4 * partition_mem)
+		*mem_limit = hash_mem_limit - partition_mem;
 	else
-		*mem_limit = hash_mem * 1024L * 0.75;
+		*mem_limit = hash_mem_limit * 0.75;
 
 	if (*mem_limit > hashentrysize)
 		*ngroups_limit = *mem_limit / hashentrysize;
@@ -1991,32 +1991,36 @@ static int
 hash_choose_num_partitions(double input_groups, double hashentrysize,
 						   int used_bits, int *log2_npartitions)
 {
-	Size		mem_wanted;
-	int			partition_limit;
+	Size		hash_mem_limit = get_hash_memory_limit();
+	double		partition_limit;
+	double		mem_wanted;
+	double		dpartitions;
 	int			npartitions;
 	int			partition_bits;
-	int			hash_mem = get_hash_mem();
 
 	/*
 	 * Avoid creating so many partitions that the memory requirements of the
 	 * open partition files are greater than 1/4 of hash_mem.
 	 */
 	partition_limit =
-		(hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+		(hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
 		HASHAGG_WRITE_BUFFER_SIZE;
 
 	mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
 
 	/* make enough partitions so that each one is likely to fit in memory */
-	npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
+	dpartitions = 1 + (mem_wanted / hash_mem_limit);
+
+	if (dpartitions > partition_limit)
+		dpartitions = partition_limit;
 
-	if (npartitions > partition_limit)
-		npartitions = partition_limit;
+	if (dpartitions < HASHAGG_MIN_PARTITIONS)
+		dpartitions = HASHAGG_MIN_PARTITIONS;
+	if (dpartitions > HASHAGG_MAX_PARTITIONS)
+		dpartitions = HASHAGG_MAX_PARTITIONS;
 
-	if (npartitions < HASHAGG_MIN_PARTITIONS)
-		npartitions = HASHAGG_MIN_PARTITIONS;
-	if (npartitions > HASHAGG_MAX_PARTITIONS)
-		npartitions = HASHAGG_MAX_PARTITIONS;
+	/* HASHAGG_MAX_PARTITIONS limit makes this safe */
+	npartitions = (int) dpartitions;
 
 	/* ceil(log2(npartitions)) */
 	partition_bits = my_log2(npartitions);
@@ -2029,7 +2033,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
 	*log2_npartitions = partition_bits;
 
 	/* number of partitions will be a power of two */
-	npartitions = 1L << partition_bits;
+	npartitions = 1 << partition_bits;
 
 	return npartitions;
 }
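
The rewritten hash_choose_num_partitions() above now sizes partitions entirely in double arithmetic and only casts to int after clamping, so a very large mem_wanted can no longer overflow an integer. A small standalone sketch of that shape (the MIN/MAX constants below are illustrative stand-ins, not the real HASHAGG_* values):

    #include <stdio.h>

    #define MIN_PARTITIONS  4          /* stand-in for HASHAGG_MIN_PARTITIONS */
    #define MAX_PARTITIONS  1048576    /* stand-in for HASHAGG_MAX_PARTITIONS */

    static int
    choose_num_partitions(double mem_wanted, double hash_mem_limit,
                          double partition_limit)
    {
        /* Work in double so nothing overflows, then clamp before casting. */
        double dpartitions = 1 + (mem_wanted / hash_mem_limit);

        if (dpartitions > partition_limit)
            dpartitions = partition_limit;
        if (dpartitions < MIN_PARTITIONS)
            dpartitions = MIN_PARTITIONS;
        if (dpartitions > MAX_PARTITIONS)
            dpartitions = MAX_PARTITIONS;
        return (int) dpartitions;      /* safe: bounded by MAX_PARTITIONS */
    }

    int
    main(void)
    {
        /* e.g. ~1TB of projected data against a ~4GB per-partition budget */
        printf("%d partitions\n", choose_num_partitions(1e12, 4e9, 1e6));
        return 0;
    }
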

src/backend/executor/nodeHash.c

Lines changed: 84 additions & 60 deletions

@@ -675,15 +675,12 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 {
 	int			tupsize;
 	double		inner_rel_bytes;
-	long		bucket_bytes;
-	long		hash_table_bytes;
-	long		skew_table_bytes;
-	long		max_pointers;
-	long		mppow2;
+	size_t		hash_table_bytes;
+	size_t		bucket_bytes;
+	size_t		max_pointers;
 	int			nbatch = 1;
 	int			nbuckets;
 	double		dbuckets;
-	int			hash_mem = get_hash_mem();
 
 	/* Force a plausible relation size if no info */
 	if (ntuples <= 0.0)
@@ -700,17 +697,24 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	inner_rel_bytes = ntuples * tupsize;
 
 	/*
-	 * Target in-memory hashtable size is hash_mem kilobytes.
+	 * Compute in-memory hashtable size limit from GUCs.
 	 */
-	hash_table_bytes = hash_mem * 1024L;
+	hash_table_bytes = get_hash_memory_limit();
 
 	/*
 	 * Parallel Hash tries to use the combined hash_mem of all workers to
 	 * avoid the need to batch.  If that won't work, it falls back to hash_mem
 	 * per worker and tries to process batches in parallel.
 	 */
 	if (try_combined_hash_mem)
-		hash_table_bytes += hash_table_bytes * parallel_workers;
+	{
+		/* Careful, this could overflow size_t */
+		double		newlimit;
+
+		newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);
+		newlimit = Min(newlimit, (double) SIZE_MAX);
+		hash_table_bytes = (size_t) newlimit;
+	}
 
 	*space_allowed = hash_table_bytes;
 
@@ -730,53 +734,68 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	 */
 	if (useskew)
 	{
-		skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
+		size_t		bytes_per_mcv;
+		size_t		skew_mcvs;
 
 		/*----------
+		 * Compute number of MCVs we could hold in hash_table_bytes
+		 *
 		 * Divisor is:
 		 * size of a hash tuple +
 		 * worst-case size of skewBucket[] per MCV +
 		 * size of skewBucketNums[] entry +
 		 * size of skew bucket struct itself
 		 *----------
 		 */
-		*num_skew_mcvs = skew_table_bytes / (tupsize +
-											 (8 * sizeof(HashSkewBucket *)) +
-											 sizeof(int) +
-											 SKEW_BUCKET_OVERHEAD);
-		if (*num_skew_mcvs > 0)
-			hash_table_bytes -= skew_table_bytes;
+		bytes_per_mcv = tupsize +
+			(8 * sizeof(HashSkewBucket *)) +
+			sizeof(int) +
+			SKEW_BUCKET_OVERHEAD;
+		skew_mcvs = hash_table_bytes / bytes_per_mcv;
+
+		/*
+		 * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
+		 * not to worry about size_t overflow in the multiplication)
+		 */
+		skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;
+
+		/* Now clamp to integer range */
+		skew_mcvs = Min(skew_mcvs, INT_MAX);
+
+		*num_skew_mcvs = (int) skew_mcvs;
+
+		/* Reduce hash_table_bytes by the amount needed for the skew table */
+		if (skew_mcvs > 0)
+			hash_table_bytes -= skew_mcvs * bytes_per_mcv;
 	}
 	else
 		*num_skew_mcvs = 0;
 
 	/*
 	 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
 	 * memory is filled, assuming a single batch; but limit the value so that
-	 * the pointer arrays we'll try to allocate do not exceed hash_mem nor
-	 * MaxAllocSize.
+	 * the pointer arrays we'll try to allocate do not exceed hash_table_bytes
+	 * nor MaxAllocSize.
 	 *
 	 * Note that both nbuckets and nbatch must be powers of 2 to make
 	 * ExecHashGetBucketAndBatch fast.
 	 */
-	max_pointers = *space_allowed / sizeof(HashJoinTuple);
+	max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
 	max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
 	/* If max_pointers isn't a power of 2, must round it down to one */
-	mppow2 = 1L << my_log2(max_pointers);
-	if (max_pointers != mppow2)
-		max_pointers = mppow2 / 2;
+	max_pointers = pg_prevpower2_size_t(max_pointers);
 
 	/* Also ensure we avoid integer overflow in nbatch and nbuckets */
 	/* (this step is redundant given the current value of MaxAllocSize) */
-	max_pointers = Min(max_pointers, INT_MAX / 2);
+	max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
 
 	dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
 	dbuckets = Min(dbuckets, max_pointers);
 	nbuckets = (int) dbuckets;
 	/* don't let nbuckets be really small, though ... */
 	nbuckets = Max(nbuckets, 1024);
 	/* ... and force it to be a power of 2. */
-	nbuckets = 1 << my_log2(nbuckets);
+	nbuckets = pg_nextpower2_32(nbuckets);
 
 	/*
 	 * If there's not enough space to store the projected number of tuples and
@@ -786,10 +805,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
 	{
 		/* We'll need multiple batches */
-		long		lbuckets;
+		size_t		sbuckets;
 		double		dbatch;
 		int			minbatch;
-		long		bucket_size;
+		size_t		bucket_size;
 
 		/*
 		 * If Parallel Hash with combined hash_mem would still need multiple
@@ -813,10 +832,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		 * overhead for the hash code, pointer to the next tuple, etc.
 		 */
 		bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
-		lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);
-		lbuckets = Min(lbuckets, max_pointers);
-		nbuckets = (int) lbuckets;
-		nbuckets = 1 << my_log2(nbuckets);
+		sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);
+		sbuckets = Min(sbuckets, max_pointers);
+		nbuckets = (int) sbuckets;
+		nbuckets = pg_nextpower2_32(nbuckets);
 		bucket_bytes = nbuckets * sizeof(HashJoinTuple);
 
 		/*
@@ -1097,14 +1116,12 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 	/* Figure out how many batches to use. */
 	if (hashtable->nbatch == 1)
 	{
-		int			hash_mem = get_hash_mem();
-
 		/*
 		 * We are going from single-batch to multi-batch.  We need
 		 * to switch from one large combined memory budget to the
 		 * regular hash_mem budget.
 		 */
-		pstate->space_allowed = hash_mem * 1024L;
+		pstate->space_allowed = get_hash_memory_limit();
 
 		/*
 		 * The combined hash_mem of all participants wasn't
@@ -1113,7 +1130,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 		 * insufficient.  So try two batches per participant,
 		 * rounded up to a power of two.
 		 */
-		new_nbatch = 1 << my_log2(pstate->nparticipants * 2);
+		new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2);
 	}
 	else
 	{
@@ -1152,7 +1169,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 								 MaxAllocSize / sizeof(dsa_pointer_atomic));
 		new_nbuckets = (int) dbuckets;
 		new_nbuckets = Max(new_nbuckets, 1024);
-		new_nbuckets = 1 << my_log2(new_nbuckets);
+		new_nbuckets = pg_nextpower2_32(new_nbuckets);
 		dsa_free(hashtable->area, old_batch0->buckets);
 		hashtable->batches[0].shared->buckets =
 			dsa_allocate(hashtable->area,
@@ -3372,39 +3389,46 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
 }
 
 /*
- * Get a hash_mem value by multiplying the work_mem GUC's value by the
- * hash_mem_multiplier GUC's value.
- *
- * Returns a work_mem style KB value that hash-based nodes (including but not
- * limited to hash join) use in place of work_mem.  This is subject to the
- * same restrictions as work_mem itself.  (There is no such thing as the
- * hash_mem GUC, but it's convenient for our callers to pretend that there
- * is.)
+ * Calculate the limit on how much memory can be used by Hash and similar
+ * plan types.  This is work_mem times hash_mem_multiplier, and is
+ * expressed in bytes.
  *
- * Exported for use by the planner, as well as other hash-based executor
+ * Exported for use by the planner, as well as other hash-like executor
 * nodes.  This is a rather random place for this, but there is no better
 * place.
 */
+size_t
+get_hash_memory_limit(void)
+{
+	double		mem_limit;
+
+	/* Do initial calculation in double arithmetic */
+	mem_limit = (double) work_mem * hash_mem_multiplier * 1024.0;
+
+	/* Clamp in case it doesn't fit in size_t */
+	mem_limit = Min(mem_limit, (double) SIZE_MAX);
+
+	return (size_t) mem_limit;
+}
+
+/*
+ * Convert the hash memory limit to an integer number of kilobytes,
+ * that is something comparable to work_mem.  Like work_mem, we clamp
+ * the result to ensure that multiplying it by 1024 fits in a long int.
+ *
+ * This is deprecated since it may understate the actual memory limit.
+ * It is unused in core and will eventually be removed.
+ */
 int
 get_hash_mem(void)
 {
-	double		hash_mem;
-
-	Assert(hash_mem_multiplier >= 1.0);
+	size_t		mem_limit = get_hash_memory_limit();
 
-	hash_mem = (double) work_mem * hash_mem_multiplier;
+	/* Remove the kilobyte factor */
+	mem_limit /= 1024;
 
-	/*
-	 * guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
-	 * support the assumption that raw derived byte values can be stored in
-	 * 'long' variables.  The returned hash_mem value must also meet this
-	 * assumption.
-	 *
-	 * We clamp the final value rather than throw an error because it should
-	 * be possible to set work_mem and hash_mem_multiplier independently.
-	 */
-	if (hash_mem < MAX_KILOBYTES)
-		return (int) hash_mem;
+	/* Clamp to MAX_KILOBYTES, like work_mem */
+	mem_limit = Min(mem_limit, (size_t) MAX_KILOBYTES);
 
-	return MAX_KILOBYTES;
+	return (int) mem_limit;
 }
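
The commit message also mentions new pg_bitutils.h support for size_t values (pg_nextpower2_size_t and pg_prevpower2_size_t are used in the hunks above), but that header's diff is not part of this excerpt. As a rough, hedged sketch of what such helpers compute (portable loop-based versions assuming a nonzero argument that is small enough for the next power of two to still fit; the real header presumably dispatches to its existing 32-/64-bit bit-twiddling helpers instead):

    #include <stddef.h>
    #include <stdio.h>

    /* Smallest power of 2 >= num; assumes 1 <= num <= SIZE_MAX / 2 + 1 */
    static inline size_t
    next_power_of_2_size_t(size_t num)
    {
        size_t result = 1;

        while (result < num)
            result <<= 1;
        return result;
    }

    /* Largest power of 2 <= num; assumes num >= 1 */
    static inline size_t
    prev_power_of_2_size_t(size_t num)
    {
        size_t result = 1;

        while (num >>= 1)
            result <<= 1;
        return result;
    }

    int
    main(void)
    {
        /* Prints "4096 2048" */
        printf("%zu %zu\n", next_power_of_2_size_t(3000), prev_power_of_2_size_t(3000));
        return 0;
    }
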

src/backend/executor/nodeMemoize.c

Lines changed: 1 addition & 1 deletion

@@ -905,7 +905,7 @@ ExecInitMemoize(Memoize *node, EState *estate, int eflags)
 	mstate->mem_used = 0;
 
 	/* Limit the total memory consumed by the cache to this */
-	mstate->mem_limit = get_hash_mem() * 1024L;
+	mstate->mem_limit = get_hash_memory_limit();
 
 	/* A memory context dedicated for the cache */
 	mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext,

0 commit comments
