Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 28d9360

Browse files
committed
Get rid of artificial restriction on hash table sizes on Windows.
The point of introducing the hash_mem_multiplier GUC was to let users reproduce the old behavior of hash aggregation, i.e. that it could use more than work_mem at need. However, the implementation failed to get the job done on Win64, where work_mem is clamped to 2GB to protect various places that calculate memory sizes using "long int". As written, the same clamp was applied to hash_mem. This resulted in severe performance regressions for queries requiring a bit more than 2GB for hash aggregation, as they now spill to disk and there's no way to stop that.

Getting rid of the work_mem restriction seems like a good idea, but it's a big job and could not conceivably be back-patched. However, there's only a fairly small number of places that are concerned with the hash_mem value, and it turns out to be possible to remove the restriction there without too much code churn or any ABI breaks. So, let's do that for now to fix the regression, and leave the larger task for another day.

This patch does introduce a bit more infrastructure that should help with the larger task, namely pg_bitutils.h support for working with size_t values.

Per gripe from Laurent Hasson. Back-patch to v13 where the behavior change came in.

Discussion: https://postgr.es/m/997817.1627074924@sss.pgh.pa.us
Discussion: https://postgr.es/m/MN2PR15MB25601E80A9B6D1BA6F592B1985E39@MN2PR15MB2560.namprd15.prod.outlook.com
1 parent d9d8aa9 · commit 28d9360

File tree

12 files changed

+153
-111
lines changed

12 files changed

+153
-111
lines changed

‎src/backend/executor/execGrouping.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,16 @@ BuildTupleHashTableExt(PlanState *parent,
165165
{
166166
TupleHashTablehashtable;
167167
Sizeentrysize=sizeof(TupleHashEntryData)+additionalsize;
168-
inthash_mem=get_hash_mem();
168+
Sizehash_mem_limit;
169169
MemoryContextoldcontext;
170170
boolallow_jit;
171171

172172
Assert(nbuckets>0);
173173

174174
/* Limit initial table size request to not more than hash_mem */
175-
nbuckets=Min(nbuckets, (long) ((hash_mem*1024L) /entrysize));
175+
hash_mem_limit=get_hash_memory_limit() /entrysize;
176+
if (nbuckets>hash_mem_limit)
177+
nbuckets=hash_mem_limit;
176178

177179
oldcontext=MemoryContextSwitchTo(metacxt);
178180

‎src/backend/executor/nodeAgg.c

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1802,15 +1802,15 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
18021802
{
18031803
intnpartitions;
18041804
Sizepartition_mem;
1805-
inthash_mem=get_hash_mem();
1805+
Sizehash_mem_limit=get_hash_memory_limit();
18061806

18071807
/* if not expected to spill, use all of hash_mem */
1808-
if (input_groups*hashentrysize<hash_mem*1024L)
1808+
if (input_groups*hashentrysize <=hash_mem_limit)
18091809
{
18101810
if (num_partitions!=NULL)
18111811
*num_partitions=0;
1812-
*mem_limit=hash_mem*1024L;
1813-
*ngroups_limit=*mem_limit /hashentrysize;
1812+
*mem_limit=hash_mem_limit;
1813+
*ngroups_limit=hash_mem_limit /hashentrysize;
18141814
return;
18151815
}
18161816

@@ -1835,10 +1835,10 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
18351835
* minimum number of partitions, so we aren't going to dramatically exceed
18361836
* work mem anyway.
18371837
*/
1838-
if (hash_mem*1024L>4*partition_mem)
1839-
*mem_limit=hash_mem*1024L-partition_mem;
1838+
if (hash_mem_limit>4*partition_mem)
1839+
*mem_limit=hash_mem_limit-partition_mem;
18401840
else
1841-
*mem_limit=hash_mem*1024L*0.75;
1841+
*mem_limit=hash_mem_limit*0.75;
18421842

18431843
if (*mem_limit>hashentrysize)
18441844
*ngroups_limit=*mem_limit /hashentrysize;
@@ -1992,32 +1992,36 @@ static int
19921992
hash_choose_num_partitions(doubleinput_groups,doublehashentrysize,
19931993
intused_bits,int*log2_npartitions)
19941994
{
1995-
Sizemem_wanted;
1996-
intpartition_limit;
1995+
Sizehash_mem_limit=get_hash_memory_limit();
1996+
doublepartition_limit;
1997+
doublemem_wanted;
1998+
doubledpartitions;
19971999
intnpartitions;
19982000
intpartition_bits;
1999-
inthash_mem=get_hash_mem();
20002001

20012002
/*
20022003
* Avoid creating so many partitions that the memory requirements of the
20032004
* open partition files are greater than 1/4 of hash_mem.
20042005
*/
20052006
partition_limit=
2006-
(hash_mem*1024L*0.25-HASHAGG_READ_BUFFER_SIZE) /
2007+
(hash_mem_limit*0.25-HASHAGG_READ_BUFFER_SIZE) /
20072008
HASHAGG_WRITE_BUFFER_SIZE;
20082009

20092010
mem_wanted=HASHAGG_PARTITION_FACTOR*input_groups*hashentrysize;
20102011

20112012
/* make enough partitions so that each one is likely to fit in memory */
2012-
npartitions=1+ (mem_wanted / (hash_mem*1024L));
2013+
dpartitions=1+ (mem_wanted /hash_mem_limit);
2014+
2015+
if (dpartitions>partition_limit)
2016+
dpartitions=partition_limit;
20132017

2014-
if (npartitions>partition_limit)
2015-
npartitions=partition_limit;
2018+
if (dpartitions<HASHAGG_MIN_PARTITIONS)
2019+
dpartitions=HASHAGG_MIN_PARTITIONS;
2020+
if (dpartitions>HASHAGG_MAX_PARTITIONS)
2021+
dpartitions=HASHAGG_MAX_PARTITIONS;
20162022

2017-
if (npartitions<HASHAGG_MIN_PARTITIONS)
2018-
npartitions=HASHAGG_MIN_PARTITIONS;
2019-
if (npartitions>HASHAGG_MAX_PARTITIONS)
2020-
npartitions=HASHAGG_MAX_PARTITIONS;
2023+
/* HASHAGG_MAX_PARTITIONS limit makes this safe */
2024+
npartitions= (int)dpartitions;
20212025

20222026
/* ceil(log2(npartitions)) */
20232027
partition_bits=my_log2(npartitions);
@@ -2030,7 +2034,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
20302034
*log2_npartitions=partition_bits;
20312035

20322036
/* number of partitions will be a power of two */
2033-
npartitions=1L <<partition_bits;
2037+
npartitions=1 <<partition_bits;
20342038

20352039
returnnpartitions;
20362040
}

‎src/backend/executor/nodeHash.c

Lines changed: 64 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -675,15 +675,12 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
675675
{
676676
inttupsize;
677677
doubleinner_rel_bytes;
678-
longbucket_bytes;
679-
longhash_table_bytes;
680-
longskew_table_bytes;
681-
longmax_pointers;
682-
longmppow2;
678+
size_thash_table_bytes;
679+
size_tbucket_bytes;
680+
size_tmax_pointers;
683681
intnbatch=1;
684682
intnbuckets;
685683
doubledbuckets;
686-
inthash_mem=get_hash_mem();
687684

688685
/* Force a plausible relation size if no info */
689686
if (ntuples <=0.0)
@@ -700,17 +697,24 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
700697
inner_rel_bytes=ntuples*tupsize;
701698

702699
/*
703-
*Target in-memory hashtable sizeis hash_mem kilobytes.
700+
*Compute in-memory hashtable sizelimit from GUCs.
704701
*/
705-
hash_table_bytes=hash_mem*1024L;
702+
hash_table_bytes=get_hash_memory_limit();
706703

707704
/*
708705
* Parallel Hash tries to use the combined hash_mem of all workers to
709706
* avoid the need to batch. If that won't work, it falls back to hash_mem
710707
* per worker and tries to process batches in parallel.
711708
*/
712709
if (try_combined_hash_mem)
713-
hash_table_bytes+=hash_table_bytes*parallel_workers;
710+
{
711+
/* Careful, this could overflow size_t */
712+
doublenewlimit;
713+
714+
newlimit= (double)hash_table_bytes* (double) (parallel_workers+1);
715+
newlimit=Min(newlimit, (double)SIZE_MAX);
716+
hash_table_bytes= (size_t)newlimit;
717+
}
714718

715719
*space_allowed=hash_table_bytes;
716720

@@ -730,53 +734,68 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
730734
*/
731735
if (useskew)
732736
{
733-
skew_table_bytes=hash_table_bytes*SKEW_HASH_MEM_PERCENT /100;
737+
size_tbytes_per_mcv;
738+
size_tskew_mcvs;
734739

735740
/*----------
741+
* Compute number of MCVs we could hold in hash_table_bytes
742+
*
736743
* Divisor is:
737744
* size of a hash tuple +
738745
* worst-case size of skewBucket[] per MCV +
739746
* size of skewBucketNums[] entry +
740747
* size of skew bucket struct itself
741748
*----------
742749
*/
743-
*num_skew_mcvs=skew_table_bytes / (tupsize+
744-
(8*sizeof(HashSkewBucket*))+
745-
sizeof(int)+
746-
SKEW_BUCKET_OVERHEAD);
747-
if (*num_skew_mcvs>0)
748-
hash_table_bytes-=skew_table_bytes;
750+
bytes_per_mcv=tupsize+
751+
(8*sizeof(HashSkewBucket*))+
752+
sizeof(int)+
753+
SKEW_BUCKET_OVERHEAD;
754+
skew_mcvs=hash_table_bytes /bytes_per_mcv;
755+
756+
/*
757+
* Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
758+
* not to worry about size_t overflow in the multiplication)
759+
*/
760+
skew_mcvs= (skew_mcvs*SKEW_HASH_MEM_PERCENT) /100;
761+
762+
/* Now clamp to integer range */
763+
skew_mcvs=Min(skew_mcvs,INT_MAX);
764+
765+
*num_skew_mcvs= (int)skew_mcvs;
766+
767+
/* Reduce hash_table_bytes by the amount needed for the skew table */
768+
if (skew_mcvs>0)
769+
hash_table_bytes-=skew_mcvs*bytes_per_mcv;
749770
}
750771
else
751772
*num_skew_mcvs=0;
752773

753774
/*
754775
* Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
755776
* memory is filled, assuming a single batch; but limit the value so that
756-
* the pointer arrays we'll try to allocate do not exceedhash_mem nor
757-
* MaxAllocSize.
777+
* the pointer arrays we'll try to allocate do not exceedhash_table_bytes
778+
*norMaxAllocSize.
758779
*
759780
* Note that both nbuckets and nbatch must be powers of 2 to make
760781
* ExecHashGetBucketAndBatch fast.
761782
*/
762-
max_pointers=*space_allowed /sizeof(HashJoinTuple);
783+
max_pointers=hash_table_bytes /sizeof(HashJoinTuple);
763784
max_pointers=Min(max_pointers,MaxAllocSize /sizeof(HashJoinTuple));
764785
/* If max_pointers isn't a power of 2, must round it down to one */
765-
mppow2=1L <<my_log2(max_pointers);
766-
if (max_pointers!=mppow2)
767-
max_pointers=mppow2 /2;
786+
max_pointers=pg_prevpower2_size_t(max_pointers);
768787

769788
/* Also ensure we avoid integer overflow in nbatch and nbuckets */
770789
/* (this step is redundant given the current value of MaxAllocSize) */
771-
max_pointers=Min(max_pointers,INT_MAX /2);
790+
max_pointers=Min(max_pointers,INT_MAX /2+1);
772791

773792
dbuckets=ceil(ntuples /NTUP_PER_BUCKET);
774793
dbuckets=Min(dbuckets,max_pointers);
775794
nbuckets= (int)dbuckets;
776795
/* don't let nbuckets be really small, though ... */
777796
nbuckets=Max(nbuckets,1024);
778797
/* ... and force it to be a power of 2. */
779-
nbuckets=1 <<my_log2(nbuckets);
798+
nbuckets=pg_nextpower2_32(nbuckets);
780799

781800
/*
782801
* If there's not enough space to store the projected number of tuples and
@@ -786,10 +805,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
786805
if (inner_rel_bytes+bucket_bytes>hash_table_bytes)
787806
{
788807
/* We'll need multiple batches */
789-
longlbuckets;
808+
size_tsbuckets;
790809
doubledbatch;
791810
intminbatch;
792-
longbucket_size;
811+
size_tbucket_size;
793812

794813
/*
795814
* If Parallel Hash with combined hash_mem would still need multiple
@@ -813,10 +832,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
813832
* overhead for the hash code, pointer to the next tuple, etc.
814833
*/
815834
bucket_size= (tupsize*NTUP_PER_BUCKET+sizeof(HashJoinTuple));
816-
lbuckets=1L <<my_log2(hash_table_bytes /bucket_size);
817-
lbuckets=Min(lbuckets,max_pointers);
818-
nbuckets= (int)lbuckets;
819-
nbuckets=1 <<my_log2(nbuckets);
835+
sbuckets=pg_nextpower2_size_t(hash_table_bytes /bucket_size);
836+
sbuckets=Min(sbuckets,max_pointers);
837+
nbuckets= (int)sbuckets;
838+
nbuckets=pg_nextpower2_32(nbuckets);
820839
bucket_bytes=nbuckets*sizeof(HashJoinTuple);
821840

822841
/*
@@ -1097,14 +1116,12 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
10971116
/* Figure out how many batches to use. */
10981117
if (hashtable->nbatch==1)
10991118
{
1100-
inthash_mem=get_hash_mem();
1101-
11021119
/*
11031120
* We are going from single-batch to multi-batch. We need
11041121
* to switch from one large combined memory budget to the
11051122
* regular hash_mem budget.
11061123
*/
1107-
pstate->space_allowed=hash_mem*1024L;
1124+
pstate->space_allowed=get_hash_memory_limit();
11081125

11091126
/*
11101127
* The combined hash_mem of all participants wasn't
@@ -1113,7 +1130,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
11131130
* insufficient. So try two batches per participant,
11141131
* rounded up to a power of two.
11151132
*/
1116-
new_nbatch=1 <<my_log2(pstate->nparticipants*2);
1133+
new_nbatch=pg_nextpower2_32(pstate->nparticipants*2);
11171134
}
11181135
else
11191136
{
@@ -1152,7 +1169,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
11521169
MaxAllocSize /sizeof(dsa_pointer_atomic));
11531170
new_nbuckets= (int)dbuckets;
11541171
new_nbuckets=Max(new_nbuckets,1024);
1155-
new_nbuckets=1 <<my_log2(new_nbuckets);
1172+
new_nbuckets=pg_nextpower2_32(new_nbuckets);
11561173
dsa_free(hashtable->area,old_batch0->buckets);
11571174
hashtable->batches[0].shared->buckets=
11581175
dsa_allocate(hashtable->area,
@@ -3372,39 +3389,24 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
33723389
}
33733390

33743391
/*
3375-
* Get a hash_mem value by multiplying the work_mem GUC's value by the
3376-
* hash_mem_multiplier GUC's value.
3392+
* Calculate the limit on how much memory can be used by Hash and similar
3393+
* plan types. This is work_mem times hash_mem_multiplier, and is
3394+
* expressed in bytes.
33773395
*
3378-
* Returns a work_mem style KB value that hash-based nodes (including but not
3379-
* limited to hash join) use in place of work_mem. This is subject to the
3380-
* same restrictions as work_mem itself. (There is no such thing as the
3381-
* hash_mem GUC, but it's convenient for our callers to pretend that there
3382-
* is.)
3383-
*
3384-
* Exported for use by the planner, as well as other hash-based executor
3396+
* Exported for use by the planner, as well as other hash-like executor
33853397
* nodes. This is a rather random place for this, but there is no better
33863398
* place.
33873399
*/
3388-
int
3389-
get_hash_mem(void)
3400+
size_t
3401+
get_hash_memory_limit(void)
33903402
{
3391-
doublehash_mem;
3403+
doublemem_limit;
33923404

3393-
Assert(hash_mem_multiplier >=1.0);
3405+
/* Do initial calculation in double arithmetic */
3406+
mem_limit= (double)work_mem*hash_mem_multiplier*1024.0;
33943407

3395-
hash_mem= (double)work_mem*hash_mem_multiplier;
3396-
3397-
/*
3398-
* guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
3399-
* support the assumption that raw derived byte values can be stored in
3400-
* 'long' variables. The returned hash_mem value must also meet this
3401-
* assumption.
3402-
*
3403-
* We clamp the final value rather than throw an error because it should
3404-
* be possible to set work_mem and hash_mem_multiplier independently.
3405-
*/
3406-
if (hash_mem<MAX_KILOBYTES)
3407-
return (int)hash_mem;
3408+
/* Clamp in case it doesn't fit in size_t */
3409+
mem_limit=Min(mem_limit, (double)SIZE_MAX);
34083410

3409-
returnMAX_KILOBYTES;
3411+
return(size_t)mem_limit;
34103412
}

‎src/backend/executor/nodeMemoize.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -905,7 +905,7 @@ ExecInitMemoize(Memoize *node, EState *estate, int eflags)
905905
mstate->mem_used=0;
906906

907907
/* Limit the total memory consumed by the cache to this */
908-
mstate->mem_limit=get_hash_mem()*1024L;
908+
mstate->mem_limit=get_hash_memory_limit();
909909

910910
/* A memory context dedicated for the cache */
911911
mstate->tableContext=AllocSetContextCreate(CurrentMemoryContext,

‎src/backend/optimizer/path/costsize.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2438,7 +2438,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
24382438
Costtotal_cost;
24392439

24402440
/* available cache space */
2441-
hash_mem_bytes=get_hash_mem()*1024L;
2441+
hash_mem_bytes=get_hash_memory_limit();
24422442

24432443
/*
24442444
* Set the number of bytes each cache entry should consume in the cache.
@@ -3860,7 +3860,6 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
38603860
Costrun_cost=workspace->run_cost;
38613861
intnumbuckets=workspace->numbuckets;
38623862
intnumbatches=workspace->numbatches;
3863-
inthash_mem;
38643863
Costcpu_per_tuple;
38653864
QualCosthash_qual_cost;
38663865
QualCostqp_qual_cost;
@@ -3986,10 +3985,8 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
39863985
* that way, so it will be unable to drive the batch size below hash_mem
39873986
* when this is true.)
39883987
*/
3989-
hash_mem=get_hash_mem();
39903988
if (relation_byte_size(clamp_row_est(inner_path_rows*innermcvfreq),
3991-
inner_path->pathtarget->width)>
3992-
(hash_mem*1024L))
3989+
inner_path->pathtarget->width)>get_hash_memory_limit())
39933990
startup_cost+=disable_cost;
39943991

39953992
/*

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp