Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 908a968

Browse files
committed
Optimize WindowAgg's use of tuplestores
When WindowAgg finished one partition of a PARTITION BY, it previously would call tuplestore_end() to purge all the stored tuples before again calling tuplestore_begin_heap() and carefully setting up all of the tuplestore read pointers exactly as required for the given frameOptions. Since the frameOptions don't change between partitions, this part does not make much sense. For queries that had very few rows per partition, the overhead of this was very large.

It seems much better to create the tuplestore and the read pointers once and simply call tuplestore_clear() at the end of each partition. tuplestore_clear() moves all of the read pointers back to the start position and deletes all the previously stored tuples.

A simple test query with 1 million partitions and 1 tuple per partition has been shown to run around 40% faster than without this change. The additional effort seems to have mostly been spent in malloc/free.

Making this work required adding a new bool field to WindowAggState which had the unfortunate effect of being the 9th bool field in a group resulting in the struct being enlarged. Here we shuffle the fields around a little so that the two bool fields for runcondition relating stuff fit into existing padding. Also, move the "runcondition" field to be near those. This frees up enough space with the other bool fields so that the newly added one fits into the padding bytes. This was done to address a very small but apparent performance regression with queries containing a large number of rows per partition.

Reviewed-by: Ashutosh Bapat <ashutosh.bapat.oss@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@postgresql.org>
Discussion: https://postgr.es/m/CAHoyFK9n-QCXKTUWT_xxtXninSMEv%2BgbJN66-y6prM3f4WkEHw%40mail.gmail.com
1 parent 19b861f, commit 908a968

File tree

2 files changed

+118
-66
lines changed

2 files changed

+118
-66
lines changed

‎src/backend/executor/nodeWindowAgg.c

Lines changed: 106 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -1074,57 +1074,24 @@ eval_windowfunction(WindowAggState *winstate, WindowStatePerFunc perfuncstate,
10741074
}
10751075

10761076
/*
1077-
* begin_partition
1078-
* Start buffering rows of the next partition.
1077+
* prepare_tuplestore
1078+
*Prepare the tuplestore and all of the required read pointers for the
1079+
*WindowAggState's frameOptions.
1080+
*
1081+
* Note: We use pg_noinline to avoid bloating the calling function with code
1082+
* which is only called once.
10791083
*/
1080-
staticvoid
1081-
begin_partition(WindowAggState*winstate)
1084+
staticpg_noinlinevoid
1085+
prepare_tuplestore(WindowAggState*winstate)
10821086
{
10831087
WindowAgg*node= (WindowAgg*)winstate->ss.ps.plan;
1084-
PlanState*outerPlan=outerPlanState(winstate);
10851088
intframeOptions=winstate->frameOptions;
10861089
intnumfuncs=winstate->numfuncs;
1087-
inti;
1088-
1089-
winstate->partition_spooled= false;
1090-
winstate->framehead_valid= false;
1091-
winstate->frametail_valid= false;
1092-
winstate->grouptail_valid= false;
1093-
winstate->spooled_rows=0;
1094-
winstate->currentpos=0;
1095-
winstate->frameheadpos=0;
1096-
winstate->frametailpos=0;
1097-
winstate->currentgroup=0;
1098-
winstate->frameheadgroup=0;
1099-
winstate->frametailgroup=0;
1100-
winstate->groupheadpos=0;
1101-
winstate->grouptailpos=-1;/* see update_grouptailpos */
1102-
ExecClearTuple(winstate->agg_row_slot);
1103-
if (winstate->framehead_slot)
1104-
ExecClearTuple(winstate->framehead_slot);
1105-
if (winstate->frametail_slot)
1106-
ExecClearTuple(winstate->frametail_slot);
1107-
1108-
/*
1109-
* If this is the very first partition, we need to fetch the first input
1110-
* row to store in first_part_slot.
1111-
*/
1112-
if (TupIsNull(winstate->first_part_slot))
1113-
{
1114-
TupleTableSlot*outerslot=ExecProcNode(outerPlan);
11151090

1116-
if (!TupIsNull(outerslot))
1117-
ExecCopySlot(winstate->first_part_slot,outerslot);
1118-
else
1119-
{
1120-
/* outer plan is empty, so we have nothing to do */
1121-
winstate->partition_spooled= true;
1122-
winstate->more_partitions= false;
1123-
return;
1124-
}
1125-
}
1091+
/* we shouldn't be called if this was done already */
1092+
Assert(winstate->buffer==NULL);
11261093

1127-
/* Create new tuplestorefor this partition*/
1094+
/* Create new tuplestore */
11281095
winstate->buffer=tuplestore_begin_heap(false, false,work_mem);
11291096

11301097
/*
@@ -1158,16 +1125,10 @@ begin_partition(WindowAggState *winstate)
11581125

11591126
agg_winobj->readptr=tuplestore_alloc_read_pointer(winstate->buffer,
11601127
readptr_flags);
1161-
agg_winobj->markpos=-1;
1162-
agg_winobj->seekpos=-1;
1163-
1164-
/* Also reset the row counters for aggregates */
1165-
winstate->aggregatedbase=0;
1166-
winstate->aggregatedupto=0;
11671128
}
11681129

11691130
/* create mark and read pointers for each real window function */
1170-
for (i=0;i<numfuncs;i++)
1131+
for (inti=0;i<numfuncs;i++)
11711132
{
11721133
WindowStatePerFuncperfuncstate=&(winstate->perfunc[i]);
11731134

@@ -1179,8 +1140,6 @@ begin_partition(WindowAggState *winstate)
11791140
0);
11801141
winobj->readptr=tuplestore_alloc_read_pointer(winstate->buffer,
11811142
EXEC_FLAG_BACKWARD);
1182-
winobj->markpos=-1;
1183-
winobj->seekpos=-1;
11841143
}
11851144
}
11861145

@@ -1224,6 +1183,88 @@ begin_partition(WindowAggState *winstate)
12241183
winstate->grouptail_ptr=
12251184
tuplestore_alloc_read_pointer(winstate->buffer,0);
12261185
}
1186+
}
1187+
1188+
/*
1189+
* begin_partition
1190+
* Start buffering rows of the next partition.
1191+
*/
1192+
staticvoid
1193+
begin_partition(WindowAggState*winstate)
1194+
{
1195+
PlanState*outerPlan=outerPlanState(winstate);
1196+
intnumfuncs=winstate->numfuncs;
1197+
1198+
winstate->partition_spooled= false;
1199+
winstate->framehead_valid= false;
1200+
winstate->frametail_valid= false;
1201+
winstate->grouptail_valid= false;
1202+
winstate->spooled_rows=0;
1203+
winstate->currentpos=0;
1204+
winstate->frameheadpos=0;
1205+
winstate->frametailpos=0;
1206+
winstate->currentgroup=0;
1207+
winstate->frameheadgroup=0;
1208+
winstate->frametailgroup=0;
1209+
winstate->groupheadpos=0;
1210+
winstate->grouptailpos=-1;/* see update_grouptailpos */
1211+
ExecClearTuple(winstate->agg_row_slot);
1212+
if (winstate->framehead_slot)
1213+
ExecClearTuple(winstate->framehead_slot);
1214+
if (winstate->frametail_slot)
1215+
ExecClearTuple(winstate->frametail_slot);
1216+
1217+
/*
1218+
* If this is the very first partition, we need to fetch the first input
1219+
* row to store in first_part_slot.
1220+
*/
1221+
if (TupIsNull(winstate->first_part_slot))
1222+
{
1223+
TupleTableSlot*outerslot=ExecProcNode(outerPlan);
1224+
1225+
if (!TupIsNull(outerslot))
1226+
ExecCopySlot(winstate->first_part_slot,outerslot);
1227+
else
1228+
{
1229+
/* outer plan is empty, so we have nothing to do */
1230+
winstate->partition_spooled= true;
1231+
winstate->more_partitions= false;
1232+
return;
1233+
}
1234+
}
1235+
1236+
/* Create new tuplestore if not done already. */
1237+
if (unlikely(winstate->buffer==NULL))
1238+
prepare_tuplestore(winstate);
1239+
1240+
winstate->next_partition= false;
1241+
1242+
if (winstate->numaggs>0)
1243+
{
1244+
WindowObjectagg_winobj=winstate->agg_winobj;
1245+
1246+
/* reset mark and see positions for aggregate functions */
1247+
agg_winobj->markpos=-1;
1248+
agg_winobj->seekpos=-1;
1249+
1250+
/* Also reset the row counters for aggregates */
1251+
winstate->aggregatedbase=0;
1252+
winstate->aggregatedupto=0;
1253+
}
1254+
1255+
/* reset mark and seek positions for each real window function */
1256+
for (inti=0;i<numfuncs;i++)
1257+
{
1258+
WindowStatePerFuncperfuncstate=&(winstate->perfunc[i]);
1259+
1260+
if (!perfuncstate->plain_agg)
1261+
{
1262+
WindowObjectwinobj=perfuncstate->winobj;
1263+
1264+
winobj->markpos=-1;
1265+
winobj->seekpos=-1;
1266+
}
1267+
}
12271268

12281269
/*
12291270
* Store the first tuple into the tuplestore (it's always available now;
@@ -1360,9 +1401,9 @@ release_partition(WindowAggState *winstate)
13601401
}
13611402

13621403
if (winstate->buffer)
1363-
tuplestore_end(winstate->buffer);
1364-
winstate->buffer=NULL;
1404+
tuplestore_clear(winstate->buffer);
13651405
winstate->partition_spooled= false;
1406+
winstate->next_partition= true;
13661407
}
13671408

13681409
/*
@@ -2143,7 +2184,7 @@ ExecWindowAgg(PlanState *pstate)
21432184
/* We need to loop as the runCondition or qual may filter out tuples */
21442185
for (;;)
21452186
{
2146-
if (winstate->buffer==NULL)
2187+
if (winstate->next_partition)
21472188
{
21482189
/* Initialize for first partition and set current row = 0 */
21492190
begin_partition(winstate);
@@ -2686,6 +2727,7 @@ ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags)
26862727
winstate->all_first= true;
26872728
winstate->partition_spooled= false;
26882729
winstate->more_partitions= false;
2730+
winstate->next_partition= true;
26892731

26902732
returnwinstate;
26912733
}
@@ -2700,6 +2742,14 @@ ExecEndWindowAgg(WindowAggState *node)
27002742
PlanState*outerPlan;
27012743
inti;
27022744

2745+
if (node->buffer!=NULL)
2746+
{
2747+
tuplestore_end(node->buffer);
2748+
2749+
/* nullify so that release_partition skips the tuplestore_clear() */
2750+
node->buffer=NULL;
2751+
}
2752+
27032753
release_partition(node);
27042754

27052755
for (i=0;i<node->numaggs;i++)

‎src/include/nodes/execnodes.h

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2619,6 +2619,17 @@ typedef struct WindowAggState
26192619
boolinRangeAsc;/* use ASC sort order for in_range tests? */
26202620
boolinRangeNullsFirst;/* nulls sort first for in_range tests? */
26212621

2622+
/* fields relating to runconditions */
2623+
booluse_pass_through;/* When false, stop execution when
2624+
* runcondition is no longer true. Else
2625+
* just stop evaluating window funcs. */
2626+
booltop_window;/* true if this is the top-most WindowAgg or
2627+
* the only WindowAgg in this query level */
2628+
ExprState*runcondition;/* Condition which must remain true otherwise
2629+
* execution of the WindowAgg will finish or
2630+
* go into pass-through mode. NULL when there
2631+
* is no such condition. */
2632+
26222633
/* these fields are used in GROUPS mode: */
26232634
int64currentgroup;/* peer group # of current row in partition */
26242635
int64frameheadgroup;/* peer group # of frame head row */
@@ -2631,19 +2642,10 @@ typedef struct WindowAggState
26312642
MemoryContextcuraggcontext;/* current aggregate's working data */
26322643
ExprContext*tmpcontext;/* short-term evaluation context */
26332644

2634-
ExprState*runcondition;/* Condition which must remain true otherwise
2635-
* execution of the WindowAgg will finish or
2636-
* go into pass-through mode. NULL when there
2637-
* is no such condition. */
2638-
2639-
booluse_pass_through;/* When false, stop execution when
2640-
* runcondition is no longer true. Else
2641-
* just stop evaluating window funcs. */
2642-
booltop_window;/* true if this is the top-most WindowAgg or
2643-
* the only WindowAgg in this query level */
26442645
boolall_first;/* true if the scan is starting */
26452646
boolpartition_spooled;/* true if all tuples in current partition
26462647
* have been spooled into tuplestore */
2648+
boolnext_partition;/* true if begin_partition needs to be called */
26472649
boolmore_partitions;/* true if there's more partitions after
26482650
* this one */
26492651
boolframehead_valid;/* true if frameheadpos is known up to

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp