Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitc8ea87e

Browse files
committed
Avoid quadratic slowdown in regexp match/split functions.
regexp_matches, regexp_split_to_table and regexp_split_to_array allwork by compiling a list of match positions as character offsets (NOTbyte positions) in the source string.Formerly, they then used text_substr to extract the matched text; butin a multi-byte encoding, that counts the characters in the string,and the characters needed to reach the starting byte position, onevery call. Accordingly, the performance degraded as the product ofthe input string length and the number of match positions, such thatsplitting a string of a few hundred kbytes could take many minutes.Repair by keeping the wide-character copy of the input stringavailable (only in the case where encoding_max_length is not 1) afterperforming the match operation, and extracting substrings from thatinstead. This reduces the complexity to being linear in the number ofresult bytes, discounting the actual regexp match itself (which is notaffected by this patch).In passing, remove cleanup using retail pfree() which was obsoleted bycommitff428cd (Feb 2008) which made cleanup of SRF multi-callcontexts automatic. Also increase (to ~134 million) the maximum numberof matches and provide an error message when it is reached.Backpatch all the way because this has been wrong forever.Analysis and patch by me; review by Kaiting Chen.Discussion:https://postgr.es/m/87pnyn55qh.fsf@news-spur.riddles.org.uksee alsohttps://postgr.es/m/87lg996g4r.fsf@news-spur.riddles.org.uk
1 parent3e2ceb2 commitc8ea87e

File tree

1 file changed

+129
-53
lines changed

1 file changed

+129
-53
lines changed

‎src/backend/utils/adt/regexp.c

Lines changed: 129 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include"regex/regex.h"
3636
#include"utils/array.h"
3737
#include"utils/builtins.h"
38+
#include"utils/memutils.h"
3839
#include"utils/varlena.h"
3940

4041
#definePG_GETARG_TEXT_PP_IF_EXISTS(_n) \
@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
6162
/* workspace for build_regexp_match_result() */
6263
Datum*elems;/* has npatterns elements */
6364
bool*nulls;/* has npatterns elements */
65+
pg_wchar*wide_str;/* wide-char version of original string */
66+
char*conv_buf;/* conversion buffer */
67+
intconv_bufsiz;/* size thereof */
6468
}regexp_matches_ctx;
6569

6670
/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111115
pg_re_flags*flags,
112116
Oidcollation,
113117
booluse_subpatterns,
114-
boolignore_degenerate);
115-
staticvoidcleanup_regexp_matches(regexp_matches_ctx*matchctx);
118+
boolignore_degenerate,
119+
boolfetching_unmatched);
116120
staticArrayType*build_regexp_match_result(regexp_matches_ctx*matchctx);
117121
staticDatumbuild_regexp_split_result(regexp_matches_ctx*splitctx);
118122

@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
863867
errhint("Use the regexp_matches function instead.")));
864868

865869
matchctx=setup_regexp_matches(orig_str,pattern,&re_flags,
866-
PG_GET_COLLATION(), true, false);
870+
PG_GET_COLLATION(), true, false, false);
867871

868872
if (matchctx->nmatches==0)
869873
PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
911915
matchctx=setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0),pattern,
912916
&re_flags,
913917
PG_GET_COLLATION(),
914-
true, false);
918+
true, false, false);
915919

916920
/* Pre-create workspace that build_regexp_match_result needs */
917921
matchctx->elems= (Datum*)palloc(sizeof(Datum)*matchctx->npatterns);
@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
933937
SRF_RETURN_NEXT(funcctx,PointerGetDatum(result_ary));
934938
}
935939

936-
/* release space in multi-call ctx to avoid intraquery memory leak */
937-
cleanup_regexp_matches(matchctx);
938-
939940
SRF_RETURN_DONE(funcctx);
940941
}
941942

@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
954955
* all the matching in one swoop. The returned regexp_matches_ctx contains
955956
* the locations of all the substrings matching the pattern.
956957
*
957-
* Thetwo bool parameters have only two patterns (one for matching, one for
958+
* Thethree bool parameters have only two patterns (one for matching, one for
958959
* splitting) but it seems clearer to distinguish the functionality this way
959-
* than to key it all off one "is_split" flag.
960+
* than to key it all off one "is_split" flag. We don't currently assume that
961+
* fetching_unmatched is exclusive of fetching the matched text too; if it's
962+
* set, the conversion buffer is large enough to fetch any single matched or
963+
* unmatched string, but not any larger substring. (In practice, when splitting
964+
* the matches are usually small anyway, and it didn't seem worth complicating
965+
* the code further.)
960966
*/
961967
staticregexp_matches_ctx*
962968
setup_regexp_matches(text*orig_str,text*pattern,pg_re_flags*re_flags,
963969
Oidcollation,
964970
booluse_subpatterns,
965-
boolignore_degenerate)
971+
boolignore_degenerate,
972+
boolfetching_unmatched)
966973
{
967974
regexp_matches_ctx*matchctx=palloc0(sizeof(regexp_matches_ctx));
975+
inteml=pg_database_encoding_max_length();
968976
intorig_len;
969977
pg_wchar*wide_str;
970978
intwide_len;
@@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
975983
intarray_idx;
976984
intprev_match_end;
977985
intstart_search;
986+
intmaxlen=0;/* largest fetch length in characters */
978987

979988
/* save original string --- we'll extract result substrings from it */
980989
matchctx->orig_str=orig_str;
@@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10031012
/* temporary output space for RE package */
10041013
pmatch=palloc(sizeof(regmatch_t)*pmatch_len);
10051014

1006-
/* the real output space (grown dynamically if needed) */
1007-
array_len=re_flags->glob ?256 :32;
1015+
/*
1016+
* the real output space (grown dynamically if needed)
1017+
*
1018+
* use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
1019+
* than at 2^27
1020+
*/
1021+
array_len=re_flags->glob ?255 :31;
10081022
matchctx->match_locs= (int*)palloc(sizeof(int)*array_len);
10091023
array_idx=0;
10101024

@@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10241038
pmatch[0].rm_eo>prev_match_end))
10251039
{
10261040
/* enlarge output space if needed */
1027-
while (array_idx+matchctx->npatterns*2>array_len)
1041+
while (array_idx+matchctx->npatterns*2+1>array_len)
10281042
{
1029-
array_len *=2;
1043+
array_len+=array_len+1;/* 2^n-1 => 2^(n+1)-1 */
1044+
if (array_len>MaxAllocSize/sizeof(int))
1045+
ereport(ERROR,
1046+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1047+
errmsg("too many regular expression matches")));
10301048
matchctx->match_locs= (int*)repalloc(matchctx->match_locs,
10311049
sizeof(int)*array_len);
10321050
}
@@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10381056

10391057
for (i=1;i <=matchctx->npatterns;i++)
10401058
{
1041-
matchctx->match_locs[array_idx++]=pmatch[i].rm_so;
1042-
matchctx->match_locs[array_idx++]=pmatch[i].rm_eo;
1059+
intso=pmatch[i].rm_so;
1060+
inteo=pmatch[i].rm_eo;
1061+
matchctx->match_locs[array_idx++]=so;
1062+
matchctx->match_locs[array_idx++]=eo;
1063+
if (so >=0&&eo >=0&& (eo-so)>maxlen)
1064+
maxlen= (eo-so);
10431065
}
10441066
}
10451067
else
10461068
{
1047-
matchctx->match_locs[array_idx++]=pmatch[0].rm_so;
1048-
matchctx->match_locs[array_idx++]=pmatch[0].rm_eo;
1069+
intso=pmatch[0].rm_so;
1070+
inteo=pmatch[0].rm_eo;
1071+
matchctx->match_locs[array_idx++]=so;
1072+
matchctx->match_locs[array_idx++]=eo;
1073+
if (so >=0&&eo >=0&& (eo-so)>maxlen)
1074+
maxlen= (eo-so);
10491075
}
10501076
matchctx->nmatches++;
1077+
1078+
/*
1079+
* check length of unmatched portion between end of previous match
1080+
* and start of current one
1081+
*/
1082+
if (fetching_unmatched&&
1083+
pmatch[0].rm_so >=0&&
1084+
(pmatch[0].rm_so-prev_match_end)>maxlen)
1085+
maxlen= (pmatch[0].rm_so-prev_match_end);
10511086
}
10521087
prev_match_end=pmatch[0].rm_eo;
10531088

@@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10681103
break;
10691104
}
10701105

1106+
/*
1107+
* check length of unmatched portion between end of last match and end of
1108+
* input string
1109+
*/
1110+
if (fetching_unmatched&&
1111+
(wide_len-prev_match_end)>maxlen)
1112+
maxlen= (wide_len-prev_match_end);
1113+
1114+
/*
1115+
* Keep a note of the end position of the string for the benefit of
1116+
* splitting code.
1117+
*/
1118+
matchctx->match_locs[array_idx]=wide_len;
1119+
1120+
if (eml>1)
1121+
{
1122+
int64maxsiz=eml* (int64)maxlen;
1123+
intconv_bufsiz;
1124+
1125+
/*
1126+
* Make the conversion buffer large enough for any substring of
1127+
* interest.
1128+
*
1129+
* Worst case: assume we need the maximum size (maxlen*eml), but take
1130+
* advantage of the fact that the original string length in bytes is an
1131+
* upper bound on the byte length of any fetched substring (and we know
1132+
* that len+1 is safe to allocate because the varlena header is longer
1133+
* than 1 byte).
1134+
*/
1135+
if (maxsiz>orig_len)
1136+
conv_bufsiz=orig_len+1;
1137+
else
1138+
conv_bufsiz=maxsiz+1;/* safe since maxsiz < 2^30 */
1139+
1140+
matchctx->conv_buf=palloc(conv_bufsiz);
1141+
matchctx->conv_bufsiz=conv_bufsiz;
1142+
matchctx->wide_str=wide_str;
1143+
}
1144+
else
1145+
{
1146+
/* No need to keep the wide string if we're in a single-byte charset. */
1147+
pfree(wide_str);
1148+
matchctx->wide_str=NULL;
1149+
matchctx->conv_buf=NULL;
1150+
matchctx->conv_bufsiz=0;
1151+
}
1152+
10711153
/* Clean up temp storage */
1072-
pfree(wide_str);
10731154
pfree(pmatch);
10741155

10751156
returnmatchctx;
10761157
}
10771158

1078-
/*
1079-
* cleanup_regexp_matches - release memory of a regexp_matches_ctx
1080-
*/
1081-
staticvoid
1082-
cleanup_regexp_matches(regexp_matches_ctx*matchctx)
1083-
{
1084-
pfree(matchctx->orig_str);
1085-
pfree(matchctx->match_locs);
1086-
if (matchctx->elems)
1087-
pfree(matchctx->elems);
1088-
if (matchctx->nulls)
1089-
pfree(matchctx->nulls);
1090-
pfree(matchctx);
1091-
}
1092-
10931159
/*
10941160
* build_regexp_match_result - build output array for current match
10951161
*/
10961162
staticArrayType*
10971163
build_regexp_match_result(regexp_matches_ctx*matchctx)
10981164
{
1165+
char*buf=matchctx->conv_buf;
1166+
intbufsizPG_USED_FOR_ASSERTS_ONLY=matchctx->conv_bufsiz;
10991167
Datum*elems=matchctx->elems;
11001168
bool*nulls=matchctx->nulls;
11011169
intdims[1];
@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
11151183
elems[i]= (Datum)0;
11161184
nulls[i]= true;
11171185
}
1186+
elseif (buf)
1187+
{
1188+
intlen=pg_wchar2mb_with_len(matchctx->wide_str+so,
1189+
buf,
1190+
eo-so);
1191+
Assert(len<bufsiz);
1192+
elems[i]=PointerGetDatum(cstring_to_text_with_len(buf,len));
1193+
nulls[i]= false;
1194+
}
11181195
else
11191196
{
11201197
elems[i]=DirectFunctionCall3(text_substr,
@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11681245
splitctx=setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0),pattern,
11691246
&re_flags,
11701247
PG_GET_COLLATION(),
1171-
false, true);
1248+
false, true, true);
11721249

11731250
MemoryContextSwitchTo(oldcontext);
11741251
funcctx->user_fctx= (void*)splitctx;
@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11851262
SRF_RETURN_NEXT(funcctx,result);
11861263
}
11871264

1188-
/* release space in multi-call ctx to avoid intraquery memory leak */
1189-
cleanup_regexp_matches(splitctx);
1190-
11911265
SRF_RETURN_DONE(funcctx);
11921266
}
11931267

@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12241298
PG_GETARG_TEXT_PP(1),
12251299
&re_flags,
12261300
PG_GET_COLLATION(),
1227-
false, true);
1301+
false, true, true);
12281302

12291303
while (splitctx->next_match <=splitctx->nmatches)
12301304
{
@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12361310
splitctx->next_match++;
12371311
}
12381312

1239-
/*
1240-
* We don't call cleanup_regexp_matches here; it would try to pfree the
1241-
* input string, which we didn't copy. The space is not in a long-lived
1242-
* memory context anyway.
1243-
*/
1244-
12451313
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,CurrentMemoryContext));
12461314
}
12471315

@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
12611329
staticDatum
12621330
build_regexp_split_result(regexp_matches_ctx*splitctx)
12631331
{
1332+
char*buf=splitctx->conv_buf;
12641333
intstartpos;
12651334
intendpos;
12661335

@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
12711340
if (startpos<0)
12721341
elog(ERROR,"invalid match ending position");
12731342

1274-
if (splitctx->next_match<splitctx->nmatches)
1343+
if (buf)
12751344
{
1345+
intbufsizPG_USED_FOR_ASSERTS_ONLY=splitctx->conv_bufsiz;
1346+
intlen;
1347+
12761348
endpos=splitctx->match_locs[splitctx->next_match*2];
12771349
if (endpos<startpos)
12781350
elog(ERROR,"invalid match starting position");
1279-
returnDirectFunctionCall3(text_substr,
1280-
PointerGetDatum(splitctx->orig_str),
1281-
Int32GetDatum(startpos+1),
1282-
Int32GetDatum(endpos-startpos));
1351+
len=pg_wchar2mb_with_len(splitctx->wide_str+startpos,
1352+
buf,
1353+
endpos-startpos);
1354+
Assert(len<bufsiz);
1355+
returnPointerGetDatum(cstring_to_text_with_len(buf,len));
12831356
}
12841357
else
12851358
{
1286-
/* no more matches, return rest of string */
1287-
returnDirectFunctionCall2(text_substr_no_len,
1359+
endpos=splitctx->match_locs[splitctx->next_match*2];
1360+
if (endpos<startpos)
1361+
elog(ERROR,"invalid match starting position");
1362+
returnDirectFunctionCall3(text_substr,
12881363
PointerGetDatum(splitctx->orig_str),
1289-
Int32GetDatum(startpos+1));
1364+
Int32GetDatum(startpos+1),
1365+
Int32GetDatum(endpos-startpos));
12901366
}
12911367
}
12921368

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp