NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit c8ea87e

committed

Avoid quadratic slowdown in regexp match/split functions.

regexp_matches, regexp_split_to_table and regexp_split_to_array all work by compiling a list of match positions as character offsets (NOT byte positions) in the source string. Formerly, they then used text_substr to extract the matched text; but in a multi-byte encoding, that counts the characters in the string, and the characters needed to reach the starting byte position, on every call. Accordingly, the performance degraded as the product of the input string length and the number of match positions, such that splitting a string of a few hundred kbytes could take many minutes. Repair by keeping the wide-character copy of the input string available (only in the case where encoding_max_length is not 1) after performing the match operation, and extracting substrings from that instead. This reduces the complexity to being linear in the number of result bytes, discounting the actual regexp match itself (which is not affected by this patch). In passing, remove cleanup using retail pfree() which was obsoleted by commit ff428cd (Feb 2008) which made cleanup of SRF multi-call contexts automatic. Also increase (to ~134 million) the maximum number of matches and provide an error message when it is reached. Backpatch all the way because this has been wrong forever. Analysis and patch by me; review by Kaiting Chen. Discussion: https://postgr.es/m/87pnyn55qh.fsf@news-spur.riddles.org.uk see also https://postgr.es/m/87lg996g4r.fsf@news-spur.riddles.org.uk

1 parent 3e2ceb2, commit c8ea87e. Copy full SHA for c8ea87e

File tree

1 file changed

+129

-53

lines changed

src/backend/utils/adt
- regexp.c

1 file changed

+129

-53

lines changed

`‎src/backend/utils/adt/regexp.c‎`

Lines changed: 129 additions & 53 deletions

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@`
`35`	`35`	`#include"regex/regex.h"`
`36`	`36`	`#include"utils/array.h"`
`37`	`37`	`#include"utils/builtins.h"`
	`38`	`+#include"utils/memutils.h"`
`38`	`39`	`#include"utils/varlena.h"`
`39`	`40`
`40`	`41`	`#definePG_GETARG_TEXT_PP_IF_EXISTS(_n) \`
`@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx`
`61`	`62`	`/* workspace for build_regexp_match_result() */`
`62`	`63`	`Datumelems;/ has npatterns elements */`
`63`	`64`	`boolnulls;/ has npatterns elements */`
	`65`	`+pg_wcharwide_str;/ wide-char version of original string */`
	`66`	`+charconv_buf;/ conversion buffer */`
	`67`	`+intconv_bufsiz;/* size thereof */`
`64`	`68`	`}regexp_matches_ctx;`
`65`	`69`
`66`	`70`	`/*`
`@@ -111,8 +115,8 @@ static regexp_matches_ctx setup_regexp_matches(text orig_str, text *pattern,`
`111`	`115`	`pg_re_flags*flags,`
`112`	`116`	`Oidcollation,`
`113`	`117`	`booluse_subpatterns,`
`114`		`-boolignore_degenerate);`
`115`		`-staticvoidcleanup_regexp_matches(regexp_matches_ctx*matchctx);`
	`118`	`+boolignore_degenerate,`
	`119`	`+boolfetching_unmatched);`
`116`	`120`	`staticArrayTypebuild_regexp_match_result(regexp_matches_ctxmatchctx);`
`117`	`121`	`staticDatumbuild_regexp_split_result(regexp_matches_ctx*splitctx);`
`118`	`122`
`@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)`
`863`	`867`	`errhint("Use the regexp_matches function instead.")));`
`864`	`868`
`865`	`869`	`matchctx=setup_regexp_matches(orig_str,pattern,&re_flags,`
`866`		`-PG_GET_COLLATION(), true, false);`
	`870`	`+PG_GET_COLLATION(), true, false, false);`
`867`	`871`
`868`	`872`	`if (matchctx->nmatches==0)`
`869`	`873`	`PG_RETURN_NULL();`
`@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)`
`911`	`915`	`matchctx=setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0),pattern,`
`912`	`916`	`&re_flags,`
`913`	`917`	`PG_GET_COLLATION(),`
`914`		`-true, false);`
	`918`	`+true, false, false);`
`915`	`919`
`916`	`920`	`/* Pre-create workspace that build_regexp_match_result needs */`
`917`	`921`	`matchctx->elems= (Datum)palloc(sizeof(Datum)matchctx->npatterns);`
`@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)`
`933`	`937`	`SRF_RETURN_NEXT(funcctx,PointerGetDatum(result_ary));`
`934`	`938`	`}`
`935`	`939`
`936`		`-/* release space in multi-call ctx to avoid intraquery memory leak */`
`937`		`-cleanup_regexp_matches(matchctx);`
`938`		`-`
`939`	`940`	`SRF_RETURN_DONE(funcctx);`
`940`	`941`	`}`
`941`	`942`
`@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)`
`954`	`955`	`* all the matching in one swoop. The returned regexp_matches_ctx contains`
`955`	`956`	`* the locations of all the substrings matching the pattern.`
`956`	`957`	`*`
`957`		`- * Thetwo bool parameters have only two patterns (one for matching, one for`
	`958`	`+ * Thethree bool parameters have only two patterns (one for matching, one for`
`958`	`959`	`* splitting) but it seems clearer to distinguish the functionality this way`
`959`		`- * than to key it all off one "is_split" flag.`
	`960`	`+ * than to key it all off one "is_split" flag. We don't currently assume that`
	`961`	`+ * fetching_unmatched is exclusive of fetching the matched text too; if it's`
	`962`	`+ * set, the conversion buffer is large enough to fetch any single matched or`
	`963`	`+ * unmatched string, but not any larger substring. (In practice, when splitting`
	`964`	`+ * the matches are usually small anyway, and it didn't seem worth complicating`
	`965`	`+ * the code further.)`
`960`	`966`	`*/`
`961`	`967`	`staticregexp_matches_ctx*`
`962`	`968`	`setup_regexp_matches(textorig_str,textpattern,pg_re_flags*re_flags,`
`963`	`969`	`Oidcollation,`
`964`	`970`	`booluse_subpatterns,`
`965`		`-boolignore_degenerate)`
	`971`	`+boolignore_degenerate,`
	`972`	`+boolfetching_unmatched)`
`966`	`973`	`{`
`967`	`974`	`regexp_matches_ctx*matchctx=palloc0(sizeof(regexp_matches_ctx));`
	`975`	`+inteml=pg_database_encoding_max_length();`
`968`	`976`	`intorig_len;`
`969`	`977`	`pg_wchar*wide_str;`
`970`	`978`	`intwide_len;`
`@@ -975,6 +983,7 @@ setup_regexp_matches(text orig_str, text pattern, pg_re_flags *re_flags,`
`975`	`983`	`intarray_idx;`
`976`	`984`	`intprev_match_end;`
`977`	`985`	`intstart_search;`
	`986`	`+intmaxlen=0;/* largest fetch length in characters */`
`978`	`987`
`979`	`988`	`/* save original string --- we'll extract result substrings from it */`
`980`	`989`	`matchctx->orig_str=orig_str;`
`@@ -1003,8 +1012,13 @@ setup_regexp_matches(text orig_str, text pattern, pg_re_flags *re_flags,`
`1003`	`1012`	`/* temporary output space for RE package */`
`1004`	`1013`	`pmatch=palloc(sizeof(regmatch_t)*pmatch_len);`
`1005`	`1014`
`1006`		`-/* the real output space (grown dynamically if needed) */`
`1007`		`-array_len=re_flags->glob ?256 :32;`
	`1015`	`+/*`
	`1016`	`+ * the real output space (grown dynamically if needed)`
	`1017`	`+ *`
	`1018`	`+ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather`
	`1019`	`+ * than at 2^27`
	`1020`	`+ */`
	`1021`	`+array_len=re_flags->glob ?255 :31;`
`1008`	`1022`	`matchctx->match_locs= (int)palloc(sizeof(int)array_len);`
`1009`	`1023`	`array_idx=0;`
`1010`	`1024`
`@@ -1024,9 +1038,13 @@ setup_regexp_matches(text orig_str, text pattern, pg_re_flags *re_flags,`
`1024`	`1038`	`pmatch[0].rm_eo>prev_match_end))`
`1025`	`1039`	`{`
`1026`	`1040`	`/* enlarge output space if needed */`
`1027`		`-while (array_idx+matchctx->npatterns*2>array_len)`
	`1041`	`+while (array_idx+matchctx->npatterns*2+1>array_len)`
`1028`	`1042`	`{`
`1029`		`-array_len *=2;`
	`1043`	`+array_len+=array_len+1;/* 2^n-1 => 2^(n+1)-1 */`
	`1044`	`+if (array_len>MaxAllocSize/sizeof(int))`
	`1045`	`+ereport(ERROR,`
	`1046`	`+(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),`
	`1047`	`+errmsg("too many regular expression matches")));`
`1030`	`1048`	`matchctx->match_locs= (int*)repalloc(matchctx->match_locs,`
`1031`	`1049`	`sizeof(int)*array_len);`
`1032`	`1050`	`}`
`@@ -1038,16 +1056,33 @@ setup_regexp_matches(text orig_str, text pattern, pg_re_flags *re_flags,`
`1038`	`1056`
`1039`	`1057`	`for (i=1;i <=matchctx->npatterns;i++)`
`1040`	`1058`	`{`
`1041`		`-matchctx->match_locs[array_idx++]=pmatch[i].rm_so;`
`1042`		`-matchctx->match_locs[array_idx++]=pmatch[i].rm_eo;`
	`1059`	`+intso=pmatch[i].rm_so;`
	`1060`	`+inteo=pmatch[i].rm_eo;`
	`1061`	`+matchctx->match_locs[array_idx++]=so;`
	`1062`	`+matchctx->match_locs[array_idx++]=eo;`
	`1063`	`+if (so >=0&&eo >=0&& (eo-so)>maxlen)`
	`1064`	`+maxlen= (eo-so);`
`1043`	`1065`	`}`
`1044`	`1066`	`}`
`1045`	`1067`	`else`
`1046`	`1068`	`{`
`1047`		`-matchctx->match_locs[array_idx++]=pmatch[0].rm_so;`
`1048`		`-matchctx->match_locs[array_idx++]=pmatch[0].rm_eo;`
	`1069`	`+intso=pmatch[0].rm_so;`
	`1070`	`+inteo=pmatch[0].rm_eo;`
	`1071`	`+matchctx->match_locs[array_idx++]=so;`
	`1072`	`+matchctx->match_locs[array_idx++]=eo;`
	`1073`	`+if (so >=0&&eo >=0&& (eo-so)>maxlen)`
	`1074`	`+maxlen= (eo-so);`
`1049`	`1075`	`}`
`1050`	`1076`	`matchctx->nmatches++;`
	`1077`	`+`
	`1078`	`+/*`
	`1079`	`+ * check length of unmatched portion between end of previous match`
	`1080`	`+ * and start of current one`
	`1081`	`+ */`
	`1082`	`+if (fetching_unmatched&&`
	`1083`	`+pmatch[0].rm_so >=0&&`
	`1084`	`+(pmatch[0].rm_so-prev_match_end)>maxlen)`
	`1085`	`+maxlen= (pmatch[0].rm_so-prev_match_end);`
`1051`	`1086`	`}`
`1052`	`1087`	`prev_match_end=pmatch[0].rm_eo;`
`1053`	`1088`
`@@ -1068,34 +1103,67 @@ setup_regexp_matches(text orig_str, text pattern, pg_re_flags *re_flags,`
`1068`	`1103`	`break;`
`1069`	`1104`	`}`
`1070`	`1105`
	`1106`	`+/*`
	`1107`	`+ * check length of unmatched portion between end of last match and end of`
	`1108`	`+ * input string`
	`1109`	`+ */`
	`1110`	`+if (fetching_unmatched&&`
	`1111`	`+(wide_len-prev_match_end)>maxlen)`
	`1112`	`+maxlen= (wide_len-prev_match_end);`
	`1113`	`+`
	`1114`	`+/*`
	`1115`	`+ * Keep a note of the end position of the string for the benefit of`
	`1116`	`+ * splitting code.`
	`1117`	`+ */`
	`1118`	`+matchctx->match_locs[array_idx]=wide_len;`
	`1119`	`+`
	`1120`	`+if (eml>1)`
	`1121`	`+{`
	`1122`	`+int64maxsiz=eml* (int64)maxlen;`
	`1123`	`+intconv_bufsiz;`
	`1124`	`+`
	`1125`	`+/*`
	`1126`	`+ * Make the conversion buffer large enough for any substring of`
	`1127`	`+ * interest.`
	`1128`	`+ *`
	`1129`	`+ * Worst case: assume we need the maximum size (maxlen*eml), but take`
	`1130`	`+ * advantage of the fact that the original string length in bytes is an`
	`1131`	`+ * upper bound on the byte length of any fetched substring (and we know`
	`1132`	`+ * that len+1 is safe to allocate because the varlena header is longer`
	`1133`	`+ * than 1 byte).`
	`1134`	`+ */`
	`1135`	`+if (maxsiz>orig_len)`
	`1136`	`+conv_bufsiz=orig_len+1;`
	`1137`	`+else`
	`1138`	`+conv_bufsiz=maxsiz+1;/* safe since maxsiz < 2^30 */`
	`1139`	`+`
	`1140`	`+matchctx->conv_buf=palloc(conv_bufsiz);`
	`1141`	`+matchctx->conv_bufsiz=conv_bufsiz;`
	`1142`	`+matchctx->wide_str=wide_str;`
	`1143`	`+}`
	`1144`	`+else`
	`1145`	`+{`
	`1146`	`+/* No need to keep the wide string if we're in a single-byte charset. */`
	`1147`	`+pfree(wide_str);`
	`1148`	`+matchctx->wide_str=NULL;`
	`1149`	`+matchctx->conv_buf=NULL;`
	`1150`	`+matchctx->conv_bufsiz=0;`
	`1151`	`+}`
	`1152`	`+`
`1071`	`1153`	`/* Clean up temp storage */`
`1072`		`-pfree(wide_str);`
`1073`	`1154`	`pfree(pmatch);`
`1074`	`1155`
`1075`	`1156`	`returnmatchctx;`
`1076`	`1157`	`}`
`1077`	`1158`
`1078`		`-/*`
`1079`		`- * cleanup_regexp_matches - release memory of a regexp_matches_ctx`
`1080`		`- */`
`1081`		`-staticvoid`
`1082`		`-cleanup_regexp_matches(regexp_matches_ctx*matchctx)`
`1083`		`-{`
`1084`		`-pfree(matchctx->orig_str);`
`1085`		`-pfree(matchctx->match_locs);`
`1086`		`-if (matchctx->elems)`
`1087`		`-pfree(matchctx->elems);`
`1088`		`-if (matchctx->nulls)`
`1089`		`-pfree(matchctx->nulls);`
`1090`		`-pfree(matchctx);`
`1091`		`-}`
`1092`		`-`
`1093`	`1159`	`/*`
`1094`	`1160`	`* build_regexp_match_result - build output array for current match`
`1095`	`1161`	`*/`
`1096`	`1162`	`staticArrayType*`
`1097`	`1163`	`build_regexp_match_result(regexp_matches_ctx*matchctx)`
`1098`	`1164`	`{`
	`1165`	`+char*buf=matchctx->conv_buf;`
	`1166`	`+intbufsizPG_USED_FOR_ASSERTS_ONLY=matchctx->conv_bufsiz;`
`1099`	`1167`	`Datum*elems=matchctx->elems;`
`1100`	`1168`	`bool*nulls=matchctx->nulls;`
`1101`	`1169`	`intdims[1];`
`@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)`
`1115`	`1183`	`elems[i]= (Datum)0;`
`1116`	`1184`	`nulls[i]= true;`
`1117`	`1185`	`}`
	`1186`	`+elseif (buf)`
	`1187`	`+{`
	`1188`	`+intlen=pg_wchar2mb_with_len(matchctx->wide_str+so,`
	`1189`	`+buf,`
	`1190`	`+eo-so);`
	`1191`	`+Assert(len<bufsiz);`
	`1192`	`+elems[i]=PointerGetDatum(cstring_to_text_with_len(buf,len));`
	`1193`	`+nulls[i]= false;`
	`1194`	`+}`
`1118`	`1195`	`else`
`1119`	`1196`	`{`
`1120`	`1197`	`elems[i]=DirectFunctionCall3(text_substr,`
`@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)`
`1168`	`1245`	`splitctx=setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0),pattern,`
`1169`	`1246`	`&re_flags,`
`1170`	`1247`	`PG_GET_COLLATION(),`
`1171`		`-false, true);`
	`1248`	`+false, true, true);`
`1172`	`1249`
`1173`	`1250`	`MemoryContextSwitchTo(oldcontext);`
`1174`	`1251`	`funcctx->user_fctx= (void*)splitctx;`
`@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)`
`1185`	`1262`	`SRF_RETURN_NEXT(funcctx,result);`
`1186`	`1263`	`}`
`1187`	`1264`
`1188`		`-/* release space in multi-call ctx to avoid intraquery memory leak */`
`1189`		`-cleanup_regexp_matches(splitctx);`
`1190`		`-`
`1191`	`1265`	`SRF_RETURN_DONE(funcctx);`
`1192`	`1266`	`}`
`1193`	`1267`
`@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)`
`1224`	`1298`	`PG_GETARG_TEXT_PP(1),`
`1225`	`1299`	`&re_flags,`
`1226`	`1300`	`PG_GET_COLLATION(),`
`1227`		`-false, true);`
	`1301`	`+false, true, true);`
`1228`	`1302`
`1229`	`1303`	`while (splitctx->next_match <=splitctx->nmatches)`
`1230`	`1304`	`{`
`@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)`
`1236`	`1310`	`splitctx->next_match++;`
`1237`	`1311`	`}`
`1238`	`1312`
`1239`		`-/*`
`1240`		`- * We don't call cleanup_regexp_matches here; it would try to pfree the`
`1241`		`- * input string, which we didn't copy. The space is not in a long-lived`
`1242`		`- * memory context anyway.`
`1243`		`- */`
`1244`		`-`
`1245`	`1313`	`PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,CurrentMemoryContext));`
`1246`	`1314`	`}`
`1247`	`1315`
`@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)`
`1261`	`1329`	`staticDatum`
`1262`	`1330`	`build_regexp_split_result(regexp_matches_ctx*splitctx)`
`1263`	`1331`	`{`
	`1332`	`+char*buf=splitctx->conv_buf;`
`1264`	`1333`	`intstartpos;`
`1265`	`1334`	`intendpos;`
`1266`	`1335`
`@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)`
`1271`	`1340`	`if (startpos<0)`
`1272`	`1341`	`elog(ERROR,"invalid match ending position");`
`1273`	`1342`
`1274`		`-if (splitctx->next_match<splitctx->nmatches)`
	`1343`	`+if (buf)`
`1275`	`1344`	`{`
	`1345`	`+intbufsizPG_USED_FOR_ASSERTS_ONLY=splitctx->conv_bufsiz;`
	`1346`	`+intlen;`
	`1347`	`+`
`1276`	`1348`	`endpos=splitctx->match_locs[splitctx->next_match*2];`
`1277`	`1349`	`if (endpos<startpos)`
`1278`	`1350`	`elog(ERROR,"invalid match starting position");`
`1279`		`-returnDirectFunctionCall3(text_substr,`
`1280`		`-PointerGetDatum(splitctx->orig_str),`
`1281`		`-Int32GetDatum(startpos+1),`
`1282`		`-Int32GetDatum(endpos-startpos));`
	`1351`	`+len=pg_wchar2mb_with_len(splitctx->wide_str+startpos,`
	`1352`	`+buf,`
	`1353`	`+endpos-startpos);`
	`1354`	`+Assert(len<bufsiz);`
	`1355`	`+returnPointerGetDatum(cstring_to_text_with_len(buf,len));`
`1283`	`1356`	`}`
`1284`	`1357`	`else`
`1285`	`1358`	`{`
`1286`		`-/* no more matches, return rest of string */`
`1287`		`-returnDirectFunctionCall2(text_substr_no_len,`
	`1359`	`+endpos=splitctx->match_locs[splitctx->next_match*2];`
	`1360`	`+if (endpos<startpos)`
	`1361`	`+elog(ERROR,"invalid match starting position");`
	`1362`	`+returnDirectFunctionCall3(text_substr,`
`1288`	`1363`	`PointerGetDatum(splitctx->orig_str),`
`1289`		`-Int32GetDatum(startpos+1));`
	`1364`	`+Int32GetDatum(startpos+1),`
	`1365`	`+Int32GetDatum(endpos-startpos));`
`1290`	`1366`	`}`
`1291`	`1367`	`}`
`1292`	`1368`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitc8ea87e

File tree

1 file changed

1 file changed

`‎src/backend/utils/adt/regexp.c‎`

0 commit comments