3535#include "regex/regex.h"
3636#include "utils/array.h"
3737#include "utils/builtins.h"
38+ #include "utils/memutils.h"
3839
3940#define PG_GETARG_TEXT_PP_IF_EXISTS (_n ) \
4041(PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
@@ -60,6 +61,9 @@ typedef struct regexp_matches_ctx
6061/* workspace for build_regexp_matches_result() */
6162Datum * elems ;/* has npatterns elements */
6263bool * nulls ;/* has npatterns elements */
64+ pg_wchar * wide_str ;/* wide-char version of original string */
65+ char * conv_buf ;/* conversion buffer */
66+ int conv_bufsiz ;/* size thereof */
6367}regexp_matches_ctx ;
6468
6569/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111115Oid collation ,
112116bool force_glob ,
113117bool use_subpatterns ,
114- bool ignore_degenerate );
115- static void cleanup_regexp_matches ( regexp_matches_ctx * matchctx );
118+ bool ignore_degenerate ,
119+ bool fetching_unmatched );
116120static ArrayType * build_regexp_matches_result (regexp_matches_ctx * matchctx );
117121static Datum build_regexp_split_result (regexp_matches_ctx * splitctx );
118122
@@ -863,7 +867,7 @@ regexp_matches(PG_FUNCTION_ARGS)
863867matchctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ),pattern ,
864868flags ,
865869PG_GET_COLLATION (),
866- false, true, false);
870+ false, true, false, false );
867871
868872/* Pre-create workspace that build_regexp_matches_result needs */
869873matchctx -> elems = (Datum * )palloc (sizeof (Datum )* matchctx -> npatterns );
@@ -885,9 +889,6 @@ regexp_matches(PG_FUNCTION_ARGS)
885889SRF_RETURN_NEXT (funcctx ,PointerGetDatum (result_ary ));
886890}
887891
888- /* release space in multi-call ctx to avoid intraquery memory leak */
889- cleanup_regexp_matches (matchctx );
890-
891892SRF_RETURN_DONE (funcctx );
892893}
893894
@@ -906,17 +907,25 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
906907 * all the matching in one swoop. The returned regexp_matches_ctx contains
907908 * the locations of all the substrings matching the pattern.
908909 *
909- * The three bool parameters have only two patterns (one for each caller)
910- * but it seems clearer to distinguish the functionality this way than to
911- * key it all off one "is_split" flag.
910+ * The four bool parameters are only ever passed in two combinations (one for
911+ * matching, one for splitting), but it seems clearer to distinguish the
912+ * functionality this way than to key it all off one "is_split" flag. We don't
913+ * currently assume that fetching_unmatched excludes fetching the matched text
914+ * as well; if it's set, the conversion buffer is large enough to fetch any
915+ * single matched or unmatched string, but not any larger substring. (In
916+ * practice, when splitting, the matches are usually small anyway, and it didn't
917+ * seem worth complicating the code further.)
912918 */
913919static regexp_matches_ctx *
914920setup_regexp_matches (text * orig_str ,text * pattern ,text * flags ,
915921Oid collation ,
916- bool force_glob ,bool use_subpatterns ,
917- bool ignore_degenerate )
922+ bool force_glob ,
923+ bool use_subpatterns ,
924+ bool ignore_degenerate ,
925+ bool fetching_unmatched )
918926{
919927regexp_matches_ctx * matchctx = palloc0 (sizeof (regexp_matches_ctx ));
928+ int eml = pg_database_encoding_max_length ();
920929int orig_len ;
921930pg_wchar * wide_str ;
922931int wide_len ;
@@ -928,6 +937,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
928937int array_idx ;
929938int prev_match_end ;
930939int start_search ;
940+ int maxlen = 0 ;/* largest fetch length in characters */
931941
932942/* save original string --- we'll extract result substrings from it */
933943matchctx -> orig_str = orig_str ;
@@ -969,8 +979,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
969979/* temporary output space for RE package */
970980pmatch = palloc (sizeof (regmatch_t )* pmatch_len );
971981
972- /* the real output space (grown dynamically if needed) */
973- array_len = re_flags .glob ?256 :32 ;
982+ /*
983+ * the real output space (grown dynamically if needed)
984+ *
985+ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
986+ * than at 2^27
987+ */
988+ array_len = re_flags .glob ?255 :31 ;
974989matchctx -> match_locs = (int * )palloc (sizeof (int )* array_len );
975990array_idx = 0 ;
976991
@@ -990,9 +1005,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
9901005pmatch [0 ].rm_eo > prev_match_end ))
9911006{
9921007/* enlarge output space if needed */
993- while (array_idx + matchctx -> npatterns * 2 > array_len )
1008+ while (array_idx + matchctx -> npatterns * 2 + 1 > array_len )
9941009{
995- array_len *=2 ;
1010+ array_len += array_len + 1 ;/* 2^n-1 => 2^(n+1)-1 */
1011+ if (array_len > MaxAllocSize /sizeof (int ))
1012+ ereport (ERROR ,
1013+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
1014+ errmsg ("too many regular expression matches" )));
9961015matchctx -> match_locs = (int * )repalloc (matchctx -> match_locs ,
9971016sizeof (int )* array_len );
9981017}
@@ -1004,16 +1023,33 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
10041023
10051024for (i = 1 ;i <=matchctx -> npatterns ;i ++ )
10061025{
1007- matchctx -> match_locs [array_idx ++ ]= pmatch [i ].rm_so ;
1008- matchctx -> match_locs [array_idx ++ ]= pmatch [i ].rm_eo ;
1026+ int so = pmatch [i ].rm_so ;
1027+ int eo = pmatch [i ].rm_eo ;
1028+ matchctx -> match_locs [array_idx ++ ]= so ;
1029+ matchctx -> match_locs [array_idx ++ ]= eo ;
1030+ if (so >=0 && eo >=0 && (eo - so )> maxlen )
1031+ maxlen = (eo - so );
10091032}
10101033}
10111034else
10121035{
1013- matchctx -> match_locs [array_idx ++ ]= pmatch [0 ].rm_so ;
1014- matchctx -> match_locs [array_idx ++ ]= pmatch [0 ].rm_eo ;
1036+ int so = pmatch [0 ].rm_so ;
1037+ int eo = pmatch [0 ].rm_eo ;
1038+ matchctx -> match_locs [array_idx ++ ]= so ;
1039+ matchctx -> match_locs [array_idx ++ ]= eo ;
1040+ if (so >=0 && eo >=0 && (eo - so )> maxlen )
1041+ maxlen = (eo - so );
10151042}
10161043matchctx -> nmatches ++ ;
1044+
1045+ /*
1046+ * check length of unmatched portion between end of previous match
1047+ * and start of current one
1048+ */
1049+ if (fetching_unmatched &&
1050+ pmatch [0 ].rm_so >=0 &&
1051+ (pmatch [0 ].rm_so - prev_match_end )> maxlen )
1052+ maxlen = (pmatch [0 ].rm_so - prev_match_end );
10171053}
10181054prev_match_end = pmatch [0 ].rm_eo ;
10191055
@@ -1034,34 +1070,67 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
10341070break ;
10351071}
10361072
1073+ /*
1074+ * check length of unmatched portion between end of last match and end of
1075+ * input string
1076+ */
1077+ if (fetching_unmatched &&
1078+ (wide_len - prev_match_end )> maxlen )
1079+ maxlen = (wide_len - prev_match_end );
1080+
1081+ /*
1082+ * Keep a note of the end position of the string for the benefit of
1083+ * splitting code.
1084+ */
1085+ matchctx -> match_locs [array_idx ]= wide_len ;
1086+
1087+ if (eml > 1 )
1088+ {
1089+ int64 maxsiz = eml * (int64 )maxlen ;
1090+ int conv_bufsiz ;
1091+
1092+ /*
1093+ * Make the conversion buffer large enough for any substring of
1094+ * interest.
1095+ *
1096+ * Worst case: assume we need the maximum size (maxlen*eml), but take
1097+ * advantage of the fact that the original string length in bytes is an
1098+ * upper bound on the byte length of any fetched substring (and we know
1099+ * that len+1 is safe to allocate because the varlena header is longer
1100+ * than 1 byte).
1101+ */
1102+ if (maxsiz > orig_len )
1103+ conv_bufsiz = orig_len + 1 ;
1104+ else
1105+ conv_bufsiz = maxsiz + 1 ;/* safe since maxsiz < 2^30 */
1106+
1107+ matchctx -> conv_buf = palloc (conv_bufsiz );
1108+ matchctx -> conv_bufsiz = conv_bufsiz ;
1109+ matchctx -> wide_str = wide_str ;
1110+ }
1111+ else
1112+ {
1113+ /* No need to keep the wide string if we're in a single-byte charset. */
1114+ pfree (wide_str );
1115+ matchctx -> wide_str = NULL ;
1116+ matchctx -> conv_buf = NULL ;
1117+ matchctx -> conv_bufsiz = 0 ;
1118+ }
1119+
10371120/* Clean up temp storage */
1038- pfree (wide_str );
10391121pfree (pmatch );
10401122
10411123return matchctx ;
10421124}
10431125
1044- /*
1045- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
1046- */
1047- static void
1048- cleanup_regexp_matches (regexp_matches_ctx * matchctx )
1049- {
1050- pfree (matchctx -> orig_str );
1051- pfree (matchctx -> match_locs );
1052- if (matchctx -> elems )
1053- pfree (matchctx -> elems );
1054- if (matchctx -> nulls )
1055- pfree (matchctx -> nulls );
1056- pfree (matchctx );
1057- }
1058-
10591126/*
10601127 * build_regexp_matches_result - build output array for current match
10611128 */
10621129static ArrayType *
10631130build_regexp_matches_result (regexp_matches_ctx * matchctx )
10641131{
1132+ char * buf = matchctx -> conv_buf ;
1133+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx -> conv_bufsiz ;
10651134Datum * elems = matchctx -> elems ;
10661135bool * nulls = matchctx -> nulls ;
10671136int dims [1 ];
@@ -1081,6 +1150,15 @@ build_regexp_matches_result(regexp_matches_ctx *matchctx)
10811150elems [i ]= (Datum )0 ;
10821151nulls [i ]= true;
10831152}
1153+ else if (buf )
1154+ {
1155+ int len = pg_wchar2mb_with_len (matchctx -> wide_str + so ,
1156+ buf ,
1157+ eo - so );
1158+ Assert (len < bufsiz );
1159+ elems [i ]= PointerGetDatum (cstring_to_text_with_len (buf ,len ));
1160+ nulls [i ]= false;
1161+ }
10841162else
10851163{
10861164elems [i ]= DirectFunctionCall3 (text_substr ,
@@ -1123,7 +1201,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11231201splitctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ),pattern ,
11241202flags ,
11251203PG_GET_COLLATION (),
1126- true, false, true);
1204+ true, false, true, true );
11271205
11281206MemoryContextSwitchTo (oldcontext );
11291207funcctx -> user_fctx = (void * )splitctx ;
@@ -1140,9 +1218,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11401218SRF_RETURN_NEXT (funcctx ,result );
11411219}
11421220
1143- /* release space in multi-call ctx to avoid intraquery memory leak */
1144- cleanup_regexp_matches (splitctx );
1145-
11461221SRF_RETURN_DONE (funcctx );
11471222}
11481223
@@ -1168,7 +1243,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
11681243PG_GETARG_TEXT_PP (1 ),
11691244PG_GETARG_TEXT_PP_IF_EXISTS (2 ),
11701245PG_GET_COLLATION (),
1171- true, false, true);
1246+ true, false, true, true );
11721247
11731248while (splitctx -> next_match <=splitctx -> nmatches )
11741249{
@@ -1180,12 +1255,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
11801255splitctx -> next_match ++ ;
11811256}
11821257
1183- /*
1184- * We don't call cleanup_regexp_matches here; it would try to pfree the
1185- * input string, which we didn't copy. The space is not in a long-lived
1186- * memory context anyway.
1187- */
1188-
11891258PG_RETURN_ARRAYTYPE_P (makeArrayResult (astate ,CurrentMemoryContext ));
11901259}
11911260
@@ -1205,6 +1274,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
12051274static Datum
12061275build_regexp_split_result (regexp_matches_ctx * splitctx )
12071276{
1277+ char * buf = splitctx -> conv_buf ;
12081278int startpos ;
12091279int endpos ;
12101280
@@ -1215,22 +1285,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
12151285if (startpos < 0 )
12161286elog (ERROR ,"invalid match ending position" );
12171287
1218- if (splitctx -> next_match < splitctx -> nmatches )
1288+ if (buf )
12191289{
1290+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx -> conv_bufsiz ;
1291+ int len ;
1292+
12201293endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
12211294if (endpos < startpos )
12221295elog (ERROR ,"invalid match starting position" );
1223- return DirectFunctionCall3 (text_substr ,
1224- PointerGetDatum (splitctx -> orig_str ),
1225- Int32GetDatum (startpos + 1 ),
1226- Int32GetDatum (endpos - startpos ));
1296+ len = pg_wchar2mb_with_len (splitctx -> wide_str + startpos ,
1297+ buf ,
1298+ endpos - startpos );
1299+ Assert (len < bufsiz );
1300+ return PointerGetDatum (cstring_to_text_with_len (buf ,len ));
12271301}
12281302else
12291303{
1230- /* no more matches, return rest of string */
1231- return DirectFunctionCall2 (text_substr_no_len ,
1304+ endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1305+ if (endpos < startpos )
1306+ elog (ERROR ,"invalid match starting position" );
1307+ return DirectFunctionCall3 (text_substr ,
12321308PointerGetDatum (splitctx -> orig_str ),
1233- Int32GetDatum (startpos + 1 ));
1309+ Int32GetDatum (startpos + 1 ),
1310+ Int32GetDatum (endpos - startpos ));
12341311}
12351312}
12361313