3535#include "regex/regex.h"
3636#include "utils/array.h"
3737#include "utils/builtins.h"
38+ #include "utils/memutils.h"
3839#include "utils/varlena.h"
3940
4041#define PG_GETARG_TEXT_PP_IF_EXISTS (_n ) \
@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
6162/* workspace for build_regexp_match_result() */
6263Datum * elems ;/* has npatterns elements */
6364bool * nulls ;/* has npatterns elements */
65+ pg_wchar * wide_str ;/* wide-char version of original string */
66+ char * conv_buf ;/* conversion buffer */
67+ int conv_bufsiz ;/* size thereof */
6468}regexp_matches_ctx ;
6569
6670/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111115pg_re_flags * flags ,
112116Oid collation ,
113117bool use_subpatterns ,
114- bool ignore_degenerate );
115- static void cleanup_regexp_matches ( regexp_matches_ctx * matchctx );
118+ bool ignore_degenerate ,
119+ bool fetching_unmatched );
116120static ArrayType * build_regexp_match_result (regexp_matches_ctx * matchctx );
117121static Datum build_regexp_split_result (regexp_matches_ctx * splitctx );
118122
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
863867errhint ("Use the regexp_matches function instead." )));
864868
865869matchctx = setup_regexp_matches (orig_str ,pattern ,& re_flags ,
866- PG_GET_COLLATION (), true, false);
870+ PG_GET_COLLATION (), true, false, false );
867871
868872if (matchctx -> nmatches == 0 )
869873PG_RETURN_NULL ();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
911915matchctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ),pattern ,
912916& re_flags ,
913917PG_GET_COLLATION (),
914- true, false);
918+ true, false, false );
915919
916920/* Pre-create workspace that build_regexp_match_result needs */
917921matchctx -> elems = (Datum * )palloc (sizeof (Datum )* matchctx -> npatterns );
@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
933937SRF_RETURN_NEXT (funcctx ,PointerGetDatum (result_ary ));
934938}
935939
936- /* release space in multi-call ctx to avoid intraquery memory leak */
937- cleanup_regexp_matches (matchctx );
938-
939940SRF_RETURN_DONE (funcctx );
940941}
941942
@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
954955 * all the matching in one swoop. The returned regexp_matches_ctx contains
955956 * the locations of all the substrings matching the pattern.
956957 *
957- * The two bool parameters have only two patterns (one for matching, one for
958+ * The three bool parameters have only two patterns (one for matching, one for
958959 * splitting) but it seems clearer to distinguish the functionality this way
959- * than to key it all off one "is_split" flag.
960+ * than to key it all off one "is_split" flag. We don't currently assume that
961+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
962+ * set, the conversion buffer is large enough to fetch any single matched or
963+ * unmatched string, but not any larger substring. (In practice, when splitting
964+ * the matches are usually small anyway, and it didn't seem worth complicating
965+ * the code further.)
960966 */
961967static regexp_matches_ctx *
962968setup_regexp_matches (text * orig_str ,text * pattern ,pg_re_flags * re_flags ,
963969Oid collation ,
964970bool use_subpatterns ,
965- bool ignore_degenerate )
971+ bool ignore_degenerate ,
972+ bool fetching_unmatched )
966973{
967974regexp_matches_ctx * matchctx = palloc0 (sizeof (regexp_matches_ctx ));
975+ int eml = pg_database_encoding_max_length ();
968976int orig_len ;
969977pg_wchar * wide_str ;
970978int wide_len ;
@@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
975983int array_idx ;
976984int prev_match_end ;
977985int start_search ;
986+ int maxlen = 0 ;/* largest fetch length in characters */
978987
979988/* save original string --- we'll extract result substrings from it */
980989matchctx -> orig_str = orig_str ;
@@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10031012/* temporary output space for RE package */
10041013pmatch = palloc (sizeof (regmatch_t )* pmatch_len );
10051014
1006- /* the real output space (grown dynamically if needed) */
1007- array_len = re_flags -> glob ?256 :32 ;
1015+ /*
1016+ * the real output space (grown dynamically if needed)
1017+ *
1018+ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
1019+ * than at 2^27
1020+ */
1021+ array_len = re_flags -> glob ?255 :31 ;
10081022matchctx -> match_locs = (int * )palloc (sizeof (int )* array_len );
10091023array_idx = 0 ;
10101024
@@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10241038pmatch [0 ].rm_eo > prev_match_end ))
10251039{
10261040/* enlarge output space if needed */
1027- while (array_idx + matchctx -> npatterns * 2 > array_len )
1041+ while (array_idx + matchctx -> npatterns * 2 + 1 > array_len )
10281042{
1029- array_len *=2 ;
1043+ array_len += array_len + 1 ;/* 2^n-1 => 2^(n+1)-1 */
1044+ if (array_len > MaxAllocSize /sizeof (int ))
1045+ ereport (ERROR ,
1046+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
1047+ errmsg ("too many regular expression matches" )));
10301048matchctx -> match_locs = (int * )repalloc (matchctx -> match_locs ,
10311049sizeof (int )* array_len );
10321050}
@@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10381056
10391057for (i = 1 ;i <=matchctx -> npatterns ;i ++ )
10401058{
1041- matchctx -> match_locs [array_idx ++ ]= pmatch [i ].rm_so ;
1042- matchctx -> match_locs [array_idx ++ ]= pmatch [i ].rm_eo ;
1059+ int so = pmatch [i ].rm_so ;
1060+ int eo = pmatch [i ].rm_eo ;
1061+ matchctx -> match_locs [array_idx ++ ]= so ;
1062+ matchctx -> match_locs [array_idx ++ ]= eo ;
1063+ if (so >=0 && eo >=0 && (eo - so )> maxlen )
1064+ maxlen = (eo - so );
10431065}
10441066}
10451067else
10461068{
1047- matchctx -> match_locs [array_idx ++ ]= pmatch [0 ].rm_so ;
1048- matchctx -> match_locs [array_idx ++ ]= pmatch [0 ].rm_eo ;
1069+ int so = pmatch [0 ].rm_so ;
1070+ int eo = pmatch [0 ].rm_eo ;
1071+ matchctx -> match_locs [array_idx ++ ]= so ;
1072+ matchctx -> match_locs [array_idx ++ ]= eo ;
1073+ if (so >=0 && eo >=0 && (eo - so )> maxlen )
1074+ maxlen = (eo - so );
10491075}
10501076matchctx -> nmatches ++ ;
1077+
1078+ /*
1079+ * check length of unmatched portion between end of previous match
1080+ * and start of current one
1081+ */
1082+ if (fetching_unmatched &&
1083+ pmatch [0 ].rm_so >=0 &&
1084+ (pmatch [0 ].rm_so - prev_match_end )> maxlen )
1085+ maxlen = (pmatch [0 ].rm_so - prev_match_end );
10511086}
10521087prev_match_end = pmatch [0 ].rm_eo ;
10531088
@@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10681103break ;
10691104}
10701105
1106+ /*
1107+ * check length of unmatched portion between end of last match and end of
1108+ * input string
1109+ */
1110+ if (fetching_unmatched &&
1111+ (wide_len - prev_match_end )> maxlen )
1112+ maxlen = (wide_len - prev_match_end );
1113+
1114+ /*
1115+ * Keep a note of the end position of the string for the benefit of
1116+ * splitting code.
1117+ */
1118+ matchctx -> match_locs [array_idx ]= wide_len ;
1119+
1120+ if (eml > 1 )
1121+ {
1122+ int64 maxsiz = eml * (int64 )maxlen ;
1123+ int conv_bufsiz ;
1124+
1125+ /*
1126+ * Make the conversion buffer large enough for any substring of
1127+ * interest.
1128+ *
1129+ * Worst case: assume we need the maximum size (maxlen*eml), but take
1130+ * advantage of the fact that the original string length in bytes is an
1131+ * upper bound on the byte length of any fetched substring (and we know
1132+ * that len+1 is safe to allocate because the varlena header is longer
1133+ * than 1 byte).
1134+ */
1135+ if (maxsiz > orig_len )
1136+ conv_bufsiz = orig_len + 1 ;
1137+ else
1138+ conv_bufsiz = maxsiz + 1 ;/* safe since maxsiz < 2^30 */
1139+
1140+ matchctx -> conv_buf = palloc (conv_bufsiz );
1141+ matchctx -> conv_bufsiz = conv_bufsiz ;
1142+ matchctx -> wide_str = wide_str ;
1143+ }
1144+ else
1145+ {
1146+ /* No need to keep the wide string if we're in a single-byte charset. */
1147+ pfree (wide_str );
1148+ matchctx -> wide_str = NULL ;
1149+ matchctx -> conv_buf = NULL ;
1150+ matchctx -> conv_bufsiz = 0 ;
1151+ }
1152+
10711153/* Clean up temp storage */
1072- pfree (wide_str );
10731154pfree (pmatch );
10741155
10751156return matchctx ;
10761157}
10771158
1078- /*
1079- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
1080- */
1081- static void
1082- cleanup_regexp_matches (regexp_matches_ctx * matchctx )
1083- {
1084- pfree (matchctx -> orig_str );
1085- pfree (matchctx -> match_locs );
1086- if (matchctx -> elems )
1087- pfree (matchctx -> elems );
1088- if (matchctx -> nulls )
1089- pfree (matchctx -> nulls );
1090- pfree (matchctx );
1091- }
1092-
10931159/*
10941160 * build_regexp_match_result - build output array for current match
10951161 */
10961162static ArrayType *
10971163build_regexp_match_result (regexp_matches_ctx * matchctx )
10981164{
1165+ char * buf = matchctx -> conv_buf ;
1166+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx -> conv_bufsiz ;
10991167Datum * elems = matchctx -> elems ;
11001168bool * nulls = matchctx -> nulls ;
11011169int dims [1 ];
@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
11151183elems [i ]= (Datum )0 ;
11161184nulls [i ]= true;
11171185}
1186+ else if (buf )
1187+ {
1188+ int len = pg_wchar2mb_with_len (matchctx -> wide_str + so ,
1189+ buf ,
1190+ eo - so );
1191+ Assert (len < bufsiz );
1192+ elems [i ]= PointerGetDatum (cstring_to_text_with_len (buf ,len ));
1193+ nulls [i ]= false;
1194+ }
11181195else
11191196{
11201197elems [i ]= DirectFunctionCall3 (text_substr ,
@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11681245splitctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ),pattern ,
11691246& re_flags ,
11701247PG_GET_COLLATION (),
1171- false, true);
1248+ false, true, true );
11721249
11731250MemoryContextSwitchTo (oldcontext );
11741251funcctx -> user_fctx = (void * )splitctx ;
@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11851262SRF_RETURN_NEXT (funcctx ,result );
11861263}
11871264
1188- /* release space in multi-call ctx to avoid intraquery memory leak */
1189- cleanup_regexp_matches (splitctx );
1190-
11911265SRF_RETURN_DONE (funcctx );
11921266}
11931267
@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12241298PG_GETARG_TEXT_PP (1 ),
12251299& re_flags ,
12261300PG_GET_COLLATION (),
1227- false, true);
1301+ false, true, true );
12281302
12291303while (splitctx -> next_match <=splitctx -> nmatches )
12301304{
@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12361310splitctx -> next_match ++ ;
12371311}
12381312
1239- /*
1240- * We don't call cleanup_regexp_matches here; it would try to pfree the
1241- * input string, which we didn't copy. The space is not in a long-lived
1242- * memory context anyway.
1243- */
1244-
12451313PG_RETURN_ARRAYTYPE_P (makeArrayResult (astate ,CurrentMemoryContext ));
12461314}
12471315
@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
12611329static Datum
12621330build_regexp_split_result (regexp_matches_ctx * splitctx )
12631331{
1332+ char * buf = splitctx -> conv_buf ;
12641333int startpos ;
12651334int endpos ;
12661335
@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
12711340if (startpos < 0 )
12721341elog (ERROR ,"invalid match ending position" );
12731342
1274- if (splitctx -> next_match < splitctx -> nmatches )
1343+ if (buf )
12751344{
1345+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx -> conv_bufsiz ;
1346+ int len ;
1347+
12761348endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
12771349if (endpos < startpos )
12781350elog (ERROR ,"invalid match starting position" );
1279- return DirectFunctionCall3 (text_substr ,
1280- PointerGetDatum (splitctx -> orig_str ),
1281- Int32GetDatum (startpos + 1 ),
1282- Int32GetDatum (endpos - startpos ));
1351+ len = pg_wchar2mb_with_len (splitctx -> wide_str + startpos ,
1352+ buf ,
1353+ endpos - startpos );
1354+ Assert (len < bufsiz );
1355+ return PointerGetDatum (cstring_to_text_with_len (buf ,len ));
12831356}
12841357else
12851358{
1286- /* no more matches, return rest of string */
1287- return DirectFunctionCall2 (text_substr_no_len ,
1359+ endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1360+ if (endpos < startpos )
1361+ elog (ERROR ,"invalid match starting position" );
1362+ return DirectFunctionCall3 (text_substr ,
12881363PointerGetDatum (splitctx -> orig_str ),
1289- Int32GetDatum (startpos + 1 ));
1364+ Int32GetDatum (startpos + 1 ),
1365+ Int32GetDatum (endpos - startpos ));
12901366}
12911367}
12921368