88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $
11+ * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.153 2006/11/08 19:22:25 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
2121#include "catalog/pg_type.h"
2222#include "libpq/md5.h"
2323#include "libpq/pqformat.h"
24+ #include "miscadmin.h"
2425#include "parser/scansup.h"
2526#include "regex/regex.h"
2627#include "utils/builtins.h"
@@ -477,6 +478,32 @@ textcat(PG_FUNCTION_ARGS)
477478PG_RETURN_TEXT_P (result );
478479}
479480
/*
 * charlen_to_bytelen()
 *	Translate a length measured in characters into a length in bytes.
 *
 * p points at the first character and n is the number of characters to
 * measure.  The scan is driven purely by the character count, so the data
 * does not have to be null-terminated; however, the caller must guarantee
 * that n complete characters really are present at p.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;
	int			remaining;

	/* Single-byte encoding: character count and byte count coincide */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over the characters one at a time using pg_mblen() */
	remaining = n;
	while (remaining > 0)
	{
		cursor += pg_mblen(cursor);
		remaining--;
	}

	/* Distance travelled from the start is the byte length */
	return cursor - p;
}
506+
480507/*
481508 * text_substr()
482509 * Return a substring starting at the specified position.
@@ -534,6 +561,8 @@ text_substr_no_len(PG_FUNCTION_ARGS)
534561 * functions.  Note that the argument is passed as a Datum, to indicate that
535562 * it may still be in compressed/toasted form. We can avoid detoasting all
536563 * of it in some cases.
564+ *
565+ * The result is always a freshly palloc'd datum.
537566 */
538567static text *
539568text_substring (Datum str ,int32 start ,int32 length ,bool length_not_specified )
@@ -649,11 +678,23 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
649678 */
650679slice_size = (S1 + L1 )* eml ;
651680}
652- slice = DatumGetTextPSlice (str ,slice_start ,slice_size );
681+
682+ /*
683+ * If we're working with an untoasted source, no need to do an
684+ * extra copying step.
685+ */
686+ if (VARATT_IS_EXTENDED (str ))
687+ slice = DatumGetTextPSlice (str ,slice_start ,slice_size );
688+ else
689+ slice = (text * )DatumGetPointer (str );
653690
654691/* see if we got back an empty string */
655692if ((VARSIZE (slice )- VARHDRSZ )== 0 )
693+ {
694+ if (slice != (text * )DatumGetPointer (str ))
695+ pfree (slice );
656696return PG_STR_GET_TEXT ("" );
697+ }
657698
658699/* Now we can get the actual length of the slice in MB characters */
659700slice_strlen = pg_mbstrlen_with_len (VARDATA (slice ),VARSIZE (slice )- VARHDRSZ );
@@ -663,7 +704,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
663704 * says to return a zero-length string.
664705 */
665706if (S1 > slice_strlen )
707+ {
708+ if (slice != (text * )DatumGetPointer (str ))
709+ pfree (slice );
666710return PG_STR_GET_TEXT ("" );
711+ }
667712
668713/*
669714 * Adjust L1 and E1 now that we know the slice string length. Again
@@ -695,6 +740,9 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
695740VARATT_SIZEP (ret )= VARHDRSZ + (p - s );
696741memcpy (VARDATA (ret ),s , (p - s ));
697742
743+ if (slice != (text * )DatumGetPointer (str ))
744+ pfree (slice );
745+
698746return ret ;
699747}
700748else
@@ -2076,10 +2124,11 @@ replace_text(PG_FUNCTION_ARGS)
20762124int src_text_len = TEXTLEN (src_text );
20772125int from_sub_text_len = TEXTLEN (from_sub_text );
20782126TextPositionState state ;
2079- text * chunk_text ;
20802127text * ret_text ;
20812128int start_posn ;
20822129int curr_posn ;
2130+ int chunk_len ;
2131+ char * start_ptr ;
20832132StringInfoData str ;
20842133
20852134if (src_text_len == 0 || from_sub_text_len == 0 )
@@ -2097,31 +2146,31 @@ replace_text(PG_FUNCTION_ARGS)
20972146PG_RETURN_TEXT_P (src_text );
20982147}
20992148
2149+ /* start_ptr points to the start_posn'th character of src_text */
2150+ start_ptr = (char * )VARDATA (src_text );
2151+
21002152initStringInfo (& str );
21012153
21022154do
21032155{
2104- chunk_text = text_substring (PointerGetDatum (src_text ),
2105- start_posn ,
2106- curr_posn - start_posn ,
2107- false);
2108- appendStringInfoText (& str ,chunk_text );
2109- pfree (chunk_text );
2156+ /* copy the data skipped over by last text_position_next() */
2157+ chunk_len = charlen_to_bytelen (start_ptr ,curr_posn - start_posn );
2158+ appendBinaryStringInfo (& str ,start_ptr ,chunk_len );
21102159
21112160appendStringInfoText (& str ,to_sub_text );
21122161
2113- start_posn = curr_posn + from_sub_text_len ;
2162+ start_posn = curr_posn ;
2163+ start_ptr += chunk_len ;
2164+ start_posn += from_sub_text_len ;
2165+ start_ptr += charlen_to_bytelen (start_ptr ,from_sub_text_len );
2166+
21142167curr_posn = text_position_next (start_posn ,& state );
21152168}
21162169while (curr_posn > 0 );
21172170
2118- /* copy trailing chunk */
2119- chunk_text = text_substring (PointerGetDatum (src_text ),
2120- start_posn ,
2121- -1 ,
2122- true);
2123- appendStringInfoText (& str ,chunk_text );
2124- pfree (chunk_text );
2171+ /* copy trailing data */
2172+ chunk_len = ((char * )src_text + VARSIZE (src_text ))- start_ptr ;
2173+ appendBinaryStringInfo (& str ,start_ptr ,chunk_len );
21252174
21262175text_position_cleanup (& state );
21272176
@@ -2166,11 +2215,13 @@ check_replace_text_has_escape_char(const text *replace_text)
21662215 * appendStringInfoRegexpSubstr
21672216 *
21682217 * Append replace_text to str, substituting regexp back references for
2169- * \n escapes.
2218+ * \n escapes. start_ptr is the start of the match in the source string,
2219+ * at logical character position data_pos.
21702220 */
21712221static void
21722222appendStringInfoRegexpSubstr (StringInfo str ,text * replace_text ,
2173- regmatch_t * pmatch ,text * src_text )
2223+ regmatch_t * pmatch ,
2224+ char * start_ptr ,int data_pos )
21742225{
21752226const char * p = VARDATA (replace_text );
21762227const char * p_end = p + (VARSIZE (replace_text )- VARHDRSZ );
@@ -2247,16 +2298,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
22472298if (so != -1 && eo != -1 )
22482299{
22492300/*
2250- * Copy the text that is back reference of regexp.Because so and
2251- * eo are counted in characters not bytes, it's easiest to use
2252- * text_substring to pull out the correct chunk of text.
2301+ * Copy the text that is back reference of regexp.Note so and
2302+ * eo are counted in characters not bytes.
22532303 */
2254- text * append_text ;
2255-
2256- append_text = text_substring (PointerGetDatum (src_text ),
2257- so + 1 , (eo - so ), false);
2258- appendStringInfoText (str ,append_text );
2259- pfree (append_text );
2304+ char * chunk_start ;
2305+ int chunk_len ;
2306+
2307+ Assert (so >=data_pos );
2308+ chunk_start = start_ptr ;
2309+ chunk_start += charlen_to_bytelen (chunk_start ,so - data_pos );
2310+ chunk_len = charlen_to_bytelen (chunk_start ,eo - so );
2311+ appendBinaryStringInfo (str ,chunk_start ,chunk_len );
22602312}
22612313}
22622314}
@@ -2284,6 +2336,7 @@ replace_text_regexp(text *src_text, void *regexp,
22842336size_t data_len ;
22852337int search_start ;
22862338int data_pos ;
2339+ char * start_ptr ;
22872340bool have_escape ;
22882341
22892342initStringInfo (& buf );
@@ -2295,10 +2348,17 @@ replace_text_regexp(text *src_text, void *regexp,
22952348/* Check whether replace_text has escape char. */
22962349have_escape = check_replace_text_has_escape_char (replace_text );
22972350
2298- for (search_start = data_pos = 0 ;search_start <=data_len ;)
2351+ /* start_ptr points to the data_pos'th character of src_text */
2352+ start_ptr = (char * )VARDATA (src_text );
2353+ data_pos = 0 ;
2354+
2355+ search_start = 0 ;
2356+ while (search_start <=data_len )
22992357{
23002358int regexec_result ;
23012359
2360+ CHECK_FOR_INTERRUPTS ();
2361+
23022362regexec_result = pg_regexec (re ,
23032363data ,
23042364data_len ,
@@ -2322,32 +2382,38 @@ replace_text_regexp(text *src_text, void *regexp,
23222382}
23232383
23242384/*
2325- * Copy the text to the left of the match position. Because we are
2326- * working with character not byte indexes, it's easiest to use
2327- * text_substring to pull out the needed data.
2385+ * Copy the text to the left of the match position. Note we are
2386+ * given character not byte indexes.
23282387 */
23292388if (pmatch [0 ].rm_so - data_pos > 0 )
23302389{
2331- text * left_text ;
2332-
2333- left_text = text_substring (PointerGetDatum (src_text ),
2334- data_pos + 1 ,
2335- pmatch [0 ].rm_so - data_pos ,
2336- false);
2337- appendStringInfoText (& buf ,left_text );
2338- pfree (left_text );
2390+ int chunk_len ;
2391+
2392+ chunk_len = charlen_to_bytelen (start_ptr ,
2393+ pmatch [0 ].rm_so - data_pos );
2394+ appendBinaryStringInfo (& buf ,start_ptr ,chunk_len );
2395+ /*
2396+ * Advance start_ptr over that text, to avoid multiple rescans
2397+ * of it if the replace_text contains multiple back-references.
2398+ */
2399+ start_ptr += chunk_len ;
2400+ data_pos = pmatch [0 ].rm_so ;
23392401}
23402402
23412403/*
23422404 * Copy the replace_text. Process back references when the
23432405 * replace_text has escape characters.
23442406 */
23452407if (have_escape )
2346- appendStringInfoRegexpSubstr (& buf ,replace_text ,pmatch ,src_text );
2408+ appendStringInfoRegexpSubstr (& buf ,replace_text ,pmatch ,
2409+ start_ptr ,data_pos );
23472410else
23482411appendStringInfoText (& buf ,replace_text );
23492412
2350- search_start = data_pos = pmatch [0 ].rm_eo ;
2413+ /* Advance start_ptr and data_pos over the matched text. */
2414+ start_ptr += charlen_to_bytelen (start_ptr ,
2415+ pmatch [0 ].rm_eo - data_pos );
2416+ data_pos = pmatch [0 ].rm_eo ;
23512417
23522418/*
23532419 * When global option is off, replace the first instance only.
@@ -2358,6 +2424,7 @@ replace_text_regexp(text *src_text, void *regexp,
23582424/*
23592425 * Search from next character when the matching text is zero width.
23602426 */
2427+ search_start = data_pos ;
23612428if (pmatch [0 ].rm_so == pmatch [0 ].rm_eo )
23622429search_start ++ ;
23632430}
@@ -2367,12 +2434,10 @@ replace_text_regexp(text *src_text, void *regexp,
23672434 */
23682435if (data_pos < data_len )
23692436{
2370- text * right_text ;
2437+ int chunk_len ;
23712438
2372- right_text = text_substring (PointerGetDatum (src_text ),
2373- data_pos + 1 ,-1 , true);
2374- appendStringInfoText (& buf ,right_text );
2375- pfree (right_text );
2439+ chunk_len = ((char * )src_text + VARSIZE (src_text ))- start_ptr ;
2440+ appendBinaryStringInfo (& buf ,start_ptr ,chunk_len );
23762441}
23772442
23782443ret_text = PG_STR_GET_TEXT (buf .data );
@@ -2488,6 +2553,8 @@ text_to_array(PG_FUNCTION_ARGS)
24882553int fldnum ;
24892554int start_posn ;
24902555int end_posn ;
2556+ int chunk_len ;
2557+ char * start_ptr ;
24912558text * result_text ;
24922559ArrayBuildState * astate = NULL ;
24932560
@@ -2506,37 +2573,45 @@ text_to_array(PG_FUNCTION_ARGS)
25062573text_position_setup (inputstring ,fldsep ,& state );
25072574
25082575start_posn = 1 ;
2576+ /* start_ptr points to the start_posn'th character of inputstring */
2577+ start_ptr = (char * )VARDATA (inputstring );
2578+
25092579for (fldnum = 1 ;;fldnum ++ )/* field number is 1 based */
25102580{
25112581end_posn = text_position_next (start_posn ,& state );
25122582
25132583if (end_posn == 0 )
25142584{
25152585/* fetch last field */
2516- result_text = text_substring (PointerGetDatum (inputstring ),
2517- start_posn ,
2518- -1 ,
2519- true);
2586+ chunk_len = ((char * )inputstring + VARSIZE (inputstring ))- start_ptr ;
25202587}
25212588else
25222589{
25232590/* fetch non-last field */
2524- result_text = text_substring (PointerGetDatum (inputstring ),
2525- start_posn ,
2526- end_posn - start_posn ,
2527- false);
2591+ chunk_len = charlen_to_bytelen (start_ptr ,end_posn - start_posn );
25282592}
25292593
2594+ /* must build a temp text datum to pass to accumArrayResult */
2595+ result_text = (text * )palloc (VARHDRSZ + chunk_len );
2596+ VARATT_SIZEP (result_text )= VARHDRSZ + chunk_len ;
2597+ memcpy (VARDATA (result_text ),start_ptr ,chunk_len );
2598+
25302599/* stash away this field */
25312600astate = accumArrayResult (astate ,
25322601PointerGetDatum (result_text ),
25332602 false,
25342603TEXTOID ,
25352604CurrentMemoryContext );
25362605
2606+ pfree (result_text );
2607+
25372608if (end_posn == 0 )
25382609break ;
2539- start_posn = end_posn + fldsep_len ;
2610+
2611+ start_posn = end_posn ;
2612+ start_ptr += chunk_len ;
2613+ start_posn += fldsep_len ;
2614+ start_ptr += charlen_to_bytelen (start_ptr ,fldsep_len );
25402615}
25412616
25422617text_position_cleanup (& state );