NotificationsYou must be signed in to change notification settings
Fork5
Star26

Commit a5cf12e

committed

Fix performance issues in replace_text(), replace_text_regexp(), and

text_to_array(): they all had O(N^2) behavior on long input strings in multibyte encodings, because of repeated rescanning of the input text to identify substrings whose positions/lengths were computed in characters instead of bytes. Fix by tracking the current source position as a char pointer as well as a character-count. Also avoid some unnecessary palloc operations. text_to_array() also leaked memory intracall due to failure to pfree temporary strings. Per gripe from Tatsuo Ishii.

1 parent 9d6f263, commit a5cf12e (Copy full SHA for a5cf12e)

File tree

1 file changed

+131

-56

lines changed

src/backend/utils/adt
- varlena.c

1 file changed

+131

-56

lines changed

`‎src/backend/utils/adt/varlena.c`

Lines changed: 131 additions & 56 deletions

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`*`
`9`	`9`	`*`
`10`	`10`	`* IDENTIFICATION`
`11`		`- * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $`
	`11`	`+ * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.153 2006/11/08 19:22:25 tgl Exp $`
`12`	`12`	`*`
`13`	`13`	`*-------------------------------------------------------------------------`
`14`	`14`	`*/`
`@@ -21,6 +21,7 @@`
`21`	`21`	`#include"catalog/pg_type.h"`
`22`	`22`	`#include"libpq/md5.h"`
`23`	`23`	`#include"libpq/pqformat.h"`
	`24`	`+#include"miscadmin.h"`
`24`	`25`	`#include"parser/scansup.h"`
`25`	`26`	`#include"regex/regex.h"`
`26`	`27`	`#include"utils/builtins.h"`
`@@ -477,6 +478,32 @@ textcat(PG_FUNCTION_ARGS)`
`477`	`478`	`PG_RETURN_TEXT_P(result);`
`478`	`479`	`}`
`479`	`480`
	`481`	`+/*`
	`482`	`+ * charlen_to_bytelen()`
	`483`	`+ * Compute the number of bytes occupied by n characters starting at p`
	`484`	`+ *`
	`485`	`+ * It is caller's responsibility that there actually are n characters;`
	`486`	`+ * the string need not be null-terminated.`
	`487`	`+ */`
	`488`	`+staticint`
	`489`	`+charlen_to_bytelen(constchar*p,intn)`
	`490`	`+{`
	`491`	`+if (pg_database_encoding_max_length()==1)`
	`492`	`+{`
	`493`	`+/* Optimization for single-byte encodings */`
	`494`	`+returnn;`
	`495`	`+}`
	`496`	`+else`
	`497`	`+{`
	`498`	`+constchar*s;`
	`499`	`+`
	`500`	`+for (s=p;n>0;n--)`
	`501`	`+s+=pg_mblen(s);`
	`502`	`+`
	`503`	`+returns-p;`
	`504`	`+}`
	`505`	`+}`
	`506`	`+`
`480`	`507`	`/*`
`481`	`508`	`* text_substr()`
`482`	`509`	`* Return a substring starting at the specified position.`
`@@ -534,6 +561,8 @@ text_substr_no_len(PG_FUNCTION_ARGS)`
`534`	`561`	`*functions.Note that the argument is passed as a Datum, to indicate that`
`535`	`562`	`*it may still be in compressed/toasted form. We can avoid detoasting all`
`536`	`563`	`*of it in some cases.`
	`564`	`+ *`
	`565`	`+ *The result is always a freshly palloc'd datum.`
`537`	`566`	`*/`
`538`	`567`	`statictext*`
`539`	`568`	`text_substring(Datumstr,int32start,int32length,boollength_not_specified)`
`@@ -649,11 +678,23 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)`
`649`	`678`	`*/`
`650`	`679`	`slice_size= (S1+L1)*eml;`
`651`	`680`	`}`
`652`		`-slice=DatumGetTextPSlice(str,slice_start,slice_size);`
	`681`	`+`
	`682`	`+/*`
	`683`	`+ * If we're working with an untoasted source, no need to do an`
	`684`	`+ * extra copying step.`
	`685`	`+ */`
	`686`	`+if (VARATT_IS_EXTENDED(str))`
	`687`	`+slice=DatumGetTextPSlice(str,slice_start,slice_size);`
	`688`	`+else`
	`689`	`+slice= (text*)DatumGetPointer(str);`
`653`	`690`
`654`	`691`	`/* see if we got back an empty string */`
`655`	`692`	`if ((VARSIZE(slice)-VARHDRSZ)==0)`
	`693`	`+{`
	`694`	`+if (slice!= (text*)DatumGetPointer(str))`
	`695`	`+pfree(slice);`
`656`	`696`	`returnPG_STR_GET_TEXT("");`
	`697`	`+}`
`657`	`698`
`658`	`699`	`/* Now we can get the actual length of the slice in MB characters */`
`659`	`700`	`slice_strlen=pg_mbstrlen_with_len(VARDATA(slice),VARSIZE(slice)-VARHDRSZ);`
`@@ -663,7 +704,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)`
`663`	`704`	`* says to return a zero-length string.`
`664`	`705`	`*/`
`665`	`706`	`if (S1>slice_strlen)`
	`707`	`+{`
	`708`	`+if (slice!= (text*)DatumGetPointer(str))`
	`709`	`+pfree(slice);`
`666`	`710`	`returnPG_STR_GET_TEXT("");`
	`711`	`+}`
`667`	`712`
`668`	`713`	`/*`
`669`	`714`	`* Adjust L1 and E1 now that we know the slice string length. Again`
`@@ -695,6 +740,9 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)`
`695`	`740`	`VARATT_SIZEP(ret)=VARHDRSZ+ (p-s);`
`696`	`741`	`memcpy(VARDATA(ret),s, (p-s));`
`697`	`742`
	`743`	`+if (slice!= (text*)DatumGetPointer(str))`
	`744`	`+pfree(slice);`
	`745`	`+`
`698`	`746`	`returnret;`
`699`	`747`	`}`
`700`	`748`	`else`
`@@ -2076,10 +2124,11 @@ replace_text(PG_FUNCTION_ARGS)`
`2076`	`2124`	`intsrc_text_len=TEXTLEN(src_text);`
`2077`	`2125`	`intfrom_sub_text_len=TEXTLEN(from_sub_text);`
`2078`	`2126`	`TextPositionStatestate;`
`2079`		`-text*chunk_text;`
`2080`	`2127`	`text*ret_text;`
`2081`	`2128`	`intstart_posn;`
`2082`	`2129`	`intcurr_posn;`
	`2130`	`+intchunk_len;`
	`2131`	`+char*start_ptr;`
`2083`	`2132`	`StringInfoDatastr;`
`2084`	`2133`
`2085`	`2134`	`if (src_text_len==0\|\|from_sub_text_len==0)`
`@@ -2097,31 +2146,31 @@ replace_text(PG_FUNCTION_ARGS)`
`2097`	`2146`	`PG_RETURN_TEXT_P(src_text);`
`2098`	`2147`	`}`
`2099`	`2148`
	`2149`	`+/* start_ptr points to the start_posn'th character of src_text */`
	`2150`	`+start_ptr= (char*)VARDATA(src_text);`
	`2151`	`+`
`2100`	`2152`	`initStringInfo(&str);`
`2101`	`2153`
`2102`	`2154`	`do`
`2103`	`2155`	`{`
`2104`		`-chunk_text=text_substring(PointerGetDatum(src_text),`
`2105`		`-start_posn,`
`2106`		`-curr_posn-start_posn,`
`2107`		`-false);`
`2108`		`-appendStringInfoText(&str,chunk_text);`
`2109`		`-pfree(chunk_text);`
	`2156`	`+/* copy the data skipped over by last text_position_next() */`
	`2157`	`+chunk_len=charlen_to_bytelen(start_ptr,curr_posn-start_posn);`
	`2158`	`+appendBinaryStringInfo(&str,start_ptr,chunk_len);`
`2110`	`2159`
`2111`	`2160`	`appendStringInfoText(&str,to_sub_text);`
`2112`	`2161`
`2113`		`-start_posn=curr_posn+from_sub_text_len;`
	`2162`	`+start_posn=curr_posn;`
	`2163`	`+start_ptr+=chunk_len;`
	`2164`	`+start_posn+=from_sub_text_len;`
	`2165`	`+start_ptr+=charlen_to_bytelen(start_ptr,from_sub_text_len);`
	`2166`	`+`
`2114`	`2167`	`curr_posn=text_position_next(start_posn,&state);`
`2115`	`2168`	`}`
`2116`	`2169`	`while (curr_posn>0);`
`2117`	`2170`
`2118`		`-/* copy trailing chunk */`
`2119`		`-chunk_text=text_substring(PointerGetDatum(src_text),`
`2120`		`-start_posn,`
`2121`		`--1,`
`2122`		`-true);`
`2123`		`-appendStringInfoText(&str,chunk_text);`
`2124`		`-pfree(chunk_text);`
	`2171`	`+/* copy trailing data */`
	`2172`	`+chunk_len= ((char*)src_text+VARSIZE(src_text))-start_ptr;`
	`2173`	`+appendBinaryStringInfo(&str,start_ptr,chunk_len);`
`2125`	`2174`
`2126`	`2175`	`text_position_cleanup(&state);`
`2127`	`2176`
`@@ -2166,11 +2215,13 @@ check_replace_text_has_escape_char(const text *replace_text)`
`2166`	`2215`	`* appendStringInfoRegexpSubstr`
`2167`	`2216`	`*`
`2168`	`2217`	`* Append replace_text to str, substituting regexp back references for`
`2169`		`- * \n escapes.`
	`2218`	`+ * \n escapes. start_ptr is the start of the match in the source string,`
	`2219`	`+ * at logical character position data_pos.`
`2170`	`2220`	`*/`
`2171`	`2221`	`staticvoid`
`2172`	`2222`	`appendStringInfoRegexpSubstr(StringInfostr,text*replace_text,`
`2173`		`-regmatch_tpmatch,textsrc_text)`
	`2223`	`+regmatch_t*pmatch,`
	`2224`	`+char*start_ptr,intdata_pos)`
`2174`	`2225`	`{`
`2175`	`2226`	`constchar*p=VARDATA(replace_text);`
`2176`	`2227`	`constchar*p_end=p+ (VARSIZE(replace_text)-VARHDRSZ);`
`@@ -2247,16 +2298,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,`
`2247`	`2298`	`if (so!=-1&&eo!=-1)`
`2248`	`2299`	`{`
`2249`	`2300`	`/*`
`2250`		`- * Copy the text that is back reference of regexp.Because so and`
`2251`		`- * eo are counted in characters not bytes, it's easiest to use`
`2252`		`- * text_substring to pull out the correct chunk of text.`
	`2301`	`+ * Copy the text that is back reference of regexp.Note so and`
	`2302`	`+ * eo are counted in characters not bytes.`
`2253`	`2303`	`*/`
`2254`		`-text*append_text;`
`2255`		`-`
`2256`		`-append_text=text_substring(PointerGetDatum(src_text),`
`2257`		`-so+1, (eo-so), false);`
`2258`		`-appendStringInfoText(str,append_text);`
`2259`		`-pfree(append_text);`
	`2304`	`+char*chunk_start;`
	`2305`	`+intchunk_len;`
	`2306`	`+`
	`2307`	`+Assert(so >=data_pos);`
	`2308`	`+chunk_start=start_ptr;`
	`2309`	`+chunk_start+=charlen_to_bytelen(chunk_start,so-data_pos);`
	`2310`	`+chunk_len=charlen_to_bytelen(chunk_start,eo-so);`
	`2311`	`+appendBinaryStringInfo(str,chunk_start,chunk_len);`
`2260`	`2312`	`}`
`2261`	`2313`	`}`
`2262`	`2314`	`}`
`@@ -2284,6 +2336,7 @@ replace_text_regexp(text src_text, void regexp,`
`2284`	`2336`	`size_tdata_len;`
`2285`	`2337`	`intsearch_start;`
`2286`	`2338`	`intdata_pos;`
	`2339`	`+char*start_ptr;`
`2287`	`2340`	`boolhave_escape;`
`2288`	`2341`
`2289`	`2342`	`initStringInfo(&buf);`
`@@ -2295,10 +2348,17 @@ replace_text_regexp(text src_text, void regexp,`
`2295`	`2348`	`/* Check whether replace_text has escape char. */`
`2296`	`2349`	`have_escape=check_replace_text_has_escape_char(replace_text);`
`2297`	`2350`
`2298`		`-for (search_start=data_pos=0;search_start <=data_len;)`
	`2351`	`+/* start_ptr points to the data_pos'th character of src_text */`
	`2352`	`+start_ptr= (char*)VARDATA(src_text);`
	`2353`	`+data_pos=0;`
	`2354`	`+`
	`2355`	`+search_start=0;`
	`2356`	`+while (search_start <=data_len)`
`2299`	`2357`	`{`
`2300`	`2358`	`intregexec_result;`
`2301`	`2359`
	`2360`	`+CHECK_FOR_INTERRUPTS();`
	`2361`	`+`
`2302`	`2362`	`regexec_result=pg_regexec(re,`
`2303`	`2363`	`data,`
`2304`	`2364`	`data_len,`
`@@ -2322,32 +2382,38 @@ replace_text_regexp(text src_text, void regexp,`
`2322`	`2382`	`}`
`2323`	`2383`
`2324`	`2384`	`/*`
`2325`		`- * Copy the text to the left of the match position. Because we are`
`2326`		`- * working with character not byte indexes, it's easiest to use`
`2327`		`- * text_substring to pull out the needed data.`
	`2385`	`+ * Copy the text to the left of the match position. Note we are`
	`2386`	`+ * given character not byte indexes.`
`2328`	`2387`	`*/`
`2329`	`2388`	`if (pmatch[0].rm_so-data_pos>0)`
`2330`	`2389`	`{`
`2331`		`-text*left_text;`
`2332`		`-`
`2333`		`-left_text=text_substring(PointerGetDatum(src_text),`
`2334`		`-data_pos+1,`
`2335`		`-pmatch[0].rm_so-data_pos,`
`2336`		`- false);`
`2337`		`-appendStringInfoText(&buf,left_text);`
`2338`		`-pfree(left_text);`
	`2390`	`+intchunk_len;`
	`2391`	`+`
	`2392`	`+chunk_len=charlen_to_bytelen(start_ptr,`
	`2393`	`+pmatch[0].rm_so-data_pos);`
	`2394`	`+appendBinaryStringInfo(&buf,start_ptr,chunk_len);`
	`2395`	`+/*`
	`2396`	`+ * Advance start_ptr over that text, to avoid multiple rescans`
	`2397`	`+ * of it if the replace_text contains multiple back-references.`
	`2398`	`+ */`
	`2399`	`+start_ptr+=chunk_len;`
	`2400`	`+data_pos=pmatch[0].rm_so;`
`2339`	`2401`	`}`
`2340`	`2402`
`2341`	`2403`	`/*`
`2342`	`2404`	`* Copy the replace_text. Process back references when the`
`2343`	`2405`	`* replace_text has escape characters.`
`2344`	`2406`	`*/`
`2345`	`2407`	`if (have_escape)`
`2346`		`-appendStringInfoRegexpSubstr(&buf,replace_text,pmatch,src_text);`
	`2408`	`+appendStringInfoRegexpSubstr(&buf,replace_text,pmatch,`
	`2409`	`+start_ptr,data_pos);`
`2347`	`2410`	`else`
`2348`	`2411`	`appendStringInfoText(&buf,replace_text);`
`2349`	`2412`
`2350`		`-search_start=data_pos=pmatch[0].rm_eo;`
	`2413`	`+/* Advance start_ptr and data_pos over the matched text. */`
	`2414`	`+start_ptr+=charlen_to_bytelen(start_ptr,`
	`2415`	`+pmatch[0].rm_eo-data_pos);`
	`2416`	`+data_pos=pmatch[0].rm_eo;`
`2351`	`2417`
`2352`	`2418`	`/*`
`2353`	`2419`	`* When global option is off, replace the first instance only.`
`@@ -2358,6 +2424,7 @@ replace_text_regexp(text src_text, void regexp,`
`2358`	`2424`	`/*`
`2359`	`2425`	`* Search from next character when the matching text is zero width.`
`2360`	`2426`	`*/`
	`2427`	`+search_start=data_pos;`
`2361`	`2428`	`if (pmatch[0].rm_so==pmatch[0].rm_eo)`
`2362`	`2429`	`search_start++;`
`2363`	`2430`	`}`
`@@ -2367,12 +2434,10 @@ replace_text_regexp(text src_text, void regexp,`
`2367`	`2434`	`*/`
`2368`	`2435`	`if (data_pos<data_len)`
`2369`	`2436`	`{`
`2370`		`-text*right_text;`
	`2437`	`+intchunk_len;`
`2371`	`2438`
`2372`		`-right_text=text_substring(PointerGetDatum(src_text),`
`2373`		`-data_pos+1,-1, true);`
`2374`		`-appendStringInfoText(&buf,right_text);`
`2375`		`-pfree(right_text);`
	`2439`	`+chunk_len= ((char*)src_text+VARSIZE(src_text))-start_ptr;`
	`2440`	`+appendBinaryStringInfo(&buf,start_ptr,chunk_len);`
`2376`	`2441`	`}`
`2377`	`2442`
`2378`	`2443`	`ret_text=PG_STR_GET_TEXT(buf.data);`
`@@ -2488,6 +2553,8 @@ text_to_array(PG_FUNCTION_ARGS)`
`2488`	`2553`	`intfldnum;`
`2489`	`2554`	`intstart_posn;`
`2490`	`2555`	`intend_posn;`
	`2556`	`+intchunk_len;`
	`2557`	`+char*start_ptr;`
`2491`	`2558`	`text*result_text;`
`2492`	`2559`	`ArrayBuildState*astate=NULL;`
`2493`	`2560`
`@@ -2506,37 +2573,45 @@ text_to_array(PG_FUNCTION_ARGS)`
`2506`	`2573`	`text_position_setup(inputstring,fldsep,&state);`
`2507`	`2574`
`2508`	`2575`	`start_posn=1;`
	`2576`	`+/* start_ptr points to the start_posn'th character of inputstring */`
	`2577`	`+start_ptr= (char*)VARDATA(inputstring);`
	`2578`	`+`
`2509`	`2579`	`for (fldnum=1;;fldnum++)/* field number is 1 based */`
`2510`	`2580`	`{`
`2511`	`2581`	`end_posn=text_position_next(start_posn,&state);`
`2512`	`2582`
`2513`	`2583`	`if (end_posn==0)`
`2514`	`2584`	`{`
`2515`	`2585`	`/* fetch last field */`
`2516`		`-result_text=text_substring(PointerGetDatum(inputstring),`
`2517`		`-start_posn,`
`2518`		`--1,`
`2519`		`- true);`
	`2586`	`+chunk_len= ((char*)inputstring+VARSIZE(inputstring))-start_ptr;`
`2520`	`2587`	`}`
`2521`	`2588`	`else`
`2522`	`2589`	`{`
`2523`	`2590`	`/* fetch non-last field */`
`2524`		`-result_text=text_substring(PointerGetDatum(inputstring),`
`2525`		`-start_posn,`
`2526`		`-end_posn-start_posn,`
`2527`		`- false);`
	`2591`	`+chunk_len=charlen_to_bytelen(start_ptr,end_posn-start_posn);`
`2528`	`2592`	`}`
`2529`	`2593`
	`2594`	`+/* must build a temp text datum to pass to accumArrayResult */`
	`2595`	`+result_text= (text*)palloc(VARHDRSZ+chunk_len);`
	`2596`	`+VARATT_SIZEP(result_text)=VARHDRSZ+chunk_len;`
	`2597`	`+memcpy(VARDATA(result_text),start_ptr,chunk_len);`
	`2598`	`+`
`2530`	`2599`	`/* stash away this field */`
`2531`	`2600`	`astate=accumArrayResult(astate,`
`2532`	`2601`	`PointerGetDatum(result_text),`
`2533`	`2602`	`false,`
`2534`	`2603`	`TEXTOID,`
`2535`	`2604`	`CurrentMemoryContext);`
`2536`	`2605`
	`2606`	`+pfree(result_text);`
	`2607`	`+`
`2537`	`2608`	`if (end_posn==0)`
`2538`	`2609`	`break;`
`2539`		`-start_posn=end_posn+fldsep_len;`
	`2610`	`+`
	`2611`	`+start_posn=end_posn;`
	`2612`	`+start_ptr+=chunk_len;`
	`2613`	`+start_posn+=fldsep_len;`
	`2614`	`+start_ptr+=charlen_to_bytelen(start_ptr,fldsep_len);`
`2540`	`2615`	`}`
`2541`	`2616`
`2542`	`2617`	`text_position_cleanup(&state);`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit a5cf12e

File tree

1 file changed

1 file changed

`‎src/backend/utils/adt/varlena.c`

0 commit comments