Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a5cf12e

Browse files
committed
Fix performance issues in replace_text(), replace_text_regexp(), and
text_to_array(): they all had O(N^2) behavior on long input strings in multibyte encodings, because of repeated rescanning of the input text to identify substrings whose positions/lengths were computed in characters instead of bytes. Fix by tracking the current source position as a char pointer as well as a character-count. Also avoid some unnecessary palloc operations. text_to_array() also leaked memory intracall due to failure to pfree temporary strings. Per gripe from Tatsuo Ishii.
1 parent 9d6f263 commit a5cf12e

File tree

1 file changed

+131
-56
lines changed

1 file changed

+131
-56
lines changed

‎src/backend/utils/adt/varlena.c

Lines changed: 131 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.153 2006/11/08 19:22:25 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -21,6 +21,7 @@
2121
#include"catalog/pg_type.h"
2222
#include"libpq/md5.h"
2323
#include"libpq/pqformat.h"
24+
#include"miscadmin.h"
2425
#include"parser/scansup.h"
2526
#include"regex/regex.h"
2627
#include"utils/builtins.h"
@@ -477,6 +478,32 @@ textcat(PG_FUNCTION_ARGS)
477478
PG_RETURN_TEXT_P(result);
478479
}
479480

481+
/*
482+
* charlen_to_bytelen()
483+
*Compute the number of bytes occupied by n characters starting at *p
484+
*
485+
* It is caller's responsibility that there actually are n characters;
486+
* the string need not be null-terminated.
487+
*/
488+
staticint
489+
charlen_to_bytelen(constchar*p,intn)
490+
{
491+
if (pg_database_encoding_max_length()==1)
492+
{
493+
/* Optimization for single-byte encodings */
494+
returnn;
495+
}
496+
else
497+
{
498+
constchar*s;
499+
500+
for (s=p;n>0;n--)
501+
s+=pg_mblen(s);
502+
503+
returns-p;
504+
}
505+
}
506+
480507
/*
481508
* text_substr()
482509
* Return a substring starting at the specified position.
@@ -534,6 +561,8 @@ text_substr_no_len(PG_FUNCTION_ARGS)
534561
*functions.Note that the argument is passed as a Datum, to indicate that
535562
*it may still be in compressed/toasted form. We can avoid detoasting all
536563
*of it in some cases.
564+
*
565+
*The result is always a freshly palloc'd datum.
537566
*/
538567
statictext*
539568
text_substring(Datumstr,int32start,int32length,boollength_not_specified)
@@ -649,11 +678,23 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
649678
*/
650679
slice_size= (S1+L1)*eml;
651680
}
652-
slice=DatumGetTextPSlice(str,slice_start,slice_size);
681+
682+
/*
683+
* If we're working with an untoasted source, no need to do an
684+
* extra copying step.
685+
*/
686+
if (VARATT_IS_EXTENDED(str))
687+
slice=DatumGetTextPSlice(str,slice_start,slice_size);
688+
else
689+
slice= (text*)DatumGetPointer(str);
653690

654691
/* see if we got back an empty string */
655692
if ((VARSIZE(slice)-VARHDRSZ)==0)
693+
{
694+
if (slice!= (text*)DatumGetPointer(str))
695+
pfree(slice);
656696
returnPG_STR_GET_TEXT("");
697+
}
657698

658699
/* Now we can get the actual length of the slice in MB characters */
659700
slice_strlen=pg_mbstrlen_with_len(VARDATA(slice),VARSIZE(slice)-VARHDRSZ);
@@ -663,7 +704,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
663704
* says to return a zero-length string.
664705
*/
665706
if (S1>slice_strlen)
707+
{
708+
if (slice!= (text*)DatumGetPointer(str))
709+
pfree(slice);
666710
returnPG_STR_GET_TEXT("");
711+
}
667712

668713
/*
669714
* Adjust L1 and E1 now that we know the slice string length. Again
@@ -695,6 +740,9 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
695740
VARATT_SIZEP(ret)=VARHDRSZ+ (p-s);
696741
memcpy(VARDATA(ret),s, (p-s));
697742

743+
if (slice!= (text*)DatumGetPointer(str))
744+
pfree(slice);
745+
698746
returnret;
699747
}
700748
else
@@ -2076,10 +2124,11 @@ replace_text(PG_FUNCTION_ARGS)
20762124
intsrc_text_len=TEXTLEN(src_text);
20772125
intfrom_sub_text_len=TEXTLEN(from_sub_text);
20782126
TextPositionStatestate;
2079-
text*chunk_text;
20802127
text*ret_text;
20812128
intstart_posn;
20822129
intcurr_posn;
2130+
intchunk_len;
2131+
char*start_ptr;
20832132
StringInfoDatastr;
20842133

20852134
if (src_text_len==0||from_sub_text_len==0)
@@ -2097,31 +2146,31 @@ replace_text(PG_FUNCTION_ARGS)
20972146
PG_RETURN_TEXT_P(src_text);
20982147
}
20992148

2149+
/* start_ptr points to the start_posn'th character of src_text */
2150+
start_ptr= (char*)VARDATA(src_text);
2151+
21002152
initStringInfo(&str);
21012153

21022154
do
21032155
{
2104-
chunk_text=text_substring(PointerGetDatum(src_text),
2105-
start_posn,
2106-
curr_posn-start_posn,
2107-
false);
2108-
appendStringInfoText(&str,chunk_text);
2109-
pfree(chunk_text);
2156+
/* copy the data skipped over by last text_position_next() */
2157+
chunk_len=charlen_to_bytelen(start_ptr,curr_posn-start_posn);
2158+
appendBinaryStringInfo(&str,start_ptr,chunk_len);
21102159

21112160
appendStringInfoText(&str,to_sub_text);
21122161

2113-
start_posn=curr_posn+from_sub_text_len;
2162+
start_posn=curr_posn;
2163+
start_ptr+=chunk_len;
2164+
start_posn+=from_sub_text_len;
2165+
start_ptr+=charlen_to_bytelen(start_ptr,from_sub_text_len);
2166+
21142167
curr_posn=text_position_next(start_posn,&state);
21152168
}
21162169
while (curr_posn>0);
21172170

2118-
/* copy trailing chunk */
2119-
chunk_text=text_substring(PointerGetDatum(src_text),
2120-
start_posn,
2121-
-1,
2122-
true);
2123-
appendStringInfoText(&str,chunk_text);
2124-
pfree(chunk_text);
2171+
/* copy trailing data */
2172+
chunk_len= ((char*)src_text+VARSIZE(src_text))-start_ptr;
2173+
appendBinaryStringInfo(&str,start_ptr,chunk_len);
21252174

21262175
text_position_cleanup(&state);
21272176

@@ -2166,11 +2215,13 @@ check_replace_text_has_escape_char(const text *replace_text)
21662215
* appendStringInfoRegexpSubstr
21672216
*
21682217
* Append replace_text to str, substituting regexp back references for
2169-
* \n escapes.
2218+
* \n escapes. start_ptr is the start of the match in the source string,
2219+
* at logical character position data_pos.
21702220
*/
21712221
staticvoid
21722222
appendStringInfoRegexpSubstr(StringInfostr,text*replace_text,
2173-
regmatch_t*pmatch,text*src_text)
2223+
regmatch_t*pmatch,
2224+
char*start_ptr,intdata_pos)
21742225
{
21752226
constchar*p=VARDATA(replace_text);
21762227
constchar*p_end=p+ (VARSIZE(replace_text)-VARHDRSZ);
@@ -2247,16 +2298,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
22472298
if (so!=-1&&eo!=-1)
22482299
{
22492300
/*
2250-
* Copy the text that is back reference of regexp.Because so and
2251-
* eo are counted in characters not bytes, it's easiest to use
2252-
* text_substring to pull out the correct chunk of text.
2301+
* Copy the text that is back reference of regexp.Note so and
2302+
* eo are counted in characters not bytes.
22532303
*/
2254-
text*append_text;
2255-
2256-
append_text=text_substring(PointerGetDatum(src_text),
2257-
so+1, (eo-so), false);
2258-
appendStringInfoText(str,append_text);
2259-
pfree(append_text);
2304+
char*chunk_start;
2305+
intchunk_len;
2306+
2307+
Assert(so >=data_pos);
2308+
chunk_start=start_ptr;
2309+
chunk_start+=charlen_to_bytelen(chunk_start,so-data_pos);
2310+
chunk_len=charlen_to_bytelen(chunk_start,eo-so);
2311+
appendBinaryStringInfo(str,chunk_start,chunk_len);
22602312
}
22612313
}
22622314
}
@@ -2284,6 +2336,7 @@ replace_text_regexp(text *src_text, void *regexp,
22842336
size_tdata_len;
22852337
intsearch_start;
22862338
intdata_pos;
2339+
char*start_ptr;
22872340
boolhave_escape;
22882341

22892342
initStringInfo(&buf);
@@ -2295,10 +2348,17 @@ replace_text_regexp(text *src_text, void *regexp,
22952348
/* Check whether replace_text has escape char. */
22962349
have_escape=check_replace_text_has_escape_char(replace_text);
22972350

2298-
for (search_start=data_pos=0;search_start <=data_len;)
2351+
/* start_ptr points to the data_pos'th character of src_text */
2352+
start_ptr= (char*)VARDATA(src_text);
2353+
data_pos=0;
2354+
2355+
search_start=0;
2356+
while (search_start <=data_len)
22992357
{
23002358
intregexec_result;
23012359

2360+
CHECK_FOR_INTERRUPTS();
2361+
23022362
regexec_result=pg_regexec(re,
23032363
data,
23042364
data_len,
@@ -2322,32 +2382,38 @@ replace_text_regexp(text *src_text, void *regexp,
23222382
}
23232383

23242384
/*
2325-
* Copy the text to the left of the match position. Because we are
2326-
* working with character not byte indexes, it's easiest to use
2327-
* text_substring to pull out the needed data.
2385+
* Copy the text to the left of the match position. Note we are
2386+
* given character not byte indexes.
23282387
*/
23292388
if (pmatch[0].rm_so-data_pos>0)
23302389
{
2331-
text*left_text;
2332-
2333-
left_text=text_substring(PointerGetDatum(src_text),
2334-
data_pos+1,
2335-
pmatch[0].rm_so-data_pos,
2336-
false);
2337-
appendStringInfoText(&buf,left_text);
2338-
pfree(left_text);
2390+
intchunk_len;
2391+
2392+
chunk_len=charlen_to_bytelen(start_ptr,
2393+
pmatch[0].rm_so-data_pos);
2394+
appendBinaryStringInfo(&buf,start_ptr,chunk_len);
2395+
/*
2396+
* Advance start_ptr over that text, to avoid multiple rescans
2397+
* of it if the replace_text contains multiple back-references.
2398+
*/
2399+
start_ptr+=chunk_len;
2400+
data_pos=pmatch[0].rm_so;
23392401
}
23402402

23412403
/*
23422404
* Copy the replace_text. Process back references when the
23432405
* replace_text has escape characters.
23442406
*/
23452407
if (have_escape)
2346-
appendStringInfoRegexpSubstr(&buf,replace_text,pmatch,src_text);
2408+
appendStringInfoRegexpSubstr(&buf,replace_text,pmatch,
2409+
start_ptr,data_pos);
23472410
else
23482411
appendStringInfoText(&buf,replace_text);
23492412

2350-
search_start=data_pos=pmatch[0].rm_eo;
2413+
/* Advance start_ptr and data_pos over the matched text. */
2414+
start_ptr+=charlen_to_bytelen(start_ptr,
2415+
pmatch[0].rm_eo-data_pos);
2416+
data_pos=pmatch[0].rm_eo;
23512417

23522418
/*
23532419
* When global option is off, replace the first instance only.
@@ -2358,6 +2424,7 @@ replace_text_regexp(text *src_text, void *regexp,
23582424
/*
23592425
* Search from next character when the matching text is zero width.
23602426
*/
2427+
search_start=data_pos;
23612428
if (pmatch[0].rm_so==pmatch[0].rm_eo)
23622429
search_start++;
23632430
}
@@ -2367,12 +2434,10 @@ replace_text_regexp(text *src_text, void *regexp,
23672434
*/
23682435
if (data_pos<data_len)
23692436
{
2370-
text*right_text;
2437+
intchunk_len;
23712438

2372-
right_text=text_substring(PointerGetDatum(src_text),
2373-
data_pos+1,-1, true);
2374-
appendStringInfoText(&buf,right_text);
2375-
pfree(right_text);
2439+
chunk_len= ((char*)src_text+VARSIZE(src_text))-start_ptr;
2440+
appendBinaryStringInfo(&buf,start_ptr,chunk_len);
23762441
}
23772442

23782443
ret_text=PG_STR_GET_TEXT(buf.data);
@@ -2488,6 +2553,8 @@ text_to_array(PG_FUNCTION_ARGS)
24882553
intfldnum;
24892554
intstart_posn;
24902555
intend_posn;
2556+
intchunk_len;
2557+
char*start_ptr;
24912558
text*result_text;
24922559
ArrayBuildState*astate=NULL;
24932560

@@ -2506,37 +2573,45 @@ text_to_array(PG_FUNCTION_ARGS)
25062573
text_position_setup(inputstring,fldsep,&state);
25072574

25082575
start_posn=1;
2576+
/* start_ptr points to the start_posn'th character of inputstring */
2577+
start_ptr= (char*)VARDATA(inputstring);
2578+
25092579
for (fldnum=1;;fldnum++)/* field number is 1 based */
25102580
{
25112581
end_posn=text_position_next(start_posn,&state);
25122582

25132583
if (end_posn==0)
25142584
{
25152585
/* fetch last field */
2516-
result_text=text_substring(PointerGetDatum(inputstring),
2517-
start_posn,
2518-
-1,
2519-
true);
2586+
chunk_len= ((char*)inputstring+VARSIZE(inputstring))-start_ptr;
25202587
}
25212588
else
25222589
{
25232590
/* fetch non-last field */
2524-
result_text=text_substring(PointerGetDatum(inputstring),
2525-
start_posn,
2526-
end_posn-start_posn,
2527-
false);
2591+
chunk_len=charlen_to_bytelen(start_ptr,end_posn-start_posn);
25282592
}
25292593

2594+
/* must build a temp text datum to pass to accumArrayResult */
2595+
result_text= (text*)palloc(VARHDRSZ+chunk_len);
2596+
VARATT_SIZEP(result_text)=VARHDRSZ+chunk_len;
2597+
memcpy(VARDATA(result_text),start_ptr,chunk_len);
2598+
25302599
/* stash away this field */
25312600
astate=accumArrayResult(astate,
25322601
PointerGetDatum(result_text),
25332602
false,
25342603
TEXTOID,
25352604
CurrentMemoryContext);
25362605

2606+
pfree(result_text);
2607+
25372608
if (end_posn==0)
25382609
break;
2539-
start_posn=end_posn+fldsep_len;
2610+
2611+
start_posn=end_posn;
2612+
start_ptr+=chunk_len;
2613+
start_posn+=fldsep_len;
2614+
start_ptr+=charlen_to_bytelen(start_ptr,fldsep_len);
25402615
}
25412616

25422617
text_position_cleanup(&state);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp