Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 329304c

Browse files
committed
Support text position search functions with nondeterministic collations
This allows using text position search functions with nondeterministic
collations. These functions are

- position, strpos
- replace
- split_part
- string_to_array
- string_to_table

which all use common internal infrastructure.

There was previously no internal implementation of this, so it was met
with a not-supported error. This adds the internal implementation and
removes the error.

Unlike with deterministic collations, the search cannot use any
byte-by-byte optimized techniques but has to go substring by
substring. We also need to consider that the found match could have a
different length than the needle and that there could be substrings of
different length matching at a position. In most cases, we need to
find the longest such substring (greedy semantics), but this can be
configured by each caller.

Reviewed-by: Euler Taveira <euler@eulerto.com>
Discussion: https://www.postgresql.org/message-id/flat/582b2613-0900-48ca-8b0d-340c06f4d400@eisentraut.org
1 parent 41336bf, commit 329304c

File tree

3 files changed

+246
-48
lines changed

3 files changed

+246
-48
lines changed

‎src/backend/utils/adt/varlena.c

Lines changed: 88 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,9 @@ typedef struct varlena VarString;
5454
*/
5555
typedefstruct
5656
{
57+
pg_locale_tlocale;/* collation used for substring matching */
5758
boolis_multibyte_char_in_char;/* need to check char boundaries? */
59+
boolgreedy;/* find longest possible substring? */
5860

5961
char*str1;/* haystack string */
6062
char*str2;/* needle string */
@@ -65,7 +67,13 @@ typedef struct
6567
intskiptablemask;/* mask for ANDing with skiptable subscripts */
6668
intskiptable[256];/* skip distance for given mismatched char */
6769

70+
/*
71+
* Note that with nondeterministic collations, the length of the last
72+
* match is not necessarily equal to the length of the "needle" passed in.
73+
*/
6874
char*last_match;/* pointer to last match in 'str1' */
75+
intlast_match_len;/* length of last match */
76+
intlast_match_len_tmp;/* same but for internal use */
6977

7078
/*
7179
* Sometimes we need to convert the byte position of a match to a
@@ -1178,15 +1186,21 @@ text_position(text *t1, text *t2, Oid collid)
11781186
TextPositionStatestate;
11791187
intresult;
11801188

1189+
check_collation_set(collid);
1190+
11811191
/* Empty needle always matches at position 1 */
11821192
if (VARSIZE_ANY_EXHDR(t2)<1)
11831193
return1;
11841194

11851195
/* Otherwise, can't match if haystack is shorter than needle */
1186-
if (VARSIZE_ANY_EXHDR(t1)<VARSIZE_ANY_EXHDR(t2))
1196+
if (VARSIZE_ANY_EXHDR(t1)<VARSIZE_ANY_EXHDR(t2)&&
1197+
pg_newlocale_from_collation(collid)->deterministic)
11871198
return0;
11881199

11891200
text_position_setup(t1,t2,collid,&state);
1201+
/* don't need greedy mode here */
1202+
state.greedy= false;
1203+
11901204
if (!text_position_next(&state))
11911205
result=0;
11921206
else
@@ -1217,18 +1231,17 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12171231
{
12181232
intlen1=VARSIZE_ANY_EXHDR(t1);
12191233
intlen2=VARSIZE_ANY_EXHDR(t2);
1220-
pg_locale_tmylocale;
12211234

12221235
check_collation_set(collid);
12231236

1224-
mylocale=pg_newlocale_from_collation(collid);
1237+
state->locale=pg_newlocale_from_collation(collid);
12251238

1226-
if (!mylocale->deterministic)
1227-
ereport(ERROR,
1228-
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1229-
errmsg("nondeterministic collations are not supported for substring searches")));
1239+
/*
1240+
* Most callers need greedy mode, but some might want to unset this to
1241+
* optimize.
1242+
*/
1243+
state->greedy= true;
12301244

1231-
Assert(len1>0);
12321245
Assert(len2>0);
12331246

12341247
/*
@@ -1264,8 +1277,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12641277
* point in wasting cycles initializing the table. We also choose not to
12651278
* use B-M-H for needles of length 1, since the skip table can't possibly
12661279
* save anything in that case.
1280+
*
1281+
* (With nondeterministic collations, the search is already
1282+
* multibyte-aware, so we don't need this.)
12671283
*/
1268-
if (len1 >=len2&&len2>1)
1284+
if (len1 >=len2&&len2>1&&state->locale->deterministic)
12691285
{
12701286
intsearchlength=len1-len2;
12711287
intskiptablemask;
@@ -1343,7 +1359,7 @@ text_position_next(TextPositionState *state)
13431359

13441360
/* Start from the point right after the previous match. */
13451361
if (state->last_match)
1346-
start_ptr=state->last_match+needle_len;
1362+
start_ptr=state->last_match+state->last_match_len;
13471363
else
13481364
start_ptr=state->str1;
13491365

@@ -1359,7 +1375,7 @@ text_position_next(TextPositionState *state)
13591375
* multi-byte character, we need to verify that the match was at a
13601376
* character boundary, not in the middle of a multi-byte character.
13611377
*/
1362-
if (state->is_multibyte_char_in_char)
1378+
if (state->is_multibyte_char_in_char&&state->locale->deterministic)
13631379
{
13641380
/* Walk one character at a time, until we reach the match. */
13651381

@@ -1387,6 +1403,7 @@ text_position_next(TextPositionState *state)
13871403
}
13881404

13891405
state->last_match=matchptr;
1406+
state->last_match_len=state->last_match_len_tmp;
13901407
return true;
13911408
}
13921409

@@ -1408,7 +1425,62 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
14081425

14091426
Assert(start_ptr >=haystack&&start_ptr <=haystack_end);
14101427

1411-
if (needle_len==1)
1428+
state->last_match_len_tmp=needle_len;
1429+
1430+
if (!state->locale->deterministic)
1431+
{
1432+
/*
1433+
* With a nondeterministic collation, we have to use an unoptimized
1434+
* route. We walk through the haystack and see if at each position
1435+
* there is a substring of the remaining string that is equal to the
1436+
* needle under the given collation.
1437+
*
1438+
* Note, the found substring could have a different length than the
1439+
* needle, including being empty. Callers that want to skip over the
1440+
* found string need to read the length of the found substring from
1441+
* last_match_len rather than just using the length of their needle.
1442+
*
1443+
* Most callers will require "greedy" semantics, meaning that we need
1444+
* to find the longest such substring, not the shortest. For callers
1445+
* that don't need greedy semantics, we can finish on the first match.
1446+
*/
1447+
constchar*result_hptr=NULL;
1448+
1449+
hptr=start_ptr;
1450+
while (hptr<haystack_end)
1451+
{
1452+
/*
1453+
* First check the common case that there is a match in the
1454+
* haystack of exactly the length of the needle.
1455+
*/
1456+
if (!state->greedy&&
1457+
haystack_end-hptr >=needle_len&&
1458+
pg_strncoll(hptr,needle_len,needle,needle_len,state->locale)==0)
1459+
return (char*)hptr;
1460+
1461+
/*
1462+
* Else check if any of the possible substrings starting at hptr
1463+
* are equal to the needle.
1464+
*/
1465+
for (constchar*test_end=hptr;test_end<haystack_end;test_end+=pg_mblen(test_end))
1466+
{
1467+
if (pg_strncoll(hptr, (test_end-hptr),needle,needle_len,state->locale)==0)
1468+
{
1469+
state->last_match_len_tmp= (test_end-hptr);
1470+
result_hptr=hptr;
1471+
if (!state->greedy)
1472+
break;
1473+
}
1474+
}
1475+
if (result_hptr)
1476+
break;
1477+
1478+
hptr+=pg_mblen(hptr);
1479+
}
1480+
1481+
return (char*)result_hptr;
1482+
}
1483+
elseif (needle_len==1)
14121484
{
14131485
/* No point in using B-M-H for a one-character needle */
14141486
charnchar=*needle;
@@ -4055,7 +4127,7 @@ replace_text(PG_FUNCTION_ARGS)
40554127

40564128
appendStringInfoText(&str,to_sub_text);
40574129

4058-
start_ptr=curr_ptr+from_sub_text_len;
4130+
start_ptr=curr_ptr+state.last_match_len;
40594131

40604132
found=text_position_next(&state);
40614133
if (found)
@@ -4445,7 +4517,7 @@ split_part(PG_FUNCTION_ARGS)
44454517
/* special case of last field does not require an extra pass */
44464518
if (fldnum==-1)
44474519
{
4448-
start_ptr=text_position_get_match_ptr(&state)+fldsep_len;
4520+
start_ptr=text_position_get_match_ptr(&state)+state.last_match_len;
44494521
end_ptr=VARDATA_ANY(inputstring)+inputstring_len;
44504522
text_position_cleanup(&state);
44514523
PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
@@ -4475,7 +4547,7 @@ split_part(PG_FUNCTION_ARGS)
44754547
while (found&&--fldnum>0)
44764548
{
44774549
/* identify bounds of next field */
4478-
start_ptr=end_ptr+fldsep_len;
4550+
start_ptr=end_ptr+state.last_match_len;
44794551
found=text_position_next(&state);
44804552
if (found)
44814553
end_ptr=text_position_get_match_ptr(&state);
@@ -4691,7 +4763,7 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
46914763
if (!found)
46924764
break;
46934765

4694-
start_ptr=end_ptr+fldsep_len;
4766+
start_ptr=end_ptr+state.last_match_len;
46954767
}
46964768

46974769
text_position_cleanup(&state);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp