Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commite9e535d

Browse files
committed
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions toPOSIX-style regular expressions did not consider that square bracketscan be nested. For example, in an expression like [[:alpha:]%_], thelogic replaced the placeholders '_' and '%' but it should not.This commit fixes the conversion logic by tracking the nesting level ofsquare brackets marking character class areas, while considering thatin expressions like []] or [^]] the first closing square bracket is aregular character. Multiple tests are added to show how the conversionsshould or should not apply applied while in a character class area, withspecific cases added for all the characters converted outside characterclasses like an opening parenthesis '(', dollar sign '$', etc.Author: Laurenz Albe <laurenz.albe@cybertec.at>Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>Reviewed-by: Michael Paquier <michael@paquier.xyz>Discussion:https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.atBackpatch-through: 13
1 parenta7d3e32 commite9e535d

File tree

3 files changed

+114
-6
lines changed

3 files changed

+114
-6
lines changed

‎src/backend/utils/adt/regexp.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -774,8 +774,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
774774
intplen,
775775
elen;
776776
boolafterescape= false;
777-
boolincharclass= false;
778777
intnquotes=0;
778+
intcharclass_depth=0;/* Nesting level of character classes,
779+
* encompassed by square brackets */
780+
intcharclass_start=0;/* State of the character class start,
781+
* for carets */
779782

780783
p=VARDATA_ANY(pat_text);
781784
plen=VARSIZE_ANY_EXHDR(pat_text);
@@ -905,7 +908,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
905908
/* fast path */
906909
if (afterescape)
907910
{
908-
if (pchar=='"'&&!incharclass)/* escape-double-quote? */
911+
if (pchar=='"'&&charclass_depth<1)/* escape-double-quote? */
909912
{
910913
/* emit appropriate part separator, per notes above */
911914
if (nquotes==0)
@@ -954,18 +957,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
954957
/* SQL escape character; do not send to output */
955958
afterescape= true;
956959
}
957-
elseif (incharclass)
960+
elseif (charclass_depth>0)
958961
{
959962
if (pchar=='\\')
960963
*r++='\\';
961964
*r++=pchar;
962-
if (pchar==']')
963-
incharclass= false;
965+
966+
/*
967+
* Ignore a closing bracket at the start of a character class.
968+
* Such a bracket is taken literally rather than closing the
969+
* class. "charclass_start" is 1 right at the beginning of a
970+
* class and 2 after an initial caret.
971+
*/
972+
if (pchar==']'&&charclass_start>2)
973+
charclass_depth--;
974+
elseif (pchar=='[')
975+
charclass_depth++;
976+
977+
/*
978+
* If there is a caret right after the opening bracket, it negates
979+
* the character class, but a following closing bracket should
980+
* still be treated as a normal character. That holds only for
981+
* the first caret, so only the values 1 and 2 mean that closing
982+
* brackets should be taken literally.
983+
*/
984+
if (pchar=='^')
985+
charclass_start++;
986+
else
987+
charclass_start=3;/* definitely past the start */
964988
}
965989
elseif (pchar=='[')
966990
{
991+
/* start of a character class */
967992
*r++=pchar;
968-
incharclass= true;
993+
charclass_depth++;
994+
charclass_start=1;
969995
}
970996
elseif (pchar=='%')
971997
{

‎src/test/regress/expected/strings.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
596596
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
597597
ERROR: invalid escape string
598598
HINT: Escape string must be empty or one character.
599+
-- Characters that should be left alone in character classes when a
600+
-- SIMILAR TO regexp pattern is converted to POSIX style.
601+
-- Underscore "_"
602+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
603+
QUERY PLAN
604+
------------------------------------------------
605+
Seq Scan on text_tbl
606+
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
607+
(2 rows)
608+
609+
-- Percentage "%"
610+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
611+
QUERY PLAN
612+
--------------------------------------------------
613+
Seq Scan on text_tbl
614+
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
615+
(2 rows)
616+
617+
-- Dot "."
618+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
619+
QUERY PLAN
620+
--------------------------------------------------
621+
Seq Scan on text_tbl
622+
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
623+
(2 rows)
624+
625+
-- Dollar "$"
626+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
627+
QUERY PLAN
628+
--------------------------------------------------
629+
Seq Scan on text_tbl
630+
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
631+
(2 rows)
632+
633+
-- Opening parenthesis "("
634+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
635+
ERROR: invalid regular expression: parentheses () not balanced
636+
-- Caret "^"
637+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
638+
QUERY PLAN
639+
------------------------------------------------------------------------
640+
Seq Scan on text_tbl
641+
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
642+
(2 rows)
643+
644+
-- Closing square bracket "]" at the beginning of character class
645+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
646+
QUERY PLAN
647+
------------------------------------------------
648+
Seq Scan on text_tbl
649+
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
650+
(2 rows)
651+
652+
-- Closing square bracket effective after two carets at the beginning
653+
-- of character class.
654+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
655+
QUERY PLAN
656+
---------------------------------------
657+
Seq Scan on text_tbl
658+
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
659+
(2 rows)
660+
599661
-- Test backslash escapes in regexp_replace's replacement string
600662
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
601663
regexp_replace

‎src/test/regress/sql/strings.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
193193
SELECT'abcdefg' SIMILAR TO'_bcd%' ESCAPENULLASnull;
194194
SELECT'abcdefg' SIMILAR TO'_bcd#%' ESCAPE'##'AS error;
195195

196+
-- Characters that should be left alone in character classes when a
197+
-- SIMILAR TO regexp pattern is converted to POSIX style.
198+
-- Underscore "_"
199+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'_[_[:alpha:]_]_';
200+
-- Percentage "%"
201+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'%[%[:alnum:]%]%';
202+
-- Dot "."
203+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'.[.[:alnum:].].';
204+
-- Dollar "$"
205+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'$[$[:alnum:]$]$';
206+
-- Opening parenthesis "("
207+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'([([:alnum:](](';
208+
-- Caret "^"
209+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
210+
-- Closing square bracket "]" at the beginning of character class
211+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'[]%][^]%][^%]%';
212+
-- Closing square bracket effective after two carets at the beginning
213+
-- of character class.
214+
EXPLAIN (COSTS OFF)SELECT*FROM TEXT_TBLWHERE f1 SIMILAR TO'[^^]^';
215+
196216
-- Test backslash escapes in regexp_replace's replacement string
197217
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1)\\2-\\3');
198218
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y','g');

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp