Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 1e16a81

Browse files
committed
Teach regular expression operators to honor collations.
This involves getting the character classification and case-folding functions in the regex library to use the collations infrastructure. Most of this work had been done already in connection with the upper/lower and LIKE logic, so it was a simple matter of transposition. While at it, split out these functions into a separate source file regc_pg_locale.c, so that they can be correctly labeled with the Postgres project's license rather than the Scriptics license. These functions are 100% Postgres-written code whereas what remains in regc_locale.c is still mostly not ours, so lumping them both under the same copyright notice was getting more and more misleading.
1 parent 210f95f commit 1e16a81

File tree

12 files changed

+819
-192
lines changed

12 files changed

+819
-192
lines changed

‎doc/src/sgml/charset.sgml

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -221,17 +221,21 @@ initdb --locale=sv_SE
221221

222222
<listitem>
223223
<para>
224-
The ability to use indexes with <literal>LIKE</> clauses
225-
<indexterm><primary>LIKE</><secondary>and locales</></indexterm>
224+
The <function>upper</>, <function>lower</>, and <function>initcap</>
225+
functions
226+
<indexterm><primary>upper</><secondary>and locales</></indexterm>
227+
<indexterm><primary>lower</><secondary>and locales</></indexterm>
226228
</para>
227229
</listitem>
228230

229231
<listitem>
230232
<para>
231-
The <function>upper</>, <function>lower</>, and <function>initcap</>
232-
functions
233-
<indexterm><primary>upper</><secondary>and locales</></indexterm>
234-
<indexterm><primary>lower</><secondary>and locales</></indexterm>
233+
Pattern matching operators (<literal>LIKE</>, <literal>SIMILAR TO</>,
234+
and POSIX-style regular expressions); locales affect both case
235+
insensitive matching and the classification of characters by
236+
character-class regular expressions
237+
<indexterm><primary>LIKE</><secondary>and locales</></indexterm>
238+
<indexterm><primary>regular expressions</><secondary>and locales</></indexterm>
235239
</para>
236240
</listitem>
237241

@@ -241,6 +245,12 @@ initdb --locale=sv_SE
241245
<indexterm><primary>to_char</><secondary>and locales</></indexterm>
242246
</para>
243247
</listitem>
248+
249+
<listitem>
250+
<para>
251+
The ability to use indexes with <literal>LIKE</> clauses
252+
</para>
253+
</listitem>
244254
</itemizedlist>
245255
</para>
246256

@@ -319,8 +329,8 @@ initdb --locale=sv_SE
319329
<indexterm zone="collation"><primary>collation</></>
320330

321331
<para>
322-
The collation feature allows specifying the sort order and certain
323-
other locale aspects of data per-column, or even per-operation.
332+
The collation feature allows specifying the sort order and character
333+
classification behavior of data per-column, or even per-operation.
324334
This alleviates the restriction that the
325335
<symbol>LC_COLLATE</symbol> and <symbol>LC_CTYPE</symbol> settings
326336
of a database cannot be changed after its creation.
@@ -351,8 +361,8 @@ initdb --locale=sv_SE
351361
</para>
352362

353363
<para>
354-
When the database system has to perform an ordering or a
355-
comparison, it uses the collation of the input expression. This
364+
When the database system has to perform an ordering or a character
365+
classification, it uses the collation of the input expression. This
356366
happens, for example, with <literal>ORDER BY</literal> clauses
357367
and function or operator calls such as <literal>&lt;</literal>.
358368
The collation to apply for an <literal>ORDER BY</literal> clause
@@ -361,7 +371,8 @@ initdb --locale=sv_SE
361371
below. In addition to comparison operators, collations are taken into
362372
account by functions that convert between lower and upper case
363373
letters, such as <function>lower</>, <function>upper</>, and
364-
<function>initcap</>.
374+
<function>initcap</>; by pattern matching operators; and by
375+
<function>to_char</> and related functions.
365376
</para>
366377

367378
<para>

‎src/backend/libpq/hba.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include<arpa/inet.h>
2626
#include<unistd.h>
2727

28+
#include"catalog/pg_collation.h"
2829
#include"libpq/ip.h"
2930
#include"libpq/libpq.h"
3031
#include"regex/regex.h"
@@ -1781,7 +1782,7 @@ parse_ident_usermap(List *line, int line_number, const char *usermap_name,
17811782
* XXX: Major room for optimization: regexps could be compiled when
17821783
* the file is loaded and then re-used in every connection.
17831784
*/
1784-
r=pg_regcomp(&re,wstr,wlen,REG_ADVANCED);
1785+
r=pg_regcomp(&re,wstr,wlen,REG_ADVANCED,C_COLLATION_OID);
17851786
if (r)
17861787
{
17871788
charerrstr[100];

‎src/backend/regex/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o
1717
include$(top_srcdir)/src/backend/common.mk
1818

1919
# mark inclusion dependencies between .c files explicitly
20-
regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
20+
regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c\
21+
regc_locale.c regc_pg_locale.c
2122

2223
regexec.o: regexec.c rege_dfa.c

‎src/backend/regex/regc_locale.c

Lines changed: 0 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -350,171 +350,6 @@ static const struct cname
350350
};
351351

352352

353-
/*
354-
* ctype functions adapted to work on pg_wchar (a/k/a chr)
355-
*
356-
* When working in UTF8 encoding, we use the <wctype.h> functions if
357-
* available. This assumes that every platform uses Unicode codepoints
358-
* directly as the wchar_t representation of Unicode. On some platforms
359-
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
360-
*
361-
* In all other encodings, we use the <ctype.h> functions for pg_wchar
362-
* values up to 255, and punt for values above that. This is only 100%
363-
* correct in single-byte encodings such as LATINn. However, non-Unicode
364-
* multibyte encodings are mostly Far Eastern character sets for which the
365-
* properties being tested here aren't relevant for higher code values anyway.
366-
*
367-
* NB: the coding here assumes pg_wchar is an unsigned type.
368-
*/
369-
370-
staticint
371-
pg_wc_isdigit(pg_wcharc)
372-
{
373-
#ifdefUSE_WIDE_UPPER_LOWER
374-
if (GetDatabaseEncoding()==PG_UTF8)
375-
{
376-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
377-
returniswdigit((wint_t)c);
378-
}
379-
#endif
380-
return (c <= (pg_wchar)UCHAR_MAX&&isdigit((unsignedchar)c));
381-
}
382-
383-
staticint
384-
pg_wc_isalpha(pg_wcharc)
385-
{
386-
#ifdefUSE_WIDE_UPPER_LOWER
387-
if (GetDatabaseEncoding()==PG_UTF8)
388-
{
389-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
390-
returniswalpha((wint_t)c);
391-
}
392-
#endif
393-
return (c <= (pg_wchar)UCHAR_MAX&&isalpha((unsignedchar)c));
394-
}
395-
396-
staticint
397-
pg_wc_isalnum(pg_wcharc)
398-
{
399-
#ifdefUSE_WIDE_UPPER_LOWER
400-
if (GetDatabaseEncoding()==PG_UTF8)
401-
{
402-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
403-
returniswalnum((wint_t)c);
404-
}
405-
#endif
406-
return (c <= (pg_wchar)UCHAR_MAX&&isalnum((unsignedchar)c));
407-
}
408-
409-
staticint
410-
pg_wc_isupper(pg_wcharc)
411-
{
412-
#ifdefUSE_WIDE_UPPER_LOWER
413-
if (GetDatabaseEncoding()==PG_UTF8)
414-
{
415-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
416-
returniswupper((wint_t)c);
417-
}
418-
#endif
419-
return (c <= (pg_wchar)UCHAR_MAX&&isupper((unsignedchar)c));
420-
}
421-
422-
staticint
423-
pg_wc_islower(pg_wcharc)
424-
{
425-
#ifdefUSE_WIDE_UPPER_LOWER
426-
if (GetDatabaseEncoding()==PG_UTF8)
427-
{
428-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
429-
returniswlower((wint_t)c);
430-
}
431-
#endif
432-
return (c <= (pg_wchar)UCHAR_MAX&&islower((unsignedchar)c));
433-
}
434-
435-
staticint
436-
pg_wc_isgraph(pg_wcharc)
437-
{
438-
#ifdefUSE_WIDE_UPPER_LOWER
439-
if (GetDatabaseEncoding()==PG_UTF8)
440-
{
441-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
442-
returniswgraph((wint_t)c);
443-
}
444-
#endif
445-
return (c <= (pg_wchar)UCHAR_MAX&&isgraph((unsignedchar)c));
446-
}
447-
448-
staticint
449-
pg_wc_isprint(pg_wcharc)
450-
{
451-
#ifdefUSE_WIDE_UPPER_LOWER
452-
if (GetDatabaseEncoding()==PG_UTF8)
453-
{
454-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
455-
returniswprint((wint_t)c);
456-
}
457-
#endif
458-
return (c <= (pg_wchar)UCHAR_MAX&&isprint((unsignedchar)c));
459-
}
460-
461-
staticint
462-
pg_wc_ispunct(pg_wcharc)
463-
{
464-
#ifdefUSE_WIDE_UPPER_LOWER
465-
if (GetDatabaseEncoding()==PG_UTF8)
466-
{
467-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
468-
returniswpunct((wint_t)c);
469-
}
470-
#endif
471-
return (c <= (pg_wchar)UCHAR_MAX&&ispunct((unsignedchar)c));
472-
}
473-
474-
staticint
475-
pg_wc_isspace(pg_wcharc)
476-
{
477-
#ifdefUSE_WIDE_UPPER_LOWER
478-
if (GetDatabaseEncoding()==PG_UTF8)
479-
{
480-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
481-
returniswspace((wint_t)c);
482-
}
483-
#endif
484-
return (c <= (pg_wchar)UCHAR_MAX&&isspace((unsignedchar)c));
485-
}
486-
487-
staticpg_wchar
488-
pg_wc_toupper(pg_wcharc)
489-
{
490-
#ifdefUSE_WIDE_UPPER_LOWER
491-
if (GetDatabaseEncoding()==PG_UTF8)
492-
{
493-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
494-
returntowupper((wint_t)c);
495-
}
496-
#endif
497-
if (c <= (pg_wchar)UCHAR_MAX)
498-
returntoupper((unsignedchar)c);
499-
returnc;
500-
}
501-
502-
staticpg_wchar
503-
pg_wc_tolower(pg_wcharc)
504-
{
505-
#ifdefUSE_WIDE_UPPER_LOWER
506-
if (GetDatabaseEncoding()==PG_UTF8)
507-
{
508-
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
509-
returntowlower((wint_t)c);
510-
}
511-
#endif
512-
if (c <= (pg_wchar)UCHAR_MAX)
513-
returntolower((unsignedchar)c);
514-
returnc;
515-
}
516-
517-
518353
/*
519354
* element - map collating-element name to celt
520355
*/

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp