Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 0d32342

Browse files
committed
Teach the regular expression functions to do case-insensitive matching and
locale-dependent character classification properly when the database encoding is UTF8. The previous coding worked okay in single-byte encodings, or in any case for ASCII characters, but failed entirely on multibyte characters. The fix assumes that the <wctype.h> functions use Unicode code points as the wchar representation for Unicode, ie, wchar matches pg_wchar. This is only a partial solution, since we're still stupid about non-ASCII characters in multibyte encodings other than UTF8. The practical effect of that is limited, however, since those cases are generally Far Eastern glyphs for which concepts like case-folding don't apply anyway. Certainly all or nearly all of the field reports of problems have been about UTF8. A more general solution would require switching to the platform's wchar representation for all regex operations; which is possible but would have substantial disadvantages. Let's try this and see if it's sufficient in practice.
1 parent ef51395 commit 0d32342

File tree

2 files changed

+117
-14
lines changed

2 files changed

+117
-14
lines changed

‎src/backend/regex/regc_locale.c

Lines changed: 105 additions & 13 deletions
Original file line number / Diff line number / Diff line change
@@ -47,7 +47,7 @@
4747
* permission to use and distribute the software in accordance with the
4848
* terms specified in this license.
4949
*
50-
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.9 2008/02/14 17:33:37 tgl Exp $
50+
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.10 2009/12/01 21:00:24 tgl Exp $
5151
*/
5252

5353
/* ASCII character-name table */
@@ -349,75 +349,167 @@ static const struct cname
349349
}
350350
};
351351

352+
352353
/*
353-
* some ctype functions with non-ascii-char guard
354+
* ctype functions adapted to work on pg_wchar (a/k/a chr)
355+
*
356+
* When working in UTF8 encoding, we use the <wctype.h> functions if
357+
* available. This assumes that every platform uses Unicode codepoints
358+
* directly as the wchar_t representation of Unicode. On some platforms
359+
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
360+
*
361+
* In all other encodings, we use the <ctype.h> functions for pg_wchar
362+
* values up to 255, and punt for values above that. This is only 100%
363+
* correct in single-byte encodings such as LATINn. However, non-Unicode
364+
* multibyte encodings are mostly Far Eastern character sets for which the
365+
* properties being tested here aren't relevant for higher code values anyway.
366+
*
367+
* NB: the coding here assumes pg_wchar is an unsigned type.
354368
*/
369+
355370
staticint
356371
pg_wc_isdigit(pg_wcharc)
357372
{
358-
return (c >=0&&c <=UCHAR_MAX&&isdigit((unsignedchar)c));
373+
#ifdefUSE_WIDE_UPPER_LOWER
374+
if (GetDatabaseEncoding()==PG_UTF8)
375+
{
376+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
377+
returniswdigit((wint_t)c);
378+
}
379+
#endif
380+
return (c <= (pg_wchar)UCHAR_MAX&&isdigit((unsignedchar)c));
359381
}
360382

361383
staticint
362384
pg_wc_isalpha(pg_wcharc)
363385
{
364-
return (c >=0&&c <=UCHAR_MAX&&isalpha((unsignedchar)c));
386+
#ifdefUSE_WIDE_UPPER_LOWER
387+
if (GetDatabaseEncoding()==PG_UTF8)
388+
{
389+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
390+
returniswalpha((wint_t)c);
391+
}
392+
#endif
393+
return (c <= (pg_wchar)UCHAR_MAX&&isalpha((unsignedchar)c));
365394
}
366395

367396
staticint
368397
pg_wc_isalnum(pg_wcharc)
369398
{
370-
return (c >=0&&c <=UCHAR_MAX&&isalnum((unsignedchar)c));
399+
#ifdefUSE_WIDE_UPPER_LOWER
400+
if (GetDatabaseEncoding()==PG_UTF8)
401+
{
402+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
403+
returniswalnum((wint_t)c);
404+
}
405+
#endif
406+
return (c <= (pg_wchar)UCHAR_MAX&&isalnum((unsignedchar)c));
371407
}
372408

373409
staticint
374410
pg_wc_isupper(pg_wcharc)
375411
{
376-
return (c >=0&&c <=UCHAR_MAX&&isupper((unsignedchar)c));
412+
#ifdefUSE_WIDE_UPPER_LOWER
413+
if (GetDatabaseEncoding()==PG_UTF8)
414+
{
415+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
416+
returniswupper((wint_t)c);
417+
}
418+
#endif
419+
return (c <= (pg_wchar)UCHAR_MAX&&isupper((unsignedchar)c));
377420
}
378421

379422
staticint
380423
pg_wc_islower(pg_wcharc)
381424
{
382-
return (c >=0&&c <=UCHAR_MAX&&islower((unsignedchar)c));
425+
#ifdefUSE_WIDE_UPPER_LOWER
426+
if (GetDatabaseEncoding()==PG_UTF8)
427+
{
428+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
429+
returniswlower((wint_t)c);
430+
}
431+
#endif
432+
return (c <= (pg_wchar)UCHAR_MAX&&islower((unsignedchar)c));
383433
}
384434

385435
staticint
386436
pg_wc_isgraph(pg_wcharc)
387437
{
388-
return (c >=0&&c <=UCHAR_MAX&&isgraph((unsignedchar)c));
438+
#ifdefUSE_WIDE_UPPER_LOWER
439+
if (GetDatabaseEncoding()==PG_UTF8)
440+
{
441+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
442+
returniswgraph((wint_t)c);
443+
}
444+
#endif
445+
return (c <= (pg_wchar)UCHAR_MAX&&isgraph((unsignedchar)c));
389446
}
390447

391448
staticint
392449
pg_wc_isprint(pg_wcharc)
393450
{
394-
return (c >=0&&c <=UCHAR_MAX&&isprint((unsignedchar)c));
451+
#ifdefUSE_WIDE_UPPER_LOWER
452+
if (GetDatabaseEncoding()==PG_UTF8)
453+
{
454+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
455+
returniswprint((wint_t)c);
456+
}
457+
#endif
458+
return (c <= (pg_wchar)UCHAR_MAX&&isprint((unsignedchar)c));
395459
}
396460

397461
staticint
398462
pg_wc_ispunct(pg_wcharc)
399463
{
400-
return (c >=0&&c <=UCHAR_MAX&&ispunct((unsignedchar)c));
464+
#ifdefUSE_WIDE_UPPER_LOWER
465+
if (GetDatabaseEncoding()==PG_UTF8)
466+
{
467+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
468+
returniswpunct((wint_t)c);
469+
}
470+
#endif
471+
return (c <= (pg_wchar)UCHAR_MAX&&ispunct((unsignedchar)c));
401472
}
402473

403474
staticint
404475
pg_wc_isspace(pg_wcharc)
405476
{
406-
return (c >=0&&c <=UCHAR_MAX&&isspace((unsignedchar)c));
477+
#ifdefUSE_WIDE_UPPER_LOWER
478+
if (GetDatabaseEncoding()==PG_UTF8)
479+
{
480+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
481+
returniswspace((wint_t)c);
482+
}
483+
#endif
484+
return (c <= (pg_wchar)UCHAR_MAX&&isspace((unsignedchar)c));
407485
}
408486

409487
staticpg_wchar
410488
pg_wc_toupper(pg_wcharc)
411489
{
412-
if (c >=0&&c <=UCHAR_MAX)
490+
#ifdefUSE_WIDE_UPPER_LOWER
491+
if (GetDatabaseEncoding()==PG_UTF8)
492+
{
493+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
494+
returntowupper((wint_t)c);
495+
}
496+
#endif
497+
if (c <= (pg_wchar)UCHAR_MAX)
413498
returntoupper((unsignedchar)c);
414499
returnc;
415500
}
416501

417502
staticpg_wchar
418503
pg_wc_tolower(pg_wcharc)
419504
{
420-
if (c >=0&&c <=UCHAR_MAX)
505+
#ifdefUSE_WIDE_UPPER_LOWER
506+
if (GetDatabaseEncoding()==PG_UTF8)
507+
{
508+
if (sizeof(wchar_t) >=4||c <= (pg_wchar)0xFFFF)
509+
returntowlower((wint_t)c);
510+
}
511+
#endif
512+
if (c <= (pg_wchar)UCHAR_MAX)
421513
returntolower((unsignedchar)c);
422514
returnc;
423515
}

‎src/include/regex/regcustom.h

Lines changed: 12 additions & 1 deletion
Original file line number / Diff line number / Diff line change
@@ -25,7 +25,7 @@
2525
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
2626
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
*
28-
* $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.7 2008/02/14 17:33:37 tgl Exp $
28+
* $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.8 2009/12/01 21:00:24 tgl Exp $
2929
*/
3030

3131
/* headers if any */
@@ -34,6 +34,17 @@
3434
#include<ctype.h>
3535
#include<limits.h>
3636

37+
/*
38+
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems
39+
* declare them in <wchar.h>.
40+
*/
41+
#ifdefHAVE_WCHAR_H
42+
#include<wchar.h>
43+
#endif
44+
#ifdefHAVE_WCTYPE_H
45+
#include<wctype.h>
46+
#endif
47+
3748
#include"mb/pg_wchar.h"
3849

3950

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp