11/*-------------------------------------------------------------------------
22 * category_test.c
3- *Program to test Unicode general categoryfunctions .
 *		Program to test Unicode general category and character properties.
44 *
55 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
66 *
1414#include <stdio.h>
1515#include <stdlib.h>
1616#include <string.h>
17+ #include <wctype.h>
1718
1819#ifdef USE_ICU
1920#include <unicode/uchar.h>
2021#endif
22+
2123#include "common/unicode_category.h"
2224#include "common/unicode_version.h"
2325
/*
 * Unicode versions under comparison, stored in the integer form produced by
 * parse_unicode_version() (major * 100 + minor).  They are filled in by
 * main() before any test runs; zero means "not yet set".
 */
static int	pg_unicode_version = 0;

#ifdef USE_ICU
static int	icu_unicode_version = 0;
#endif
30+
2431/*
2532 * Parse version into integer for easy comparison.
2633 */
27- #ifdef USE_ICU
2834static int
2935parse_unicode_version (const char * version )
3036{
@@ -39,57 +45,175 @@ parse_unicode_version(const char *version)
3945
4046return major * 100 + minor ;
4147}
42- #endif
4348
49+ #ifdef USE_ICU
4450/*
45- * Exhaustively test that the Unicode category for each codepoint matches that
46- * returned by ICU.
51+ * Test Postgres Unicode tables by comparing with ICU. Test the General
52+ * Category, as well as the properties Alphabetic, Lowercase, Uppercase,
53+ * White_Space, and Hex_Digit.
4754 */
48- int
49- main ( int argc , char * * argv )
55+ static void
56+ icu_test ( )
5057{
51- #ifdef USE_ICU
52- int pg_unicode_version = parse_unicode_version (PG_UNICODE_VERSION );
53- int icu_unicode_version = parse_unicode_version (U_UNICODE_VERSION );
58+ int successful = 0 ;
5459int pg_skipped_codepoints = 0 ;
5560int icu_skipped_codepoints = 0 ;
5661
57- printf ("category_test: Postgres Unicode version:\t%s\n" ,PG_UNICODE_VERSION );
58- printf ("category_test: ICU Unicode version:\t\t%s\n" ,U_UNICODE_VERSION );
59-
60- for (UChar32 code = 0 ;code <=0x10ffff ;code ++ )
62+ for (pg_wchar code = 0 ;code <=0x10ffff ;code ++ )
6163{
6264uint8_t pg_category = unicode_category (code );
6365uint8_t icu_category = u_charType (code );
6466
67+ /* Property tests */
68+ bool prop_alphabetic = pg_u_prop_alphabetic (code );
69+ bool prop_lowercase = pg_u_prop_lowercase (code );
70+ bool prop_uppercase = pg_u_prop_uppercase (code );
71+ bool prop_cased = pg_u_prop_cased (code );
72+ bool prop_case_ignorable = pg_u_prop_case_ignorable (code );
73+ bool prop_white_space = pg_u_prop_white_space (code );
74+ bool prop_hex_digit = pg_u_prop_hex_digit (code );
75+ bool prop_join_control = pg_u_prop_join_control (code );
76+
77+ bool icu_prop_alphabetic = u_hasBinaryProperty (
78+ code ,UCHAR_ALPHABETIC );
79+ bool icu_prop_lowercase = u_hasBinaryProperty (
80+ code ,UCHAR_LOWERCASE );
81+ bool icu_prop_uppercase = u_hasBinaryProperty (
82+ code ,UCHAR_UPPERCASE );
83+ bool icu_prop_cased = u_hasBinaryProperty (
84+ code ,UCHAR_CASED );
85+ bool icu_prop_case_ignorable = u_hasBinaryProperty (
86+ code ,UCHAR_CASE_IGNORABLE );
87+ bool icu_prop_white_space = u_hasBinaryProperty (
88+ code ,UCHAR_WHITE_SPACE );
89+ bool icu_prop_hex_digit = u_hasBinaryProperty (
90+ code ,UCHAR_HEX_DIGIT );
91+ bool icu_prop_join_control = u_hasBinaryProperty (
92+ code ,UCHAR_JOIN_CONTROL );
93+
94+ /*
95+ * Compare with ICU for character classes using:
96+ *
97+ * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
98+ *
99+ * which describes how to use ICU to test for membership in regex
100+ * character classes.
101+ *
102+ * NB: the document suggests testing for some properties such as
103+ * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
104+ * "POSIX Compatible" character classes.
105+ */
106+ bool isalpha = pg_u_isalpha (code );
107+ bool islower = pg_u_islower (code );
108+ bool isupper = pg_u_isupper (code );
109+ bool ispunct = pg_u_ispunct (code , false);
110+ bool isdigit = pg_u_isdigit (code , false);
111+ bool isxdigit = pg_u_isxdigit (code , false);
112+ bool isalnum = pg_u_isalnum (code , false);
113+ bool isspace = pg_u_isspace (code );
114+ bool isblank = pg_u_isblank (code );
115+ bool iscntrl = pg_u_iscntrl (code );
116+ bool isgraph = pg_u_isgraph (code );
117+ bool isprint = pg_u_isprint (code );
118+
119+ bool icu_isalpha = u_isUAlphabetic (code );
120+ bool icu_islower = u_isULowercase (code );
121+ bool icu_isupper = u_isUUppercase (code );
122+ bool icu_ispunct = u_ispunct (code );
123+ bool icu_isdigit = u_isdigit (code );
124+ bool icu_isxdigit = u_hasBinaryProperty (code ,
125+ UCHAR_POSIX_XDIGIT );
126+ bool icu_isalnum = u_hasBinaryProperty (code ,
127+ UCHAR_POSIX_ALNUM );
128+ bool icu_isspace = u_isUWhiteSpace (code );
129+ bool icu_isblank = u_isblank (code );
130+ bool icu_iscntrl = icu_category == PG_U_CONTROL ;
131+ bool icu_isgraph = u_hasBinaryProperty (code ,
132+ UCHAR_POSIX_GRAPH );
133+ bool icu_isprint = u_hasBinaryProperty (code ,
134+ UCHAR_POSIX_PRINT );
135+
136+ /*
137+ * A version mismatch means that some assigned codepoints in the newer
138+ * version may be unassigned in the older version. That's OK, though
139+ * the test will not cover those codepoints marked unassigned in the
140+ * older version (that is, it will no longer be an exhaustive test).
141+ */
142+ if (pg_category == PG_U_UNASSIGNED &&
143+ icu_category != PG_U_UNASSIGNED &&
144+ pg_unicode_version < icu_unicode_version )
145+ {
146+ pg_skipped_codepoints ++ ;
147+ continue ;
148+ }
149+
150+ if (icu_category == PG_U_UNASSIGNED &&
151+ pg_category != PG_U_UNASSIGNED &&
152+ icu_unicode_version < pg_unicode_version )
153+ {
154+ icu_skipped_codepoints ++ ;
155+ continue ;
156+ }
157+
65158if (pg_category != icu_category )
66159{
67- /*
68- * A version mismatch means that some assigned codepoints in the
69- * newer version may be unassigned in the older version. That's
70- * OK, though the test will not cover those codepoints marked
71- * unassigned in the older version (that is, it will no longer be
72- * an exhaustive test).
73- */
74- if (pg_category == PG_U_UNASSIGNED &&
75- pg_unicode_version < icu_unicode_version )
76- pg_skipped_codepoints ++ ;
77- else if (icu_category == PG_U_UNASSIGNED &&
78- icu_unicode_version < pg_unicode_version )
79- icu_skipped_codepoints ++ ;
80- else
81- {
82- printf ("category_test: FAILURE for codepoint 0x%06x\n" ,code );
83- printf ("category_test: Postgres category:%02d %s %s\n" ,pg_category ,
84- unicode_category_abbrev (pg_category ),
85- unicode_category_string (pg_category ));
86- printf ("category_test: ICU category:%02d %s %s\n" ,icu_category ,
87- unicode_category_abbrev (icu_category ),
88- unicode_category_string (icu_category ));
89- printf ("\n" );
90- exit (1 );
91- }
160+ printf ("category_test: FAILURE for codepoint 0x%06x\n" ,code );
161+ printf ("category_test: Postgres category:%02d %s %s\n" ,pg_category ,
162+ unicode_category_abbrev (pg_category ),
163+ unicode_category_string (pg_category ));
164+ printf ("category_test: ICU category:%02d %s %s\n" ,icu_category ,
165+ unicode_category_abbrev (icu_category ),
166+ unicode_category_string (icu_category ));
167+ printf ("\n" );
168+ exit (1 );
169+ }
170+
171+ if (prop_alphabetic != icu_prop_alphabetic ||
172+ prop_lowercase != icu_prop_lowercase ||
173+ prop_uppercase != icu_prop_uppercase ||
174+ prop_cased != icu_prop_cased ||
175+ prop_case_ignorable != icu_prop_case_ignorable ||
176+ prop_white_space != icu_prop_white_space ||
177+ prop_hex_digit != icu_prop_hex_digit ||
178+ prop_join_control != icu_prop_join_control )
179+ {
180+ printf ("category_test: FAILURE for codepoint 0x%06x\n" ,code );
181+ printf ("category_test: Postgrespropertyalphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n" ,
182+ prop_alphabetic ,prop_lowercase ,prop_uppercase ,
183+ prop_cased ,prop_case_ignorable ,
184+ prop_white_space ,prop_hex_digit ,prop_join_control );
185+ printf ("category_test: ICUpropertyalphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n" ,
186+ icu_prop_alphabetic ,icu_prop_lowercase ,icu_prop_uppercase ,
187+ icu_prop_cased ,icu_prop_case_ignorable ,
188+ icu_prop_white_space ,icu_prop_hex_digit ,icu_prop_join_control );
189+ printf ("\n" );
190+ exit (1 );
92191}
192+
193+ if (isalpha != icu_isalpha ||
194+ islower != icu_islower ||
195+ isupper != icu_isupper ||
196+ ispunct != icu_ispunct ||
197+ isdigit != icu_isdigit ||
198+ isxdigit != icu_isxdigit ||
199+ isalnum != icu_isalnum ||
200+ isspace != icu_isspace ||
201+ isblank != icu_isblank ||
202+ iscntrl != icu_iscntrl ||
203+ isgraph != icu_isgraph ||
204+ isprint != icu_isprint )
205+ {
206+ printf ("category_test: FAILURE for codepoint 0x%06x\n" ,code );
207+ printf ("category_test: Postgresclassalpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n" ,
208+ isalpha ,islower ,isupper ,ispunct ,isdigit ,isxdigit ,isalnum ,isspace ,isblank ,iscntrl ,isgraph ,isprint );
209+ printf ("category_test: ICU classalpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n" ,
210+ icu_isalpha ,icu_islower ,icu_isupper ,icu_ispunct ,icu_isdigit ,icu_isxdigit ,icu_isalnum ,icu_isspace ,icu_isblank ,icu_iscntrl ,icu_isgraph ,icu_isprint );
211+ printf ("\n" );
212+ exit (1 );
213+ }
214+
215+ if (pg_category != PG_U_UNASSIGNED )
216+ successful ++ ;
93217}
94218
95219if (pg_skipped_codepoints > 0 )
@@ -99,10 +223,22 @@ main(int argc, char **argv)
99223printf ("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n" ,
100224icu_skipped_codepoints );
101225
102- printf ("category_test: success\n" );
103- exit (0 );
226+ printf ("category_test: ICU test: %d codepoints successful\n" ,successful );
227+ }
228+ #endif
229+
230+ int
231+ main (int argc ,char * * argv )
232+ {
233+ pg_unicode_version = parse_unicode_version (PG_UNICODE_VERSION );
234+ printf ("category_test: Postgres Unicode version:\t%s\n" ,PG_UNICODE_VERSION );
235+
236+ #ifdef USE_ICU
237+ icu_unicode_version = parse_unicode_version (U_UNICODE_VERSION );
238+ printf ("category_test: ICU Unicode version:\t\t%s\n" ,U_UNICODE_VERSION );
239+
240+ icu_test ();
104241#else
105- printf ("category_test: ICU support required for test; skipping\n" );
106- exit (0 );
242+ printf ("category_test: ICU not available; skipping\n" );
107243#endif
108244}