1818#include <wctype.h>
1919
2020#ifdef USE_ICU
21+ #include <unicode/ucasemap.h>
2122#include <unicode/uchar.h>
2223#endif
2324#include "common/unicode_case.h"
2425#include "common/unicode_category.h"
2526#include "common/unicode_version.h"
2627
/*
 * Size in bytes of the fixed buffers used when comparing case mappings:
 * enough to hold largest source or result string, including NUL.
 */
#define BUFSZ 256

#ifdef USE_ICU
/*
 * ICU case map shared by the ICU comparison tests.  It is opened at the start
 * of main() and closed again before the program exits.
 */
static UCaseMap *casemap = NULL;
#endif

/*
 * Common signature of the case-conversion wrappers handed to test_convert().
 *
 * dst/dstsize describe the destination buffer and src/srclen the source
 * text; test_convert() passes srclen = -1 for a NUL-terminated source.  The
 * return value is compared by test_convert() against the length of the
 * expected result.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
							ssize_t srclen);
37+
/*
 * simple boundary iterator copied from pg_locale_builtin.c
 *
 * State carried between successive calls of initcap_wbnext().
 */
struct WordBoundaryState
{
	const char *str;			/* text being scanned */
	size_t		len;			/* upper bound, in bytes, on how far to scan */
	size_t		offset;			/* byte offset of next character to examine */
	bool		init;			/* true once the first boundary was returned */
	bool		prev_alnum;		/* alnum-ness recorded at the last boundary */
};
47+
48+ static size_t
49+ initcap_wbnext (void * state )
50+ {
51+ struct WordBoundaryState * wbstate = (struct WordBoundaryState * )state ;
52+
53+ while (wbstate -> offset < wbstate -> len &&
54+ wbstate -> str [wbstate -> offset ]!= '\0' )
55+ {
56+ pg_wchar u = utf8_to_unicode ((unsignedchar * )wbstate -> str +
57+ wbstate -> offset );
58+ bool curr_alnum = pg_u_isalnum (u , true);
59+
60+ if (!wbstate -> init || curr_alnum != wbstate -> prev_alnum )
61+ {
62+ size_t prev_offset = wbstate -> offset ;
63+
64+ wbstate -> init = true;
65+ wbstate -> offset += unicode_utf8len (u );
66+ wbstate -> prev_alnum = curr_alnum ;
67+ return prev_offset ;
68+ }
69+
70+ wbstate -> offset += unicode_utf8len (u );
71+ }
72+
73+ return wbstate -> len ;
74+ }
75+
2776#ifdef USE_ICU
2877
2978static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
4897}
4998}
5099
100+ static void
101+ icu_test_full (char * str )
102+ {
103+ char lower [BUFSZ ];
104+ char title [BUFSZ ];
105+ char upper [BUFSZ ];
106+ char icu_lower [BUFSZ ];
107+ char icu_title [BUFSZ ];
108+ char icu_upper [BUFSZ ];
109+ UErrorCode status ;
110+ struct WordBoundaryState wbstate = {
111+ .str = str ,
112+ .len = strlen (str ),
113+ .offset = 0 ,
114+ .init = false,
115+ .prev_alnum = false,
116+ };
117+
118+ unicode_strlower (lower ,BUFSZ ,str ,-1 , true);
119+ unicode_strtitle (title ,BUFSZ ,str ,-1 , true,initcap_wbnext ,& wbstate );
120+ unicode_strupper (upper ,BUFSZ ,str ,-1 , true);
121+ status = U_ZERO_ERROR ;
122+ ucasemap_utf8ToLower (casemap ,icu_lower ,BUFSZ ,str ,-1 ,& status );
123+ status = U_ZERO_ERROR ;
124+ ucasemap_utf8ToTitle (casemap ,icu_title ,BUFSZ ,str ,-1 ,& status );
125+ status = U_ZERO_ERROR ;
126+ ucasemap_utf8ToUpper (casemap ,icu_upper ,BUFSZ ,str ,-1 ,& status );
127+
128+ if (strcmp (lower ,icu_lower )!= 0 )
129+ {
130+ printf ("case_test: str='%s' lower='%s' icu_lower='%s'\n" ,str ,lower ,
131+ icu_lower );
132+ exit (1 );
133+ }
134+ if (strcmp (title ,icu_title )!= 0 )
135+ {
136+ printf ("case_test: str='%s' title='%s' icu_title='%s'\n" ,str ,title ,
137+ icu_title );
138+ exit (1 );
139+ }
140+ if (strcmp (upper ,icu_upper )!= 0 )
141+ {
142+ printf ("case_test: str='%s' upper='%s' icu_upper='%s'\n" ,str ,upper ,
143+ icu_upper );
144+ exit (1 );
145+ }
146+ }
147+
51148/*
52149 * Exhaustively compare case mappings with the results from ICU.
53150 */
@@ -64,6 +161,7 @@ test_icu(void)
64161if (category != PG_U_UNASSIGNED )
65162{
66163uint8_t icu_category = u_charType (code );
164+ char code_str [5 ]= {0 };
67165
68166if (icu_category == PG_U_UNASSIGNED )
69167{
@@ -72,6 +170,9 @@ test_icu(void)
72170}
73171
74172icu_test_simple (code );
173+ unicode_to_utf8 (code , (unsignedchar * )code_str );
174+ icu_test_full (code_str );
175+
75176successful ++ ;
76177}
77178}
@@ -86,7 +187,7 @@ test_icu(void)
86187#endif
87188
88189static void
89- test_strlower ( const char * test_string ,const char * expected )
190+ test_convert ( TestFunc tfunc , const char * test_string ,const char * expected )
90191{
91192size_t src1len = strlen (test_string );
92193size_t src2len = -1 ;/* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
102203
103204/* neither source nor destination are NUL-terminated */
104205memset (dst1 ,0x7F ,dst1len );
105- needed = unicode_strlower (dst1 ,dst1len ,src1 ,src1len );
206+ needed = tfunc (dst1 ,dst1len ,src1 ,src1len );
106207if (needed != strlen (expected ))
107208{
108- printf ("case_test: convert_case test1 FAILURE: needed %zu\n" ,needed );
209+ printf ("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n" ,
210+ test_string ,needed ,strlen (expected ));
109211exit (1 );
110212}
111213if (memcmp (dst1 ,expected ,dst1len )!= 0 )
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
117219
118220/* destination is NUL-terminated and source is not */
119221memset (dst2 ,0x7F ,dst2len );
120- needed = unicode_strlower (dst2 ,dst2len ,src1 ,src1len );
222+ needed = tfunc (dst2 ,dst2len ,src1 ,src1len );
121223if (needed != strlen (expected ))
122224{
123- printf ("case_test: convert_case test2 FAILURE: needed %zu\n" ,needed );
225+ printf ("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n" ,
226+ test_string ,needed ,strlen (expected ));
124227exit (1 );
125228}
126229if (strcmp (dst2 ,expected )!= 0 )
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
132235
133236/* source is NUL-terminated and destination is not */
134237memset (dst1 ,0x7F ,dst1len );
135- needed = unicode_strlower (dst1 ,dst1len ,src2 ,src2len );
238+ needed = tfunc (dst1 ,dst1len ,src2 ,src2len );
136239if (needed != strlen (expected ))
137240{
241+ printf ("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n" ,
242+ test_string ,needed ,strlen (expected ));
138243printf ("case_test: convert_case test3 FAILURE: needed %zu\n" ,needed );
139244exit (1 );
140245}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
147252
148253/* both source and destination are NUL-terminated */
149254memset (dst2 ,0x7F ,dst2len );
150- needed = unicode_strlower (dst2 ,dst2len ,src2 ,src2len );
255+ needed = tfunc (dst2 ,dst2len ,src2 ,src2len );
151256if (needed != strlen (expected ))
152257{
153- printf ("case_test: convert_case test4 FAILURE: needed %zu\n" ,needed );
258+ printf ("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n" ,
259+ test_string ,needed ,strlen (expected ));
154260exit (1 );
155261}
156262if (strcmp (dst2 ,expected )!= 0 )
@@ -166,22 +272,92 @@ test_strlower(const char *test_string, const char *expected)
166272free (dst2 );
167273}
168274
/*
 * TestFunc adapter around unicode_strlower().
 *
 * Forwards all arguments unchanged and returns whatever unicode_strlower()
 * returns.  NOTE(review): the trailing "true" presumably selects full case
 * mapping -- confirm against common/unicode_case.h.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed;

	needed = unicode_strlower(dst, dstsize, src, srclen, true);

	return needed;
}
281+
282+ static size_t
283+ tfunc_title (char * dst ,size_t dstsize ,const char * src ,
284+ ssize_t srclen )
285+ {
286+ struct WordBoundaryState wbstate = {
287+ .str = src ,
288+ .len = srclen ,
289+ .offset = 0 ,
290+ .init = false,
291+ .prev_alnum = false,
292+ };
293+
294+ return unicode_strtitle (dst ,dstsize ,src ,srclen , true,initcap_wbnext ,
295+ & wbstate );
296+ }
297+
/*
 * TestFunc adapter around unicode_strupper().
 *
 * Forwards all arguments unchanged and returns whatever unicode_strupper()
 * returns.  NOTE(review): the trailing "true" presumably selects full case
 * mapping -- confirm against common/unicode_case.h.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed;

	needed = unicode_strupper(dst, dstsize, src, srclen, true);

	return needed;
}
304+
305+
169306static void
170307test_convert_case ()
171308{
172309/* test string with no case changes */
173- test_strlower ("√∞" ,"√∞" );
310+ test_convert (tfunc_lower ,"√∞" ,"√∞" );
311+ /* test adjust-to-cased behavior */
312+ test_convert (tfunc_title ,"abc 123xyz" ,"Abc 123xyz" );
174313/* test string with case changes */
175- test_strlower ( "ABC " ,"abc " );
314+ test_convert ( tfunc_upper , "abc " ,"ABC " );
176315/* test string with case changes and byte length changes */
177- test_strlower ("ȺȺȺ" ,"ⱥⱥⱥ" );
316+ test_convert (tfunc_lower ,"ȺȺȺ" ,"ⱥⱥⱥ" );
317+ /* test special case conversions */
318+ test_convert (tfunc_upper ,"ß" ,"SS" );
319+ test_convert (tfunc_lower ,"ıiIİ" ,"ıiii\u0307" );
320+ test_convert (tfunc_upper ,"ıiIİ" ,"IIIİ" );
321+ /* test final sigma */
322+ test_convert (tfunc_lower ,"σςΣ ΣΣΣ" ,"σςς σσς" );
323+ test_convert (tfunc_lower ,"σς'Σ' ΣΣ'Σ'" ,"σς'ς' σσ'ς'" );
324+ test_convert (tfunc_title ,"σςΣ ΣΣΣ" ,"Σςς Σσς" );
325+
326+ #ifdef USE_ICU
327+ icu_test_full ("" );
328+ icu_test_full ("ȺȺȺ" );
329+ icu_test_full ("ßßß" );
330+ icu_test_full ("√∞" );
331+ icu_test_full ("a b" );
332+ icu_test_full ("abc 123xyz" );
333+ icu_test_full ("σςΣ ΣΣΣ" );
334+ icu_test_full ("ıiIİ" );
335+ /* test <alpha><iota_subscript><acute> */
336+ icu_test_full ("\u0391\u0345\u0301" );
337+ #endif
178338
179339printf ("case_test: convert_case: success\n" );
180340}
181341
182342int
183343main (int argc ,char * * argv )
184344{
345+ #ifdef USE_ICU
346+ UErrorCode status = U_ZERO_ERROR ;
347+
348+ /*
349+ * Disable ICU's word break adjustment for titlecase to match the expected
350+ * behavior of unicode_strtitle().
351+ */
352+ casemap = ucasemap_open ("und" ,U_TITLECASE_NO_BREAK_ADJUSTMENT ,& status );
353+ if (U_FAILURE (status ))
354+ {
355+ printf ("case_test: failure opening UCaseMap: %s\n" ,
356+ u_errorName (status ));
357+ exit (1 );
358+ }
359+ #endif
360+
185361printf ("case_test: Postgres Unicode version:\t%s\n" ,PG_UNICODE_VERSION );
186362#ifdef USE_ICU
187363printf ("case_test: ICU Unicode version:\t\t%s\n" ,U_UNICODE_VERSION );
@@ -191,5 +367,9 @@ main(int argc, char **argv)
191367#endif
192368
193369test_convert_case ();
370+
371+ #ifdef USE_ICU
372+ ucasemap_close (casemap );
373+ #endif
194374exit (0 );
195375}