11/*-------------------------------------------------------------------------
22 *
33 * ts_locale.c
4- *localecompatiblility layer for tsearch
4+ *localecompatibility layer for tsearch
55 *
66 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
77 *
88 *
99 * IDENTIFICATION
10- * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
10+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
1616#include "tsearch/ts_locale.h"
1717#include "tsearch/ts_public.h"
1818
19- #ifdef TS_USE_WIDE
2019
21- #ifdef WIN32
20+ #ifdef TS_USE_WIDE
2221
/*
 * wchar2char --- convert wide characters to multibyte format
 *
 * The calling convention mirrors the standard wcstombs() function: *from
 * must be zero-terminated, and at most tolen bytes are stored at *to.
 * The output is zero-terminated only if there is room for the terminator.
 *
 * Returns the number of bytes stored (not counting any terminator), or
 * (size_t) -1 if the conversion fails.
 */
size_t
wchar2char(char *to, const wchar_t *from, size_t tolen)
{
	size_t		result;

	/* With no output space there is nothing to convert into. */
	if (tolen == 0)
		return 0;

#ifdef WIN32
	if (GetDatabaseEncoding() == PG_UTF8)
	{
		int			nbytes;

		/* Input length -1 tells the API that *from is zero-terminated. */
		nbytes = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
									 NULL, NULL);

		if (nbytes <= 0)
			result = (size_t) -1;
		else
		{
			Assert(nbytes <= tolen);

			/* Microsoft counts the zero terminator in the result */
			result = nbytes - 1;
		}

		return result;
	}
#endif   /* WIN32 */

	result = wcstombs(to, from, tolen);

	return result;
}
48- #endif /* WIN32 */
4955
56+ /*
57+ * char2wchar --- convert multibyte characters to wide characters
58+ *
59+ * This has almost the API of mbstowcs(), except that *from need not be
60+ * null-terminated; instead, the number of input bytes is specified as
61+ * fromlen. Also, we ereport() rather than returning -1 for invalid
62+ * input encoding. tolen is the maximum number of wchar_t's to store at *to.
63+ * The output will be zero-terminated iff there is room.
64+ */
5065size_t
51- char2wchar (wchar_t * to ,const char * from ,size_t len )
66+ char2wchar (wchar_t * to ,size_t tolen , const char * from ,size_t fromlen )
5267{
53- if (len == 0 )
68+ if (tolen == 0 )
5469return 0 ;
5570
5671#ifdef WIN32
5772if (GetDatabaseEncoding ()== PG_UTF8 )
5873{
5974int r ;
6075
61- r = MultiByteToWideChar (CP_UTF8 ,0 ,from ,len ,to ,len );
76+ r = MultiByteToWideChar (CP_UTF8 ,0 ,from ,fromlen ,to ,tolen );
6277
63- if (! r )
78+ if (r <= 0 )
6479{
65- pg_verifymbstr (from ,len , false);
80+ pg_verifymbstr (from ,fromlen , false);
6681ereport (ERROR ,
6782(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
6883errmsg ("invalid multibyte character for locale" ),
6984errhint ("The server's LC_CTYPE locale is probably incompatible with the database encoding." )));
7085}
7186
72- Assert (r <=len );
87+ Assert (r <=tolen );
7388
74- return r ;
89+ /* Microsoft counts the zero terminator in the result */
90+ return r - 1 ;
7591}
76- else
7792#endif /* WIN32 */
93+
7894if (lc_ctype_is_c ())
7995{
8096/*
8197 * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
8298 * allocated with sufficient space
8399 */
84- return pg_mb2wchar_with_len (from , (pg_wchar * )to ,len );
100+ return pg_mb2wchar_with_len (from , (pg_wchar * )to ,fromlen );
85101}
86102else
87103{
88104/*
89- * mbstowcsrequire ending '\0'
105+ * mbstowcsrequires ending '\0'
90106 */
91- char * str = pnstrdup (from ,len );
92- size_t tolen ;
107+ char * str = pnstrdup (from ,fromlen );
108+ size_t result ;
109+
110+ result = mbstowcs (to ,str ,tolen );
93111
94- tolen = mbstowcs (to ,str ,len );
95112pfree (str );
96113
97- return tolen ;
114+ if (result == (size_t )-1 )
115+ {
116+ pg_verifymbstr (from ,fromlen , false);
117+ ereport (ERROR ,
118+ (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
119+ errmsg ("invalid multibyte character for locale" ),
120+ errhint ("The server's LC_CTYPE locale is probably incompatible with the database encoding." )));
121+ }
122+
123+ if (result < tolen )
124+ to [result ]= 0 ;
125+
126+ return result ;
98127}
99128}
100129
130+
/*
 * t_isdigit --- is the (possibly multibyte) character at *ptr a digit?
 *
 * When pg_mblen() reports a one-byte character, or LC_CTYPE is C, the
 * answer comes from isdigit() applied to TOUCHAR(ptr).  Otherwise the
 * character is converted with char2wchar() and tested with iswdigit().
 */
int
t_isdigit(const char *ptr)
{
	wchar_t		wbuf[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* Convert just this one character into the two-element buffer. */
		char2wchar(wbuf, 2, ptr, nbytes);

		return iswdigit((wint_t) wbuf[0]);
	}

	return isdigit(TOUCHAR(ptr));
}
144+
/*
 * t_isspace --- is the (possibly multibyte) character at *ptr whitespace?
 *
 * When pg_mblen() reports a one-byte character, or LC_CTYPE is C, the
 * answer comes from isspace() applied to TOUCHAR(ptr).  Otherwise the
 * character is converted with char2wchar() and tested with iswspace().
 */
int
t_isspace(const char *ptr)
{
	wchar_t		wbuf[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* Convert just this one character into the two-element buffer. */
		char2wchar(wbuf, 2, ptr, nbytes);

		return iswspace((wint_t) wbuf[0]);
	}

	return isspace(TOUCHAR(ptr));
}
158+
/*
 * t_isalpha --- is the (possibly multibyte) character at *ptr alphabetic?
 *
 * When pg_mblen() reports a one-byte character, or LC_CTYPE is C, the
 * answer comes from isalpha() applied to TOUCHAR(ptr).  Otherwise the
 * character is converted with char2wchar() and tested with iswalpha().
 */
int
t_isalpha(const char *ptr)
{
	wchar_t		wbuf[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* Convert just this one character into the two-element buffer. */
		char2wchar(wbuf, 2, ptr, nbytes);

		return iswalpha((wint_t) wbuf[0]);
	}

	return isalpha(TOUCHAR(ptr));
}
113172
/*
 * t_isprint --- is the (possibly multibyte) character at *ptr printable?
 *
 * When pg_mblen() reports a one-byte character, or LC_CTYPE is C, the
 * answer comes from isprint() applied to TOUCHAR(ptr).  Otherwise the
 * character is converted with char2wchar() and tested with iswprint().
 */
int
t_isprint(const char *ptr)
{
	wchar_t		wbuf[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* Convert just this one character into the two-element buffer. */
		char2wchar(wbuf, 2, ptr, nbytes);

		return iswprint((wint_t) wbuf[0]);
	}

	return isprint(TOUCHAR(ptr));
}
186+
126187#endif /* TS_USE_WIDE */
127188
128189
@@ -168,19 +229,27 @@ t_readline(FILE *fp)
168229return recoded ;
169230}
170231
/*
 * lowerstr --- fold null-terminated string to lower case
 *
 * Measures the input with strlen() and hands the work to
 * lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
176242
177243/*
244+ * lowerstr_with_len --- fold string to lower case
245+ *
246+ * Input string need not be null-terminated.
247+ *
178248 * Returned string is palloc'd
179249 */
180250char *
181- lowerstr_with_len (char * str ,int len )
251+ lowerstr_with_len (const char * str ,int len )
182252{
183- char * ptr = str ;
184253char * out ;
185254
186255if (len == 0 )
@@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len)
202271
203272/*
204273 * alloc number of wchar_t for worst case, len contains number of
205- * bytes< = number of characters and alloc 1 wchar_t for 0, because
206- * wchar2char(wcstombs in really) wants zero-terminated string
274+ * bytes> = number of characters and alloc 1 wchar_t for 0, because
275+ * wchar2char wants zero-terminated string
207276 */
208277wptr = wstr = (wchar_t * )palloc (sizeof (wchar_t )* (len + 1 ));
209278
210- /*
211- * str SHOULD be cstring, so wlen contains number of converted
212- * character
213- */
214- wlen = char2wchar (wstr ,str ,len );
215- if (wlen < 0 )
216- ereport (ERROR ,
217- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
218- errmsg ("translation failed from server encoding to wchar_t" )));
219-
279+ wlen = char2wchar (wstr ,len + 1 ,str ,len );
220280Assert (wlen <=len );
221- wstr [wlen ]= 0 ;
222281
223282while (* wptr )
224283{
@@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len)
229288/*
230289 * Alloc result string for worst case + '\0'
231290 */
232- len = sizeof ( char ) * pg_database_encoding_max_length ()* ( wlen + 1 ) ;
291+ len = pg_database_encoding_max_length ()* wlen + 1 ;
233292out = (char * )palloc (len );
234293
235- /*
236- * wlen now is number of bytes which is always >= number of characters
237- */
238294wlen = wchar2char (out ,wstr ,len );
295+
239296pfree (wstr );
240297
241298if (wlen < 0 )
242299ereport (ERROR ,
243300(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
244- errmsg ("translation failed from wchar_t to server encoding %d" ,errno )));
245- Assert (wlen <=len );
246- out [wlen ]= '\0' ;
301+ errmsg ("translation from wchar_t to server encoding failed: %m" )));
302+ Assert (wlen < len );
247303}
248304else
249- #endif
305+ #endif /* TS_USE_WIDE */
250306{
307+ const char * ptr = str ;
251308char * outptr ;
252309
253310outptr = out = (char * )palloc (sizeof (char )* (len + 1 ));
254- while (* ptr && ptr - str < len )
311+ while (( ptr - str ) < len && * ptr )
255312{
256- * outptr ++ = tolower (* ( unsigned char * ) ptr );
313+ * outptr ++ = tolower (TOUCHAR ( ptr ) );
257314ptr ++ ;
258315}
259316* outptr = '\0' ;