11/*
22 * conversion functions between pg_wchar and multibyte streams.
33 * Tatsuo Ishii
4- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $
4+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40 2004/12/03 01:20:20 momjian Exp $
55 *
66 * WIN1250 client encoding updated by Pavel Behal
77 *
@@ -343,31 +343,6 @@ pg_johab_dsplen(const unsigned char *s)
343343return (pg_euc_dsplen (s ));
344344}
345345
346- bool isLegalUTF8 (const UTF8 * source ,int len ) {
347- UTF8 a ;
348- const UTF8 * srcptr = source + len ;
349- if (!source || (pg_utf_mblen (source )!= len ))return false;
350- switch (len ) {
351- default :return false;
352- /* Everything else falls through when "true"... */
353- case 6 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
354- case 5 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
355- case 4 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
356- case 3 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
357- case 2 :if ((a = (* -- srcptr ))> 0xBF )return false;
358- switch (* source ) {
359- /* no fall-through in this inner switch */
360- case 0xE0 :if (a < 0xA0 )return false;break ;
361- case 0xF0 :if (a < 0x90 )return false;break ;
362- case 0xF4 :if (a > 0x8F )return false;break ;
363- default :if (a < 0x80 )return false;
364- }
365- case 1 :if (* source >=0x80 && * source < 0xC2 )return false;
366- if (* source > 0xFD )return false;
367- }
368- return true;
369- }
370-
371346/*
372347 * convert UTF-8 string to pg_wchar (UCS-2)
373348 * caller should allocate enough space for "to"
@@ -423,27 +398,21 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
423398 * returns the byte length of a UTF-8 word pointed to by s
424399 */
425400int
426- pg_utf_mblen (const UTF8 * s )
401+ pg_utf_mblen (const unsigned char * s )
427402{
428403int len = 1 ;
429404
430405if ((* s & 0x80 )== 0 )
431406len = 1 ;
432407else if ((* s & 0xe0 )== 0xc0 )
433408len = 2 ;
434- else if ((* s & 0xf0 )== 0xe0 )
435- len = 3 ;
436- else if ((* s & 0xf8 )== 0xf0 )
437- len = 4 ;
438- else if ((* s & 0xfc )== 0xf8 )
439- len = 5 ;
440- else if ((* s & 0xfe )== 0xfc )
441- len = 6 ;
409+ else if ((* s & 0xe0 )== 0xe0 )
410+ len = 3 ;
442411return (len );
443412}
444413
445414static int
446- pg_utf_dsplen (const UTF8 * s )
415+ pg_utf_dsplen (const unsigned char * s )
447416{
448417return 1 ;/* XXX fix me! */
449418}
@@ -752,8 +721,8 @@ pg_wchar_tbl pg_wchar_table[] = {
752721{pg_euckr2wchar_with_len ,pg_euckr_mblen ,pg_euckr_dsplen ,3 },/* 3; PG_EUC_KR */
753722{pg_euctw2wchar_with_len ,pg_euctw_mblen ,pg_euctw_dsplen ,3 },/* 4; PG_EUC_TW */
754723{pg_johab2wchar_with_len ,pg_johab_mblen ,pg_johab_dsplen ,3 },/* 5; PG_JOHAB */
755- {pg_utf2wchar_with_len ,pg_utf_mblen ,pg_utf_dsplen ,6 }, /* 6; PG_UNICODE */
756- {pg_mule2wchar_with_len ,pg_mule_mblen ,pg_mule_dsplen ,3 },/* 7; PG_MULE_INTERNAL */
724+ {pg_utf2wchar_with_len ,pg_utf_mblen ,pg_utf_dsplen ,3 }, /* 6; PG_UNICODE */
725+ {pg_mule2wchar_with_len ,pg_mule_mblen ,pg_mule_dsplen ,3 },/* 7; PG_MULE_INTERNAL */
757726{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 8; PG_LATIN1 */
758727{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 9; PG_LATIN2 */
759728{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 10; PG_LATIN3 */
@@ -775,11 +744,11 @@ pg_wchar_tbl pg_wchar_table[] = {
775744{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 26; ISO-8859-7 */
776745{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 27; ISO-8859-8 */
777746{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 28; PG_WIN1250 */
778- {0 ,pg_sjis_mblen ,pg_sjis_dsplen ,2 },/* 29; PG_SJIS */
779- {0 ,pg_big5_mblen ,pg_big5_dsplen ,2 },/* 30; PG_BIG5 */
780- {0 ,pg_gbk_mblen ,pg_gbk_dsplen ,2 },/* 31; PG_GBK */
781- {0 ,pg_uhc_mblen ,pg_uhc_dsplen ,2 },/* 32; PG_UHC */
782- {0 ,pg_gb18030_mblen ,pg_gb18030_dsplen ,2 }/* 33; PG_GB18030 */
747+ {0 ,pg_sjis_mblen ,pg_sjis_dsplen ,2 },/* 29; PG_SJIS */
748+ {0 ,pg_big5_mblen ,pg_big5_dsplen ,2 },/* 30; PG_BIG5 */
749+ {0 ,pg_gbk_mblen ,pg_gbk_dsplen ,2 },/* 31; PG_GBK */
750+ {0 ,pg_uhc_mblen ,pg_uhc_dsplen ,2 },/* 32; PG_UHC */
751+ {0 ,pg_gb18030_mblen ,pg_gb18030_dsplen ,2 }/* 33; PG_GB18030 */
783752};
784753
785754/* returns the byte length of a word for mule internal code */
@@ -853,48 +822,51 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
853822
854823while (len > 0 && * mbstr )
855824{
825+ /* special UTF-8 check */
826+ if (encoding == PG_UTF8 && (* mbstr & 0xf8 )== 0xf0 )
827+ {
828+ if (noError )
829+ return false;
830+ ereport (ERROR ,
831+ (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
832+ errmsg ("Unicode characters greater than or equal to 0x10000 are not supported" )));
833+ }
834+
856835l = pg_mblen (mbstr );
857836
858- /* special UTF-8 check */
859- if (encoding == PG_UTF8 ) {
860- if (!isLegalUTF8 (mbstr ,l )) {
861- if (noError )return false;
862- ereport (ERROR ,(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),errmsg ("Invalid UNICODE byte sequence detected near character %c" ,* mbstr )));
863- }
864- }else {
865- for (i = 1 ;i < l ;i ++ )
837+ for (i = 1 ;i < l ;i ++ )
838+ {
839+ /*
840+ * we expect that every multibyte char consists of bytes
841+ * having the 8th bit set
842+ */
843+ if (i >=len || (mbstr [i ]& 0x80 )== 0 )
866844{
867- /*
868- * we expect that every multibyte char consists of bytes
869- * having the 8th bit set
870- */
871- if (i >=len || (mbstr [i ]& 0x80 )== 0 )
872- {
873- char buf [8 * 2 + 1 ];
874- char * p = buf ;
875- int j ,
845+ char buf [8 * 2 + 1 ];
846+ char * p = buf ;
847+ int j ,
876848jlimit ;
877849
878- if (noError )
879- return false;
850+ if (noError )
851+ return false;
880852
881- jlimit = Min (l ,len );
882- jlimit = Min (jlimit ,8 );/* prevent buffer overrun */
853+ jlimit = Min (l ,len );
854+ jlimit = Min (jlimit ,8 );/* prevent buffer overrun */
883855
884- for (j = 0 ;j < jlimit ;j ++ )
885- p += sprintf (p ,"%02x" ,mbstr [j ]);
856+ for (j = 0 ;j < jlimit ;j ++ )
857+ p += sprintf (p ,"%02x" ,mbstr [j ]);
886858
887- ereport (ERROR ,
888- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
889- errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
890- GetDatabaseEncodingName (),buf )));
891- }
859+ ereport (ERROR ,
860+ (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
861+ errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
862+ GetDatabaseEncodingName (),buf )));
892863}
893-
894864}
865+
895866len -= l ;
896867mbstr += l ;
897868}
869+
898870return true;
899871}
900872