11/*
22 * conversion functions between pg_wchar and multibyte streams.
33 * Tatsuo Ishii
4- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.38 2004/09/17 21:59:57 petere Exp $
4+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $
55 *
66 * WIN1250 client encoding updated by Pavel Behal
77 *
@@ -343,6 +343,31 @@ pg_johab_dsplen(const unsigned char *s)
343343return (pg_euc_dsplen (s ));
344344}
345345
346+ bool isLegalUTF8 (const UTF8 * source ,int len ) {
347+ UTF8 a ;
348+ const UTF8 * srcptr = source + len ;
349+ if (!source || (pg_utf_mblen (source )!= len ))return false;
350+ switch (len ) {
351+ default :return false;
352+ /* Everything else falls through when "true"... */
353+ case 6 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
354+ case 5 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
355+ case 4 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
356+ case 3 :if ((a = (* -- srcptr ))< 0x80 || a > 0xBF )return false;
357+ case 2 :if ((a = (* -- srcptr ))> 0xBF )return false;
358+ switch (* source ) {
359+ /* no fall-through in this inner switch */
360+ case 0xE0 :if (a < 0xA0 )return false;break ;
361+ case 0xF0 :if (a < 0x90 )return false;break ;
362+ case 0xF4 :if (a > 0x8F )return false;break ;
363+ default :if (a < 0x80 )return false;
364+ }
365+ case 1 :if (* source >=0x80 && * source < 0xC2 )return false;
366+ if (* source > 0xFD )return false;
367+ }
368+ return true;
369+ }
370+
346371/*
347372 * convert UTF-8 string to pg_wchar (UCS-2)
348373 * caller should allocate enough space for "to"
@@ -398,21 +423,27 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
398423 * returns the byte length of a UTF-8 word pointed to by s
399424 */
400425int
401- pg_utf_mblen (const unsigned char * s )
426+ pg_utf_mblen (const UTF8 * s )
402427{
403428int len = 1 ;
404429
405430if ((* s & 0x80 )== 0 )
406431len = 1 ;
407432else if ((* s & 0xe0 )== 0xc0 )
408433len = 2 ;
409- else if ((* s & 0xe0 )== 0xe0 )
410- len = 3 ;
434+ else if ((* s & 0xf0 )== 0xe0 )
435+ len = 3 ;
436+ else if ((* s & 0xf8 )== 0xf0 )
437+ len = 4 ;
438+ else if ((* s & 0xfc )== 0xf8 )
439+ len = 5 ;
440+ else if ((* s & 0xfe )== 0xfc )
441+ len = 6 ;
411442return (len );
412443}
413444
414445static int
415- pg_utf_dsplen (const unsigned char * s )
446+ pg_utf_dsplen (const UTF8 * s )
416447{
417448return 1 ;/* XXX fix me! */
418449}
@@ -721,8 +752,8 @@ pg_wchar_tbl pg_wchar_table[] = {
721752{pg_euckr2wchar_with_len ,pg_euckr_mblen ,pg_euckr_dsplen ,3 },/* 3; PG_EUC_KR */
722753{pg_euctw2wchar_with_len ,pg_euctw_mblen ,pg_euctw_dsplen ,3 },/* 4; PG_EUC_TW */
723754{pg_johab2wchar_with_len ,pg_johab_mblen ,pg_johab_dsplen ,3 },/* 5; PG_JOHAB */
724- {pg_utf2wchar_with_len ,pg_utf_mblen ,pg_utf_dsplen ,3 }, /* 6; PG_UNICODE */
725- {pg_mule2wchar_with_len ,pg_mule_mblen ,pg_mule_dsplen ,3 },/* 7; PG_MULE_INTERNAL */
755+ {pg_utf2wchar_with_len ,pg_utf_mblen ,pg_utf_dsplen ,6 }, /* 6; PG_UNICODE */
756+ {pg_mule2wchar_with_len ,pg_mule_mblen ,pg_mule_dsplen ,3 },/* 7; PG_MULE_INTERNAL */
726757{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 8; PG_LATIN1 */
727758{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 9; PG_LATIN2 */
728759{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 10; PG_LATIN3 */
@@ -744,11 +775,11 @@ pg_wchar_tbl pg_wchar_table[] = {
744775{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 26; ISO-8859-7 */
745776{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 27; ISO-8859-8 */
746777{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 28; PG_WIN1250 */
747- {0 ,pg_sjis_mblen ,pg_sjis_dsplen ,2 },/* 29; PG_SJIS */
748- {0 ,pg_big5_mblen ,pg_big5_dsplen ,2 },/* 30; PG_BIG5 */
749- {0 ,pg_gbk_mblen ,pg_gbk_dsplen ,2 },/* 31; PG_GBK */
750- {0 ,pg_uhc_mblen ,pg_uhc_dsplen ,2 },/* 32; PG_UHC */
751- {0 ,pg_gb18030_mblen ,pg_gb18030_dsplen ,2 }/* 33; PG_GB18030 */
778+ {0 ,pg_sjis_mblen ,pg_sjis_dsplen ,2 },/* 29; PG_SJIS */
779+ {0 ,pg_big5_mblen ,pg_big5_dsplen ,2 },/* 30; PG_BIG5 */
780+ {0 ,pg_gbk_mblen ,pg_gbk_dsplen ,2 },/* 31; PG_GBK */
781+ {0 ,pg_uhc_mblen ,pg_uhc_dsplen ,2 },/* 32; PG_UHC */
782+ {0 ,pg_gb18030_mblen ,pg_gb18030_dsplen ,2 }/* 33; PG_GB18030 */
752783};
753784
754785/* returns the byte length of a word for mule internal code */
@@ -822,51 +853,48 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
822853
823854while (len > 0 && * mbstr )
824855{
825- /* special UTF-8 check */
826- if (encoding == PG_UTF8 && (* mbstr & 0xf8 )== 0xf0 )
827- {
828- if (noError )
829- return false;
830- ereport (ERROR ,
831- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
832- errmsg ("Unicode characters greater than or equal to 0x10000 are not supported" )));
833- }
834-
835856l = pg_mblen (mbstr );
836857
837- for (i = 1 ;i < l ;i ++ )
838- {
839- /*
840- * we expect that every multibyte char consists of bytes
841- * having the 8th bit set
842- */
843- if (i >=len || (mbstr [i ]& 0x80 )== 0 )
858+ /* special UTF-8 check */
859+ if (encoding == PG_UTF8 ) {
860+ if (!isLegalUTF8 (mbstr ,l )) {
861+ if (noError )return false;
862+ ereport (ERROR ,(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),errmsg ("Invalid UNICODE byte sequence detected near character %c" ,* mbstr )));
863+ }
864+ }else {
865+ for (i = 1 ;i < l ;i ++ )
844866{
845- char buf [8 * 2 + 1 ];
846- char * p = buf ;
847- int j ,
867+ /*
868+ * we expect that every multibyte char consists of bytes
869+ * having the 8th bit set
870+ */
871+ if (i >=len || (mbstr [i ]& 0x80 )== 0 )
872+ {
873+ char buf [8 * 2 + 1 ];
874+ char * p = buf ;
875+ int j ,
848876jlimit ;
849877
850- if (noError )
851- return false;
878+ if (noError )
879+ return false;
852880
853- jlimit = Min (l ,len );
854- jlimit = Min (jlimit ,8 );/* prevent buffer overrun */
881+ jlimit = Min (l ,len );
882+ jlimit = Min (jlimit ,8 );/* prevent buffer overrun */
855883
856- for (j = 0 ;j < jlimit ;j ++ )
857- p += sprintf (p ,"%02x" ,mbstr [j ]);
884+ for (j = 0 ;j < jlimit ;j ++ )
885+ p += sprintf (p ,"%02x" ,mbstr [j ]);
858886
859- ereport (ERROR ,
860- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
861- errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
862- GetDatabaseEncodingName (),buf )));
887+ ereport (ERROR ,
888+ (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
889+ errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
890+ GetDatabaseEncodingName (),buf )));
891+ }
863892}
864- }
865893
894+ }
866895len -= l ;
867896mbstr += l ;
868897}
869-
870898return true;
871899}
872900