11/*
22 * conversion functions between pg_wchar and multibyte streams.
33 * Tatsuo Ishii
4- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.43 2005/03/14 18:31:20 momjian Exp $
4+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.44 2005/06/15 00:15:08 momjian Exp $
55 *
66 * WIN1250 client encoding updated by Pavel Behal
77 *
@@ -406,8 +406,14 @@ pg_utf_mblen(const unsigned char *s)
406406len = 1 ;
407407else if ((* s & 0xe0 )== 0xc0 )
408408len = 2 ;
409- else if ((* s & 0xe0 )== 0xe0 )
410- len = 3 ;
409+ else if ((* s & 0xf0 )== 0xe0 )
410+ len = 3 ;
411+ else if ((* s & 0xf8 )== 0xf0 )
412+ len = 4 ;
413+ else if ((* s & 0xfc )== 0xf8 )
414+ len = 5 ;
415+ else if ((* s & 0xfe )== 0xfc )
416+ len = 6 ;
411417return (len );
412418}
413419
@@ -721,7 +727,7 @@ pg_wchar_tbl pg_wchar_table[] = {
721727{pg_euckr2wchar_with_len ,pg_euckr_mblen ,pg_euckr_dsplen ,3 },/* 3; PG_EUC_KR */
722728{pg_euctw2wchar_with_len ,pg_euctw_mblen ,pg_euctw_dsplen ,3 },/* 4; PG_EUC_TW */
723729{pg_johab2wchar_with_len ,pg_johab_mblen ,pg_johab_dsplen ,3 },/* 5; PG_JOHAB */
724- {pg_utf2wchar_with_len ,pg_utf_mblen ,pg_utf_dsplen ,3 }, /* 6; PG_UTF8 */
730+ {pg_utf2wchar_with_len ,pg_utf_mblen ,pg_utf_dsplen ,4 }, /* 6; PG_UTF8 */
725731{pg_mule2wchar_with_len ,pg_mule_mblen ,pg_mule_dsplen ,3 },/* 7; PG_MULE_INTERNAL */
726732{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 8; PG_LATIN1 */
727733{pg_latin12wchar_with_len ,pg_latin1_mblen ,pg_latin1_dsplen ,1 },/* 9; PG_LATIN2 */
@@ -800,6 +806,31 @@ pg_encoding_max_length(int encoding)
800806
801807#ifndef FRONTEND
802808
/*
 * pg_utf8_islegal
 *
 * Check whether the "length" bytes starting at "source" form one
 * well-formed UTF-8 character.
 *
 * source: pointer to the first (lead) byte; at least "length" bytes must
 *		   be readable.
 * length: number of bytes claimed for the character.  Only 1..4 can be
 *		   legal; every other value (including the obsolete 5- and 6-byte
 *		   forms) is rejected.
 *
 * Returns true if the sequence is legal, false otherwise.
 *
 * Rejected are: stray continuation bytes and the overlong lead bytes
 * 0xC0/0xC1 in the lead position, lead bytes above 0xF4, trailing bytes
 * outside 0x80..0xBF, overlong 3- and 4-byte forms (0xE0/0xF0 with a too
 * small second byte), UTF-16 surrogates (0xED with second byte > 0x9F) and
 * code points above U+10FFFF (0xF4 with second byte > 0x8F).
 *
 * Note: this routine does not cross-check that "length" is the length
 * implied by the lead byte; the caller is expected to pass a matching
 * value.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char a;

	switch (length)
	{
		default:
			/* anything other than 1..4 bytes cannot be legal */
			return false;
		case 4:
			a = source[3];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */
		case 3:
			a = source[2];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */
		case 2:
			a = source[1];

			/*
			 * The valid range of the second byte depends on the lead
			 * byte.  Every arm checks both bounds so that a
			 * non-continuation byte (< 0x80) is never accepted.
			 */
			switch (*source)
			{
				case 0xE0:
					/* exclude overlong 3-byte forms */
					if (a < 0xA0 || a > 0xBF)
						return false;
					break;
				case 0xED:
					/* exclude UTF-16 surrogates */
					if (a < 0x80 || a > 0x9F)
						return false;
					break;
				case 0xF0:
					/* exclude overlong 4-byte forms */
					if (a < 0x90 || a > 0xBF)
						return false;
					break;
				case 0xF4:
					/* exclude code points above U+10FFFF */
					if (a < 0x80 || a > 0x8F)
						return false;
					break;
				default:
					if (a < 0x80 || a > 0xBF)
						return false;
					break;
			}
			/* FALL THRU */
		case 1:
			a = *source;
			/* continuation byte or overlong 2-byte lead in lead position */
			if (a >= 0x80 && a < 0xC2)
				return false;
			/* lead bytes above 0xF4 would encode beyond U+10FFFF */
			if (a > 0xF4)
				return false;
			break;
	}

	return true;
}
832+
833+
803834/*
804835 * Verify mbstr to make sure that it has a valid character sequence.
805836 * mbstr is not necessarily NULL terminated; length of mbstr is
@@ -823,51 +854,47 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
823854
824855while (len > 0 && * mbstr )
825856{
826- /* special UTF8 check */
827- if (encoding == PG_UTF8 && (* mbstr & 0xf8 )== 0xf0 )
828- {
829- if (noError )
830- return false;
831- ereport (ERROR ,
832- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
833- errmsg ("Unicode characters greater than or equal to 0x10000 are not supported" )));
834- }
835-
836857l = pg_mblen (mbstr );
837-
838- for (i = 1 ;i < l ;i ++ )
839- {
840- /*
841- * we expect that every multibyte char consists of bytes
842- * having the 8th bit set
843- */
844- if (i >=len || (mbstr [i ]& 0x80 )== 0 )
858+
859+ /* special UTF-8 check */
860+ if (encoding == PG_UTF8 ) {
861+ if (!pg_utf8_islegal (mbstr ,l )) {
862+ if (noError )return false;
863+ ereport (ERROR ,(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),errmsg ("Invalid UNICODE byte sequence detected near byte %c" ,* mbstr )));
864+ }
865+ }else {
866+ for (i = 1 ;i < l ;i ++ )
845867{
846- char buf [8 * 2 + 1 ];
847- char * p = buf ;
848- int j ,
868+ /*
869+ * we expect that every multibyte char consists of bytes
870+ * having the 8th bit set
871+ */
872+ if (i >=len || (mbstr [i ]& 0x80 )== 0 )
873+ {
874+ char buf [8 * 2 + 1 ];
875+ char * p = buf ;
876+ int j ,
849877jlimit ;
850878
851- if (noError )
852- return false;
879+ if (noError )
880+ return false;
853881
854- jlimit = Min (l ,len );
855- jlimit = Min (jlimit ,8 );/* prevent buffer overrun */
882+ jlimit = Min (l ,len );
883+ jlimit = Min (jlimit ,8 );/* prevent buffer overrun */
856884
857- for (j = 0 ;j < jlimit ;j ++ )
858- p += sprintf (p ,"%02x" ,mbstr [j ]);
885+ for (j = 0 ;j < jlimit ;j ++ )
886+ p += sprintf (p ,"%02x" ,mbstr [j ]);
859887
860- ereport (ERROR ,
861- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
862- errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
863- GetDatabaseEncodingName (),buf )));
888+ ereport (ERROR ,
889+ (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
890+ errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
891+ GetDatabaseEncodingName (),buf )));
892+ }
864893}
865894}
866-
867895len -= l ;
868896mbstr += l ;
869897}
870-
871898return true;
872899}
873900