Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 4ea4f8b

Browse files
committed
Fix for Unicode characters above 0x10000.
John Hansen
1 parent 917c8bb, commit 4ea4f8b

File tree

2 files changed

+83
-45
lines changed

2 files changed

+83
-45
lines changed

‎src/backend/utils/mb/wchar.c

Lines changed: 72 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
/*
22
* conversion functions between pg_wchar and multibyte streams.
33
* Tatsuo Ishii
4-
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.38 2004/09/17 21:59:57 petere Exp $
4+
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $
55
*
66
* WIN1250 client encoding updated by Pavel Behal
77
*
@@ -343,6 +343,31 @@ pg_johab_dsplen(const unsigned char *s)
343343
return (pg_euc_dsplen(s));
344344
}
345345

346+
boolisLegalUTF8(constUTF8*source,intlen) {
347+
UTF8a;
348+
constUTF8*srcptr=source+len;
349+
if(!source|| (pg_utf_mblen(source)!=len))return false;
350+
switch (len) {
351+
default:return false;
352+
/* Everything else falls through when "true"... */
353+
case6:if ((a= (*--srcptr))<0x80||a>0xBF)return false;
354+
case5:if ((a= (*--srcptr))<0x80||a>0xBF)return false;
355+
case4:if ((a= (*--srcptr))<0x80||a>0xBF)return false;
356+
case3:if ((a= (*--srcptr))<0x80||a>0xBF)return false;
357+
case2:if ((a= (*--srcptr))>0xBF)return false;
358+
switch (*source) {
359+
/* no fall-through in this inner switch */
360+
case0xE0:if (a<0xA0)return false;break;
361+
case0xF0:if (a<0x90)return false;break;
362+
case0xF4:if (a>0x8F)return false;break;
363+
default:if (a<0x80)return false;
364+
}
365+
case1:if (*source >=0x80&&*source<0xC2)return false;
366+
if (*source>0xFD)return false;
367+
}
368+
return true;
369+
}
370+
346371
/*
347372
* convert UTF-8 string to pg_wchar (UCS-2)
348373
* caller should allocate enough space for "to"
@@ -398,21 +423,27 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
398423
* returns the byte length of a UTF-8 word pointed to by s
399424
*/
400425
int
401-
pg_utf_mblen(constunsignedchar*s)
426+
pg_utf_mblen(constUTF8*s)
402427
{
403428
intlen=1;
404429

405430
if ((*s&0x80)==0)
406431
len=1;
407432
elseif ((*s&0xe0)==0xc0)
408433
len=2;
409-
elseif ((*s&0xe0)==0xe0)
410-
len=3;
434+
elseif ((*s&0xf0)==0xe0)
435+
len=3;
436+
elseif ((*s&0xf8)==0xf0)
437+
len=4;
438+
elseif ((*s&0xfc)==0xf8)
439+
len=5;
440+
elseif ((*s&0xfe)==0xfc)
441+
len=6;
411442
return (len);
412443
}
413444

414445
staticint
415-
pg_utf_dsplen(constunsignedchar*s)
446+
pg_utf_dsplen(constUTF8*s)
416447
{
417448
return1;/* XXX fix me! */
418449
}
@@ -721,8 +752,8 @@ pg_wchar_tbl pg_wchar_table[] = {
721752
{pg_euckr2wchar_with_len,pg_euckr_mblen,pg_euckr_dsplen,3},/* 3; PG_EUC_KR */
722753
{pg_euctw2wchar_with_len,pg_euctw_mblen,pg_euctw_dsplen,3},/* 4; PG_EUC_TW */
723754
{pg_johab2wchar_with_len,pg_johab_mblen,pg_johab_dsplen,3},/* 5; PG_JOHAB */
724-
{pg_utf2wchar_with_len,pg_utf_mblen,pg_utf_dsplen,3},/* 6; PG_UNICODE */
725-
{pg_mule2wchar_with_len,pg_mule_mblen,pg_mule_dsplen,3},/* 7; PG_MULE_INTERNAL */
755+
{pg_utf2wchar_with_len,pg_utf_mblen,pg_utf_dsplen,6},/* 6; PG_UNICODE */
756+
{pg_mule2wchar_with_len,pg_mule_mblen,pg_mule_dsplen,3},/* 7; PG_MULE_INTERNAL */
726757
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 8; PG_LATIN1 */
727758
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 9; PG_LATIN2 */
728759
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 10; PG_LATIN3 */
@@ -744,11 +775,11 @@ pg_wchar_tbl pg_wchar_table[] = {
744775
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 26; ISO-8859-7 */
745776
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 27; ISO-8859-8 */
746777
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 28; PG_WIN1250 */
747-
{0,pg_sjis_mblen,pg_sjis_dsplen,2},/* 29; PG_SJIS */
748-
{0,pg_big5_mblen,pg_big5_dsplen,2},/* 30; PG_BIG5 */
749-
{0,pg_gbk_mblen,pg_gbk_dsplen,2},/* 31; PG_GBK */
750-
{0,pg_uhc_mblen,pg_uhc_dsplen,2},/* 32; PG_UHC */
751-
{0,pg_gb18030_mblen,pg_gb18030_dsplen,2}/* 33; PG_GB18030 */
778+
{0,pg_sjis_mblen,pg_sjis_dsplen,2},/* 29; PG_SJIS */
779+
{0,pg_big5_mblen,pg_big5_dsplen,2},/* 30; PG_BIG5 */
780+
{0,pg_gbk_mblen,pg_gbk_dsplen,2},/* 31; PG_GBK */
781+
{0,pg_uhc_mblen,pg_uhc_dsplen,2},/* 32; PG_UHC */
782+
{0,pg_gb18030_mblen,pg_gb18030_dsplen,2}/* 33; PG_GB18030 */
752783
};
753784

754785
/* returns the byte length of a word for mule internal code */
@@ -822,51 +853,48 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
822853

823854
while (len>0&&*mbstr)
824855
{
825-
/* special UTF-8 check */
826-
if (encoding==PG_UTF8&& (*mbstr&0xf8)==0xf0)
827-
{
828-
if (noError)
829-
return false;
830-
ereport(ERROR,
831-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
832-
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
833-
}
834-
835856
l=pg_mblen(mbstr);
836857

837-
for (i=1;i<l;i++)
838-
{
839-
/*
840-
* we expect that every multibyte char consists of bytes
841-
* having the 8th bit set
842-
*/
843-
if (i >=len|| (mbstr[i]&0x80)==0)
858+
/* special UTF-8 check */
859+
if (encoding==PG_UTF8) {
860+
if(!isLegalUTF8(mbstr,l)) {
861+
if (noError)return false;
862+
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr)));
863+
}
864+
}else {
865+
for (i=1;i<l;i++)
844866
{
845-
charbuf[8*2+1];
846-
char*p=buf;
847-
intj,
867+
/*
868+
* we expect that every multibyte char consists of bytes
869+
* having the 8th bit set
870+
*/
871+
if (i >=len|| (mbstr[i]&0x80)==0)
872+
{
873+
charbuf[8*2+1];
874+
char*p=buf;
875+
intj,
848876
jlimit;
849877

850-
if (noError)
851-
return false;
878+
if (noError)
879+
return false;
852880

853-
jlimit=Min(l,len);
854-
jlimit=Min(jlimit,8);/* prevent buffer overrun */
881+
jlimit=Min(l,len);
882+
jlimit=Min(jlimit,8);/* prevent buffer overrun */
855883

856-
for (j=0;j<jlimit;j++)
857-
p+=sprintf(p,"%02x",mbstr[j]);
884+
for (j=0;j<jlimit;j++)
885+
p+=sprintf(p,"%02x",mbstr[j]);
858886

859-
ereport(ERROR,
860-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
861-
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
862-
GetDatabaseEncodingName(),buf)));
887+
ereport(ERROR,
888+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
889+
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
890+
GetDatabaseEncodingName(),buf)));
891+
}
863892
}
864-
}
865893

894+
}
866895
len-=l;
867896
mbstr+=l;
868897
}
869-
870898
return true;
871899
}
872900

‎src/include/mb/pg_wchar.h

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.53 2004/12/02 22:14:38 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.54 2004/12/02 22:37:14 momjian Exp $ */
22

33
#ifndefPG_WCHAR_H
44
#definePG_WCHAR_H
@@ -17,6 +17,14 @@
1717
*/
1818
typedefunsignedintpg_wchar;
1919

20+
21+
/*
22+
* The UTF types
23+
*/
24+
typedefunsignedintUTF32;/* at least 32 bits */
25+
typedefunsigned shortUTF16;/* at least 16 bits */
26+
typedefunsignedcharUTF8;/* typically 8 bits */
27+
2028
/*
2129
* various definitions for EUC
2230
*/
@@ -340,4 +348,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
340348
externvoidlatin2mic_with_table(unsignedchar*l,unsignedchar*p,intlen,intlc,unsignedchar*tab);
341349
externvoidmic2latin_with_table(unsignedchar*mic,unsignedchar*p,intlen,intlc,unsignedchar*tab);
342350

351+
externboolisLegalUTF8(constUTF8*source,intlen);
352+
343353
#endif/* PG_WCHAR_H */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp