Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5955945

Browse files
committed
Support 3 and 4-byte unicode characters.
John Hansen
1 parent f4c4f1c, commit 5955945

File tree

3 files changed

+76
-40
lines changed

3 files changed

+76
-40
lines changed

‎src/backend/utils/mb/conv.c

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
* Portions Copyright (c) 1994, Regents of the University of California
77
*
88
* IDENTIFICATION
9-
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.52 2005/03/07 04:30:52 momjian Exp $
9+
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.53 2005/06/15 00:15:08 momjian Exp $
1010
*
1111
*-------------------------------------------------------------------------
1212
*/
@@ -361,12 +361,19 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
361361
iutf=*utf++ <<8;
362362
iutf |=*utf++;
363363
}
364-
else
364+
elseif (l==3)
365365
{
366366
iutf=*utf++ <<16;
367367
iutf |=*utf++ <<8;
368368
iutf |=*utf++;
369369
}
370+
elseif (l==4)
371+
{
372+
iutf=*utf++ <<24;
373+
iutf |=*utf++ <<16;
374+
iutf |=*utf++ <<8;
375+
iutf |=*utf++;
376+
}
370377
p=bsearch(&iutf,map,size,
371378
sizeof(pg_utf_to_local),compare1);
372379
if (p==NULL)

‎src/backend/utils/mb/wchar.c

Lines changed: 64 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
/*
22
* conversion functions between pg_wchar and multibyte streams.
33
* Tatsuo Ishii
4-
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.43 2005/03/14 18:31:20 momjian Exp $
4+
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.44 2005/06/15 00:15:08 momjian Exp $
55
*
66
* WIN1250 client encoding updated by Pavel Behal
77
*
@@ -406,8 +406,14 @@ pg_utf_mblen(const unsigned char *s)
406406
len=1;
407407
elseif ((*s&0xe0)==0xc0)
408408
len=2;
409-
elseif ((*s&0xe0)==0xe0)
410-
len=3;
409+
elseif ((*s&0xf0)==0xe0)
410+
len=3;
411+
elseif ((*s&0xf8)==0xf0)
412+
len=4;
413+
elseif ((*s&0xfc)==0xf8)
414+
len=5;
415+
elseif ((*s&0xfe)==0xfc)
416+
len=6;
411417
return (len);
412418
}
413419

@@ -721,7 +727,7 @@ pg_wchar_tbl pg_wchar_table[] = {
721727
{pg_euckr2wchar_with_len,pg_euckr_mblen,pg_euckr_dsplen,3},/* 3; PG_EUC_KR */
722728
{pg_euctw2wchar_with_len,pg_euctw_mblen,pg_euctw_dsplen,3},/* 4; PG_EUC_TW */
723729
{pg_johab2wchar_with_len,pg_johab_mblen,pg_johab_dsplen,3},/* 5; PG_JOHAB */
724-
{pg_utf2wchar_with_len,pg_utf_mblen,pg_utf_dsplen,3},/* 6; PG_UTF8 */
730+
{pg_utf2wchar_with_len,pg_utf_mblen,pg_utf_dsplen,4},/* 6; PG_UTF8 */
725731
{pg_mule2wchar_with_len,pg_mule_mblen,pg_mule_dsplen,3},/* 7; PG_MULE_INTERNAL */
726732
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 8; PG_LATIN1 */
727733
{pg_latin12wchar_with_len,pg_latin1_mblen,pg_latin1_dsplen,1},/* 9; PG_LATIN2 */
@@ -800,6 +806,31 @@ pg_encoding_max_length(int encoding)
800806

801807
#ifndefFRONTEND
802808

809+
/*
 * pg_utf8_islegal
 *
 * Check whether the `length` bytes starting at `source` form one legal
 * UTF-8 encoded character.
 *
 * source: first byte of the candidate sequence; bytes source[0] through
 *         source[length - 1] are read, so the caller must guarantee that
 *         many bytes are available.
 * length: number of bytes in the sequence.  Only 1..4 can be legal; any
 *         other value (including 0, negatives, 5 and 6) is rejected.
 *
 * Returns true if the sequence is legal, false otherwise.
 *
 * Beyond requiring continuation bytes in 0x80..0xBF, the second byte is
 * range-checked per lead byte so that these are rejected:
 *   - overlong 3- and 4-byte forms (lead 0xE0 / 0xF0),
 *   - UTF-16 surrogates U+D800..U+DFFF (lead 0xED),
 *   - code points above U+10FFFF (lead 0xF4, and any lead > 0xF4).
 *
 * The function does not verify that `length` matches the length implied by
 * the lead byte; callers are expected to derive `length` from the lead byte.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
    unsigned char a;

    switch (length)
    {
        default:
            /* only 1- to 4-byte sequences can be legal */
            return false;

        case 4:
            a = source[3];
            if (a < 0x80 || a > 0xBF)
                return false;
            /* FALL THRU */

        case 3:
            a = source[2];
            if (a < 0x80 || a > 0xBF)
                return false;
            /* FALL THRU */

        case 2:
            a = source[1];

            /*
             * Every arm checks BOTH bounds.  Checking only the upper bound
             * for 0xED / 0xF4 would let a non-continuation second byte
             * (e.g. ED 41 80) slip through as "legal".
             */
            switch (*source)
            {
                case 0xE0:
                    if (a < 0xA0 || a > 0xBF)
                        return false;
                    break;
                case 0xED:
                    if (a < 0x80 || a > 0x9F)
                        return false;
                    break;
                case 0xF0:
                    if (a < 0x90 || a > 0xBF)
                        return false;
                    break;
                case 0xF4:
                    if (a < 0x80 || a > 0x8F)
                        return false;
                    break;
                default:
                    if (a < 0x80 || a > 0xBF)
                        return false;
                    break;
            }
            /* FALL THRU */

        case 1:
            a = *source;
            /* 0x80..0xBF are bare continuation bytes; 0xC0/0xC1 are overlong */
            if (a >= 0x80 && a < 0xC2)
                return false;
            /* lead bytes above 0xF4 would encode values beyond U+10FFFF */
            if (a > 0xF4)
                return false;
            break;
    }

    return true;
}
832+
833+
803834
/*
804835
* Verify mbstr to make sure that it has a valid character sequence.
805836
* mbstr is not necessarily NULL terminated; length of mbstr is
@@ -823,51 +854,47 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
823854

824855
while (len>0&&*mbstr)
825856
{
826-
/* special UTF8 check */
827-
if (encoding==PG_UTF8&& (*mbstr&0xf8)==0xf0)
828-
{
829-
if (noError)
830-
return false;
831-
ereport(ERROR,
832-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
833-
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
834-
}
835-
836857
l=pg_mblen(mbstr);
837-
838-
for (i=1;i<l;i++)
839-
{
840-
/*
841-
* we expect that every multibyte char consists of bytes
842-
* having the 8th bit set
843-
*/
844-
if (i >=len|| (mbstr[i]&0x80)==0)
858+
859+
/* special UTF-8 check */
860+
if (encoding==PG_UTF8) {
861+
if(!pg_utf8_islegal(mbstr,l)) {
862+
if (noError)return false;
863+
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near byte %c",*mbstr)));
864+
}
865+
}else {
866+
for (i=1;i<l;i++)
845867
{
846-
charbuf[8*2+1];
847-
char*p=buf;
848-
intj,
868+
/*
869+
* we expect that every multibyte char consists of bytes
870+
* having the 8th bit set
871+
*/
872+
if (i >=len|| (mbstr[i]&0x80)==0)
873+
{
874+
charbuf[8*2+1];
875+
char*p=buf;
876+
intj,
849877
jlimit;
850878

851-
if (noError)
852-
return false;
879+
if (noError)
880+
return false;
853881

854-
jlimit=Min(l,len);
855-
jlimit=Min(jlimit,8);/* prevent buffer overrun */
882+
jlimit=Min(l,len);
883+
jlimit=Min(jlimit,8);/* prevent buffer overrun */
856884

857-
for (j=0;j<jlimit;j++)
858-
p+=sprintf(p,"%02x",mbstr[j]);
885+
for (j=0;j<jlimit;j++)
886+
p+=sprintf(p,"%02x",mbstr[j]);
859887

860-
ereport(ERROR,
861-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
862-
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
863-
GetDatabaseEncodingName(),buf)));
888+
ereport(ERROR,
889+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
890+
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
891+
GetDatabaseEncodingName(),buf)));
892+
}
864893
}
865894
}
866-
867895
len-=l;
868896
mbstr+=l;
869897
}
870-
871898
return true;
872899
}
873900

‎src/include/mb/pg_wchar.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.58 2005/03/14 18:31:24 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.59 2005/06/15 00:15:08 momjian Exp $ */
22

33
#ifndefPG_WCHAR_H
44
#definePG_WCHAR_H
@@ -340,4 +340,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
340340
/*
 * Table-driven conversion helpers; both take a source buffer, a destination
 * buffer, a byte count, a leading-character code and a lookup table.
 * NOTE(review): parameter roles are inferred from the names only -- confirm
 * against the definitions.
 */
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
342342

343+
/*
 * Returns true when the `length` bytes at `source` form one legal UTF-8
 * character; reads source[0] through source[length - 1].
 */
extern bool pg_utf8_islegal(const unsigned char *source, int length);
344+
343345
#endif/* PG_WCHAR_H */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp