Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 09022de

Browse files
committed
Improve documentation about MULE encoding.
This commit improves the comments in pg_wchar.h and creates #define symbols for some formerly hard-coded values. No substantive code changes. Tatsuo Ishii and Tom Lane
1 parent 47a2adc, commit 09022de

File tree

3 files changed

+110
-58
lines changed

3 files changed

+110
-58
lines changed

‎src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -168,7 +168,8 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
168168
*p++=LC_CNS11643_2;
169169
else
170170
{
171-
*p++=0x9d;/* LCPRV2 */
171+
/* other planes are MULE private charsets */
172+
*p++=LCPRV2_B;
172173
*p++=c1-0xa3+LC_CNS11643_3;
173174
}
174175
*p++=euc[2];
@@ -235,9 +236,9 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
235236
*p++=mic[1];
236237
*p++=mic[2];
237238
}
238-
else if (c1 == 0x9d &&
239+
else if (c1 == LCPRV2_B &&
239240
mic[1] >=LC_CNS11643_3&&mic[1] <=LC_CNS11643_7)
240-
{/* LCPRV2? */
241+
{
241242
*p++=SS2;
242243
*p++=mic[1]-LC_CNS11643_3+0xa3;
243244
*p++=mic[2];
@@ -286,10 +287,9 @@ big52mic(const unsigned char *big5, unsigned char *p, int len)
286287
cnsBuf=BIG5toCNS(big5buf,&lc);
287288
if (lc!=0)
288289
{
290+
/* Planes 3 and 4 are MULE private charsets */
289291
if (lc==LC_CNS11643_3||lc==LC_CNS11643_4)
290-
{
291-
*p++=0x9d;/* LCPRV2 */
292-
}
292+
*p++=LCPRV2_B;
293293
*p++=lc;/* Plane No. */
294294
*p++= (cnsBuf >>8)&0x00ff;
295295
*p++=cnsBuf&0x00ff;
@@ -332,10 +332,9 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len)
332332
if (l<0)
333333
report_invalid_encoding(PG_MULE_INTERNAL,
334334
(const char *) mic, len);
335-
/* 0x9d means LCPRV2 */
336-
if (c1==LC_CNS11643_1||c1==LC_CNS11643_2||c1==0x9d)
335+
if (c1==LC_CNS11643_1||c1==LC_CNS11643_2||c1==LCPRV2_B)
337336
{
338-
if (c1==0x9d)
337+
if (c1==LCPRV2_B)
339338
{
340339
c1=mic[1];/* get plane no. */
341340
cnsBuf= (mic[2] <<8) |mic[3];

‎src/backend/utils/mb/wchar.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,12 @@ pg_mule_dsplen(const unsigned char *s)
742742
{
743743
int len;
744744

745+
/*
746+
* Note: it's not really appropriate to assume that all multibyte charsets
747+
* are double-wide on screen. But this seems an okay approximation for
748+
* the MULE charsets we currently support.
749+
*/
750+
745751
if (IS_LC1(*s))
746752
len=1;
747753
else if (IS_LCPRV1(*s))

‎src/include/mb/pg_wchar.h

Lines changed: 96 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -36,36 +36,60 @@ typedef unsigned int pg_wchar;
3636
#defineISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
3737
#defineISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
3838

39-
/*
40-
* Leading byte types or leading prefix byte for MULE internal code.
41-
* See http://www.xemacs.org for more details.(there is a doc titled
42-
* "XEmacs Internals Manual", "MULE Character Sets and Encodings"
43-
* section.)
44-
*/
45-
/*
46-
* Is a leading byte for "official" single byte encodings?
47-
*/
48-
#defineIS_LC1(c)((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)
49-
/*
50-
* Is a prefix byte for "private" single byte encodings?
51-
*/
52-
#defineIS_LCPRV1(c)((unsigned char)(c) == 0x9a || (unsigned char)(c) == 0x9b)
53-
/*
54-
* Is a leading byte for "official" multibyte encodings?
55-
*/
56-
#defineIS_LC2(c)((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)
57-
/*
58-
* Is a prefix byte for "private" multibyte encodings?
59-
*/
60-
#defineIS_LCPRV2(c)((unsigned char)(c) == 0x9c || (unsigned char)(c) == 0x9d)
61-
6239
/*----------------------------------------------------
63-
* leading characters
40+
* MULE Internal Encoding (MIC)
41+
*
42+
* This encoding follows the design used within XEmacs; it is meant to
43+
* subsume many externally-defined character sets. Each character includes
44+
* identification of the character set it belongs to, so the encoding is
45+
* general but somewhat bulky.
46+
*
47+
* Currently PostgreSQL supports 5 types of MULE character sets:
48+
*
49+
* 1) 1-byte ASCII characters. Each byte is below 0x80.
50+
*
51+
* 2) "Official" single byte charsets such as ISO-8859-1 (Latin1).
52+
* Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is
53+
* an identifier for the charset (in the range 0x81 to 0x8d) and C1
54+
* is the character code (in the range 0xa0 to 0xff).
55+
*
56+
* 3) "Private" single byte charsets such as SISHENG. Each MULE
57+
* character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1
58+
* is a private-charset flag, LC12 is an identifier for the charset,
59+
* and C1 is the character code (in the range 0xa0 to 0xff).
60+
* LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf)
61+
* or 0x9b (if LC12 is in the range 0xe0 to 0xef).
62+
*
63+
* 4) "Official" multibyte charsets such as JIS X0208. Each MULE
64+
* character consists of 3 bytes: LC2 + C1 + C2, where LC2 is
65+
* an identifier for the charset (in the range 0x90 to 0x99) and C1
66+
* and C2 form the character code (each in the range 0xa0 to 0xff).
67+
*
68+
* 5) "Private" multibyte charsets such as CNS 11643-1992 Plane 3.
69+
* Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2,
70+
* where LCPRV2 is a private-charset flag, LC22 is an identifier for
71+
* the charset, and C1 and C2 form the character code (each in the range
72+
* 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0
73+
* to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe).
74+
*
75+
* "Official" encodings are those that have been assigned code numbers by
76+
* the XEmacs project; "private" encodings have Postgres-specific charset
77+
* identifiers.
78+
*
79+
* See the "XEmacs Internals Manual", available at http://www.xemacs.org,
80+
* for more details. Note that for historical reasons, Postgres'
81+
* private-charset flag values do not match what XEmacs says they should be,
82+
* so this isn't really exactly MULE (not that private charsets would be
83+
* interoperable anyway).
6484
*----------------------------------------------------
6585
*/
6686

6787
/*
68-
* Official single byte encodings (0x81-0x8e)
88+
* Charset identifiers (also called "leading bytes" in the MULE documentation)
89+
*/
90+
91+
/*
92+
* Charset IDs for official single byte encodings (0x81-0x8e)
6993
*/
7094
#defineLC_ISO8859_10x81/* ISO8859 Latin 1 */
7195
#defineLC_ISO8859_20x82/* ISO8859 Latin 2 */
@@ -79,21 +103,19 @@ typedef unsigned int pg_wchar;
79103
#defineLC_JISX0201R0x8a/* Japanese 1 byte Roman */
80104
/* Note that 0x8b seems to be unused as of Emacs 20.7.
81105
* However, there might be a chance that 0x8b could be used
82-
* in later version of Emacs.
106+
* in later versions of Emacs.
83107
*/
84108
#defineLC_KOI8_R0x8b/* Cyrillic KOI8-R */
85-
#defineLC_KOI8_U0x8b/* Cyrillic KOI8-U */
86109
#defineLC_ISO8859_50x8c/* ISO8859 Cyrillic */
87110
#defineLC_ISO8859_90x8d/* ISO8859 Latin 5 (not supported yet) */
88111
/* #define FREE0x8efree (unused) */
112+
/* #define CONTROL_10x8fcontrol characters (unused) */
89113

90-
/*
91-
* Unused
92-
*/
93-
#defineCONTROL_10x8f/* control characters (unused) */
114+
/* Is a leading byte for "official" single byte encodings? */
115+
#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)
94116

95117
/*
96-
*Official multibyte byte encodings (0x90-0x99)
118+
*Charset IDs for official multibyte encodings (0x90-0x99)
97119
* 0x9a-0x9d are free. 0x9e and 0x9f are reserved.
98120
*/
99121
#defineLC_JISX0208_19780x90/* Japanese Kanji, old JIS (not supported) */
@@ -108,45 +130,70 @@ typedef unsigned int pg_wchar;
108130
#defineLC_BIG5_10x98/* Plane 1 Chinese traditional (not supported) */
109131
#define LC_BIG5_2 0x99 /* Plane 2 Chinese traditional (not supported) */
110132

133+
/* Is a leading byte for "official" multibyte encodings? */
134+
#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)
135+
111136
/*
112-
* Private single byte encodings (0xa0-0xef)
137+
* Postgres-specific prefix bytes for "private" single byte encodings
138+
* (According to the MULE docs, we should be using 0x9e for this)
113139
*/
114-
#defineLC_SISHENG0xa0/* Chinese SiSheng characters for
115-
* PinYin/ZhuYin (not supported) */
116-
#defineLC_IPA0xa1/* IPA (International Phonetic Association)
117-
* (not supported) */
118-
#defineLC_VISCII_LOWER0xa2/* Vietnamese VISCII1.1 lower-case (not
119-
* supported) */
120-
#defineLC_VISCII_UPPER0xa3/* Vietnamese VISCII1.1 upper-case (not
121-
* supported) */
140+
#define LCPRV1_A 0x9a
141+
#define LCPRV1_B 0x9b
142+
#define IS_LCPRV1(c) ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B)
143+
144+
/*
145+
* Postgres-specific prefix bytes for "private" multibyte encodings
146+
* (According to the MULE docs, we should be using 0x9f for this)
147+
*/
148+
#define LCPRV2_A 0x9c
149+
#define LCPRV2_B 0x9d
150+
#define IS_LCPRV2(c) ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B)
151+
152+
/*
153+
* Charset IDs for private single byte encodings (0xa0-0xef)
154+
*/
155+
#defineLC_SISHENG0xa0/* Chinese SiSheng characters for
156+
* PinYin/ZhuYin (not supported) */
157+
#defineLC_IPA0xa1/* IPA (International Phonetic Association)
158+
* (not supported) */
159+
#defineLC_VISCII_LOWER0xa2/* Vietnamese VISCII1.1 lower-case (not
160+
* supported) */
161+
#defineLC_VISCII_UPPER0xa3/* Vietnamese VISCII1.1 upper-case (not
162+
* supported) */
122163
#defineLC_ARABIC_DIGIT0xa4/* Arabic digit (not supported) */
123164
#defineLC_ARABIC_1_COLUMN0xa5/* Arabic 1-column (not supported) */
124165
#defineLC_ASCII_RIGHT_TO_LEFT0xa6/* ASCII (left half of ISO8859-1) with
125166
* right-to-left direction (not
126167
* supported) */
127-
#defineLC_LAO0xa7/* Lao characters (ISO10646 0E80..0EDF) (not
128-
* supported) */
168+
#defineLC_LAO0xa7/* Lao characters (ISO10646 0E80..0EDF)
169+
* (not supported) */
129170
#define LC_ARABIC_2_COLUMN 0xa8 /* Arabic 2-column (not supported) */
130171

131172
/*
132-
*Private multibyte encodings (0xf0-0xff)
173+
*Charset IDs for private multibyte encodings (0xf0-0xff)
133174
*/
134-
#defineLC_INDIAN_1_COLUMN0xf0/* Indian charset for 1-column width glypps
135-
* (not supported) */
136-
#defineLC_TIBETAN_1_COLUMN 0xf1/* Tibetan 1 column glyph (not supported) */
175+
#defineLC_INDIAN_1_COLUMN0xf0/* Indian charset for 1-column width glyphs
176+
* (not supported) */
177+
#defineLC_TIBETAN_1_COLUMN 0xf1/* Tibetan 1-column width glyphs
178+
* (not supported) */
137179
#defineLC_ETHIOPIC0xf5/* Ethiopic characters (not supported) */
138180
#defineLC_CNS11643_30xf6/* CNS 11643-1992 Plane 3 */
139181
#defineLC_CNS11643_40xf7/* CNS 11643-1992 Plane 4 */
140182
#defineLC_CNS11643_50xf8/* CNS 11643-1992 Plane 5 */
141183
#defineLC_CNS11643_60xf9/* CNS 11643-1992 Plane 6 */
142184
#defineLC_CNS11643_70xfa/* CNS 11643-1992 Plane 7 */
143-
#defineLC_INDIAN_2_COLUMN0xfb/* Indian charset for 2-column widthglypps
144-
* (not supported) */
185+
#define LC_INDIAN_2_COLUMN 0xfb /* Indian charset for 2-column width glyphs
186+
* (not supported) */
145187
#defineLC_TIBETAN0xfc/* Tibetan (not supported) */
146188
/* #define FREE0xfdfree (unused) */
147189
/* #define FREE0xfefree (unused) */
148190
/* #define FREE0xfffree (unused) */
149191

192+
/*----------------------------------------------------
193+
* end of MULE stuff
194+
*----------------------------------------------------
195+
*/
196+
150197
/*
151198
* PostgreSQL encoding identifiers
152199
*

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp