@@ -36,36 +36,60 @@ typedef unsigned int pg_wchar;
3636#define ISSJISHEAD (c ) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
3737#define ISSJISTAIL (c ) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
3838
39- /*
40- * Leading byte types or leading prefix byte for MULE internal code.
41- * See http://www.xemacs.org for more details.(there is a doc titled
42- * "XEmacs Internals Manual", "MULE Character Sets and Encodings"
43- * section.)
44- */
45- /*
46- * Is a leading byte for "official" single byte encodings?
47- */
48- #define IS_LC1 (c )((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)
49- /*
50- * Is a prefix byte for "private" single byte encodings?
51- */
52- #define IS_LCPRV1 (c )((unsigned char)(c) == 0x9a || (unsigned char)(c) == 0x9b)
53- /*
54- * Is a leading byte for "official" multibyte encodings?
55- */
56- #define IS_LC2 (c )((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)
57- /*
58- * Is a prefix byte for "private" multibyte encodings?
59- */
60- #define IS_LCPRV2 (c )((unsigned char)(c) == 0x9c || (unsigned char)(c) == 0x9d)
61-
6239/*----------------------------------------------------
63- * leading characters
40+ * MULE Internal Encoding (MIC)
41+ *
42+ * This encoding follows the design used within XEmacs; it is meant to
43+ * subsume many externally-defined character sets. Each character includes
44+ * identification of the character set it belongs to, so the encoding is
45+ * general but somewhat bulky.
46+ *
47+ * Currently PostgreSQL supports 5 types of MULE character sets:
48+ *
49+ * 1) 1-byte ASCII characters. Each byte is below 0x80.
50+ *
51+ * 2) "Official" single byte charsets such as ISO-8859-1 (Latin1).
52+ * Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is
53+ * an identifier for the charset (in the range 0x81 to 0x8d) and C1
54+ * is the character code (in the range 0xa0 to 0xff).
55+ *
56+ * 3) "Private" single byte charsets such as SISHENG. Each MULE
57+ * character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1
58+ * is a private-charset flag, LC12 is an identifier for the charset,
59+ * and C1 is the character code (in the range 0xa0 to 0xff).
60+ * LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf)
61+ * or 0x9b (if LC12 is in the range 0xe0 to 0xef).
62+ *
63+ * 4) "Official" multibyte charsets such as JIS X0208. Each MULE
64+ * character consists of 3 bytes: LC2 + C1 + C2, where LC2 is
65+ * an identifier for the charset (in the range 0x90 to 0x99) and C1
66+ * and C2 form the character code (each in the range 0xa0 to 0xff).
67+ *
68+ * 5) "Private" multibyte charsets such as CNS 11643-1992 Plane 3.
69+ * Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2,
70+ * where LCPRV2 is a private-charset flag, LC22 is an identifier for
71+ * the charset, and C1 and C2 form the character code (each in the range
72+ * 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0
73+ * to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe).
74+ *
75+ * "Official" encodings are those that have been assigned code numbers by
76+ * the XEmacs project; "private" encodings have Postgres-specific charset
77+ * identifiers.
78+ *
79+ * See the "XEmacs Internals Manual", available at http://www.xemacs.org,
80+ * for more details. Note that for historical reasons, Postgres'
81+ * private-charset flag values do not match what XEmacs says they should be,
82+ * so this isn't really exactly MULE (not that private charsets would be
83+ * interoperable anyway).
6484 *----------------------------------------------------
6585 */
6686
6787/*
68- * Official single byte encodings (0x81-0x8e)
88+ * Charset identifiers (also called "leading bytes" in the MULE documentation)
89+ */
90+
91+ /*
92+ * Charset IDs for official single byte encodings (0x81-0x8e)
6993 */
7094#define LC_ISO8859_1 0x81/* ISO8859 Latin 1 */
7195#define LC_ISO8859_2 0x82/* ISO8859 Latin 2 */
@@ -79,21 +103,19 @@ typedef unsigned int pg_wchar;
79103#define LC_JISX0201R 0x8a/* Japanese 1 byte Roman */
80104/* Note that 0x8b seems to be unused as of Emacs 20.7.
81105 * However, there might be a chance that 0x8b could be used
82- * in laterversion of Emacs.
106+ * in later versions of Emacs.
83107 */
84108#define LC_KOI8_R 0x8b/* Cyrillic KOI8-R */
85- #define LC_KOI8_U 0x8b/* Cyrillic KOI8-U */
86109#define LC_ISO8859_5 0x8c/* ISO8859 Cyrillic */
87110#define LC_ISO8859_9 0x8d/* ISO8859 Latin 5 (not supported yet) */
88111/* #define FREE 0x8e free (unused) */
112+ /* #define CONTROL_1 0x8f control characters (unused) */
89113
90- /*
91- * Unused
92- */
93- #define CONTROL_1 0x8f/* control characters (unused) */
114+ /* Is a leading byte for "official" single byte encodings? */
115+ #define IS_LC1 (c )((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)
94116
95117/*
96- *Official multibyte byte encodings (0x90-0x99)
118+ *Charset IDs for official multibyte encodings (0x90-0x99)
97119 * 0x9a-0x9d are free. 0x9e and 0x9f are reserved.
98120 */
99121#define LC_JISX0208_1978 0x90/* Japanese Kanji, old JIS (not supported) */
@@ -108,45 +130,70 @@ typedef unsigned int pg_wchar;
108130#define LC_BIG5_1 0x98/* Plane 1 Chinese traditional (not supported) */
109131#define LC_BIG5_2 0x99/* Plane 1 Chinese traditional (not supported) */
110132
133+ /* Is a leading byte for "official" multibyte encodings? */
134+ #define IS_LC2 (c )((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)
135+
111136/*
112- * Private single byte encodings (0xa0-0xef)
137+ * Postgres-specific prefix bytes for "private" single byte encodings
138+ * (According to the MULE docs, we should be using 0x9e for this)
113139 */
114- #define LC_SISHENG 0xa0/* Chinese SiSheng characters for
115- * PinYin/ZhuYin (not supported) */
116- #define LC_IPA 0xa1/* IPA (International Phonetic Association)
117- * (not supported) */
118- #define LC_VISCII_LOWER 0xa2/* Vietnamese VISCII1.1 lower-case (not
119- * supported) */
120- #define LC_VISCII_UPPER 0xa3/* Vietnamese VISCII1.1 upper-case (not
121- * supported) */
140+ #define LCPRV1_A 0x9a
141+ #define LCPRV1_B 0x9b
142+ #define IS_LCPRV1 (c )((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B)
143+
144+ /*
145+ * Postgres-specific prefix bytes for "private" multibyte encodings
146+ * (According to the MULE docs, we should be using 0x9f for this)
147+ */
148+ #define LCPRV2_A 0x9c
149+ #define LCPRV2_B 0x9d
150+ #define IS_LCPRV2 (c )((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B)
151+
152+ /*
153+ * Charset IDs for private single byte encodings (0xa0-0xef)
154+ */
155+ #define LC_SISHENG 0xa0/* Chinese SiSheng characters for
156+ * PinYin/ZhuYin (not supported) */
157+ #define LC_IPA 0xa1/* IPA (International Phonetic Association)
158+ * (not supported) */
159+ #define LC_VISCII_LOWER 0xa2/* Vietnamese VISCII1.1 lower-case (not
160+ * supported) */
161+ #define LC_VISCII_UPPER 0xa3/* Vietnamese VISCII1.1 upper-case (not
162+ * supported) */
122163#define LC_ARABIC_DIGIT 0xa4/* Arabic digit (not supported) */
123164#define LC_ARABIC_1_COLUMN 0xa5/* Arabic 1-column (not supported) */
124165#define LC_ASCII_RIGHT_TO_LEFT 0xa6/* ASCII (left half of ISO8859-1) with
125166 * right-to-left direction (not
126167 * supported) */
127- #define LC_LAO 0xa7/* Lao characters (ISO10646 0E80..0EDF) (not
128- * supported) */
168+ #define LC_LAO 0xa7/* Lao characters (ISO10646 0E80..0EDF)
169+ * (not supported) */
129170#define LC_ARABIC_2_COLUMN 0xa8/* Arabic 2-column (not supported) */
130171
131172/*
132- *Private multibyte encodings (0xf0-0xff)
173+ *Charset IDs for private multibyte encodings (0xf0-0xff)
133174 */
134- #define LC_INDIAN_1_COLUMN 0xf0/* Indian charset for 1-column width glypps
135- * (not supported) */
136- #define LC_TIBETAN_1_COLUMN 0xf1/* Tibetan 1 column glyph (not supported) */
175+ #define LC_INDIAN_1_COLUMN 0xf0/* Indian charset for 1-column width glyphs
176+ * (not supported) */
177+ #define LC_TIBETAN_1_COLUMN 0xf1/* Tibetan 1-column width glyphs
178+ * (not supported) */
137179#define LC_ETHIOPIC 0xf5/* Ethiopic characters (not supported) */
138180#define LC_CNS11643_3 0xf6/* CNS 11643-1992 Plane 3 */
139181#define LC_CNS11643_4 0xf7/* CNS 11643-1992 Plane 4 */
140182#define LC_CNS11643_5 0xf8/* CNS 11643-1992 Plane 5 */
141183#define LC_CNS11643_6 0xf9/* CNS 11643-1992 Plane 6 */
142184#define LC_CNS11643_7 0xfa/* CNS 11643-1992 Plane 7 */
143- #define LC_INDIAN_2_COLUMN 0xfb/* Indian charset for 2-column widthglypps
144- * (not supported) */
185+ #define LC_INDIAN_2_COLUMN 0xfb/* Indian charset for 2-column width glyphs
186+ * (not supported) */
145187#define LC_TIBETAN 0xfc/* Tibetan (not supported) */
146188/* #define FREE 0xfd free (unused) */
147189/* #define FREE 0xfe free (unused) */
148190/* #define FREE 0xff free (unused) */
149191
192+ /*----------------------------------------------------
193+ * end of MULE stuff
194+ *----------------------------------------------------
195+ */
196+
150197/*
151198 * PostgreSQL encoding identifiers
152199 *