NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit09022de

committed

Improve documentation about MULE encoding.

This commit improves the comments in pg_wchar.h and creates #define symbols for some formerly hard-coded values. No substantive code changes. Tatsuo Ishii and Tom Lane

1 parent47a2adc commit09022deCopy full SHA for 09022de

File tree

3 files changed

+110

-58

lines changed

src
- backend/utils/mb
  - conversion_procs/euc_tw_and_big5
    - euc_tw_and_big5.c
  - wchar.c
- include/mb
  - pg_wchar.h

3 files changed

+110

-58

lines changed

`‎src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c‎`

Lines changed: 8 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -168,7 +168,8 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)`
`168`	`168`	`*p++=LC_CNS11643_2;`
`169`	`169`	`else`
`170`	`170`	`{`
`171`		`-*p++=0x9d;/* LCPRV2 */`
	`171`	`+/* other planes are MULE private charsets */`
	`172`	`+*p++=LCPRV2_B;`
`172`	`173`	`*p++=c1-0xa3+LC_CNS11643_3;`
`173`	`174`	`}`
`174`	`175`	`*p++=euc[2];`
`@@ -235,9 +236,9 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)`
`235`	`236`	`*p++=mic[1];`
`236`	`237`	`*p++=mic[2];`
`237`	`238`	`}`
`238`		`-elseif (c1==0x9d&&`
	`239`	`+elseif (c1==LCPRV2_B&&`
`239`	`240`	`mic[1] >=LC_CNS11643_3&&mic[1] <=LC_CNS11643_7)`
`240`		`-{/* LCPRV2? */`
	`241`	`+{`
`241`	`242`	`*p++=SS2;`
`242`	`243`	`*p++=mic[1]-LC_CNS11643_3+0xa3;`
`243`	`244`	`*p++=mic[2];`
`@@ -286,10 +287,9 @@ big52mic(const unsigned char *big5, unsigned char *p, int len)`
`286`	`287`	`cnsBuf=BIG5toCNS(big5buf,&lc);`
`287`	`288`	`if (lc!=0)`
`288`	`289`	`{`
	`290`	`+/* Planes 3 and 4 are MULE private charsets */`
`289`	`291`	`if (lc==LC_CNS11643_3\|\|lc==LC_CNS11643_4)`
`290`		`-{`
`291`		`-*p++=0x9d;/* LCPRV2 */`
`292`		`-}`
	`292`	`+*p++=LCPRV2_B;`
`293`	`293`	`*p++=lc;/* Plane No. */`
`294`	`294`	`*p++= (cnsBuf >>8)&0x00ff;`
`295`	`295`	`*p++=cnsBuf&0x00ff;`
`@@ -332,10 +332,9 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len)`
`332`	`332`	`if (l<0)`
`333`	`333`	`report_invalid_encoding(PG_MULE_INTERNAL,`
`334`	`334`	`(constchar*)mic,len);`
`335`		`-/* 0x9d means LCPRV2 */`
`336`		`-if (c1==LC_CNS11643_1\|\|c1==LC_CNS11643_2\|\|c1==0x9d)`
	`335`	`+if (c1==LC_CNS11643_1\|\|c1==LC_CNS11643_2\|\|c1==LCPRV2_B)`
`337`	`336`	`{`
`338`		`-if (c1==0x9d)`
	`337`	`+if (c1==LCPRV2_B)`
`339`	`338`	`{`
`340`	`339`	`c1=mic[1];/* get plane no. */`
`341`	`340`	`cnsBuf= (mic[2] <<8) \|mic[3];`

`‎src/backend/utils/mb/wchar.c‎`

Lines changed: 6 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -742,6 +742,12 @@ pg_mule_dsplen(const unsigned char *s)`
`742`	`742`	`{`
`743`	`743`	`intlen;`
`744`	`744`
	`745`	`+/*`
	`746`	`+ * Note: it's not really appropriate to assume that all multibyte charsets`
	`747`	`+ * are double-wide on screen. But this seems an okay approximation for`
	`748`	`+ * the MULE charsets we currently support.`
	`749`	`+ */`
	`750`	`+`
`745`	`751`	`if (IS_LC1(*s))`
`746`	`752`	`len=1;`
`747`	`753`	`elseif (IS_LCPRV1(*s))`

`‎src/include/mb/pg_wchar.h‎`

Lines changed: 96 additions & 49 deletions

Original file line number	Diff line number	Diff line change
`@@ -36,36 +36,60 @@ typedef unsigned int pg_wchar;`
`36`	`36`	`#defineISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) \|\| ((c) >= 0xe0 && (c) <= 0xfc))`
`37`	`37`	`#defineISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) \|\| ((c) >= 0x80 && (c) <= 0xfc))`
`38`	`38`
`39`		`-/*`
`40`		`- * Leading byte types or leading prefix byte for MULE internal code.`
`41`		`- * See http://www.xemacs.org for more details.(there is a doc titled`
`42`		`- * "XEmacs Internals Manual", "MULE Character Sets and Encodings"`
`43`		`- * section.)`
`44`		`- */`
`45`		`-/*`
`46`		`- * Is a leading byte for "official" single byte encodings?`
`47`		`- */`
`48`		`-#defineIS_LC1(c)((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)`
`49`		`-/*`
`50`		`- * Is a prefix byte for "private" single byte encodings?`
`51`		`- */`
`52`		`-#defineIS_LCPRV1(c)((unsigned char)(c) == 0x9a \|\| (unsigned char)(c) == 0x9b)`
`53`		`-/*`
`54`		`- * Is a leading byte for "official" multibyte encodings?`
`55`		`- */`
`56`		`-#defineIS_LC2(c)((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)`
`57`		`-/*`
`58`		`- * Is a prefix byte for "private" multibyte encodings?`
`59`		`- */`
`60`		`-#defineIS_LCPRV2(c)((unsigned char)(c) == 0x9c \|\| (unsigned char)(c) == 0x9d)`
`61`		`-`
`62`	`39`	`/*----------------------------------------------------`
`63`		`- * leading characters`
	`40`	`+ * MULE Internal Encoding (MIC)`
	`41`	`+ *`
	`42`	`+ * This encoding follows the design used within XEmacs; it is meant to`
	`43`	`+ * subsume many externally-defined character sets. Each character includes`
	`44`	`+ * identification of the character set it belongs to, so the encoding is`
	`45`	`+ * general but somewhat bulky.`
	`46`	`+ *`
	`47`	`+ * Currently PostgreSQL supports 5 types of MULE character sets:`
	`48`	`+ *`
	`49`	`+ * 1) 1-byte ASCII characters. Each byte is below 0x80.`
	`50`	`+ *`
	`51`	`+ * 2) "Official" single byte charsets such as ISO-8859-1 (Latin1).`
	`52`	`+ * Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is`
	`53`	`+ * an identifier for the charset (in the range 0x81 to 0x8d) and C1`
	`54`	`+ * is the character code (in the range 0xa0 to 0xff).`
	`55`	`+ *`
	`56`	`+ * 3) "Private" single byte charsets such as SISHENG. Each MULE`
	`57`	`+ * character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1`
	`58`	`+ * is a private-charset flag, LC12 is an identifier for the charset,`
	`59`	`+ * and C1 is the character code (in the range 0xa0 to 0xff).`
	`60`	`+ * LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf)`
	`61`	`+ * or 0x9b (if LC12 is in the range 0xe0 to 0xef).`
	`62`	`+ *`
	`63`	`+ * 4) "Official" multibyte charsets such as JIS X0208. Each MULE`
	`64`	`+ * character consists of 3 bytes: LC2 + C1 + C2, where LC2 is`
	`65`	`+ * an identifier for the charset (in the range 0x90 to 0x99) and C1`
	`66`	`+ * and C2 form the character code (each in the range 0xa0 to 0xff).`
	`67`	`+ *`
	`68`	`+ * 5) "Private" multibyte charsets such as CNS 11643-1992 Plane 3.`
	`69`	`+ * Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2,`
	`70`	`+ * where LCPRV2 is a private-charset flag, LC22 is an identifier for`
	`71`	`+ * the charset, and C1 and C2 form the character code (each in the range`
	`72`	`+ * 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0`
	`73`	`+ * to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe).`
	`74`	`+ *`
	`75`	`+ * "Official" encodings are those that have been assigned code numbers by`
	`76`	`+ * the XEmacs project; "private" encodings have Postgres-specific charset`
	`77`	`+ * identifiers.`
	`78`	`+ *`
	`79`	`+ * See the "XEmacs Internals Manual", available at http://www.xemacs.org,`
	`80`	`+ * for more details. Note that for historical reasons, Postgres'`
	`81`	`+ * private-charset flag values do not match what XEmacs says they should be,`
	`82`	`+ * so this isn't really exactly MULE (not that private charsets would be`
	`83`	`+ * interoperable anyway).`
`64`	`84`	`*----------------------------------------------------`
`65`	`85`	`*/`
`66`	`86`
`67`	`87`	`/*`
`68`		`- * Official single byte encodings (0x81-0x8e)`
	`88`	`+ * Charset identifiers (also called "leading bytes" in the MULE documentation)`
	`89`	`+ */`
	`90`	`+`
	`91`	`+/*`
	`92`	`+ * Charset IDs for official single byte encodings (0x81-0x8e)`
`69`	`93`	`*/`
`70`	`94`	`#defineLC_ISO8859_10x81/* ISO8859 Latin 1 */`
`71`	`95`	`#defineLC_ISO8859_20x82/* ISO8859 Latin 2 */`
`@@ -79,21 +103,19 @@ typedef unsigned int pg_wchar;`
`79`	`103`	`#defineLC_JISX0201R0x8a/* Japanese 1 byte Roman */`
`80`	`104`	`/* Note that 0x8b seems to be unused as of Emacs 20.7.`
`81`	`105`	`* However, there might be a chance that 0x8b could be used`
`82`		`- * in later version of Emacs.`
	`106`	`+ * in later versions of Emacs.`
`83`	`107`	`*/`
`84`	`108`	`#defineLC_KOI8_R0x8b/* Cyrillic KOI8-R */`
`85`		`-#defineLC_KOI8_U0x8b/* Cyrillic KOI8-U */`
`86`	`109`	`#defineLC_ISO8859_50x8c/* ISO8859 Cyrillic */`
`87`	`110`	`#defineLC_ISO8859_90x8d/* ISO8859 Latin 5 (not supported yet) */`
`88`	`111`	`/* #define FREE0x8efree (unused) */`
	`112`	`+/* #define CONTROL_10x8fcontrol characters (unused) */`
`89`	`113`
`90`		`-/*`
`91`		`- * Unused`
`92`		`- */`
`93`		`-#defineCONTROL_10x8f/* control characters (unused) */`
	`114`	`+/* Is a leading byte for "official" single byte encodings? */`
	`115`	`+#defineIS_LC1(c)((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)`
`94`	`116`
`95`	`117`	`/*`
`96`		`- *Official multibyte byte encodings (0x90-0x99)`
	`118`	`+ *Charset IDs for official multibyte encodings (0x90-0x99)`
`97`	`119`	`* 0x9a-0x9d are free. 0x9e and 0x9f are reserved.`
`98`	`120`	`*/`
`99`	`121`	`#defineLC_JISX0208_19780x90/* Japanese Kanji, old JIS (not supported) */`
`@@ -108,45 +130,70 @@ typedef unsigned int pg_wchar;`
`108`	`130`	`#defineLC_BIG5_10x98/* Plane 1 Chinese traditional (not supported) */`
`109`	`131`	`#defineLC_BIG5_20x99/* Plane 1 Chinese traditional (not supported) */`
`110`	`132`
	`133`	`+/* Is a leading byte for "official" multibyte encodings? */`
	`134`	`+#defineIS_LC2(c)((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)`
	`135`	`+`
`111`	`136`	`/*`
`112`		`- * Private single byte encodings (0xa0-0xef)`
	`137`	`+ * Postgres-specific prefix bytes for "private" single byte encodings`
	`138`	`+ * (According to the MULE docs, we should be using 0x9e for this)`
`113`	`139`	`*/`
`114`		`-#defineLC_SISHENG0xa0/* Chinese SiSheng characters for`
`115`		`- * PinYin/ZhuYin (not supported) */`
`116`		`-#defineLC_IPA0xa1/* IPA (International Phonetic Association)`
`117`		`- * (not supported) */`
`118`		`-#defineLC_VISCII_LOWER0xa2/* Vietnamese VISCII1.1 lower-case (not`
`119`		`- * supported) */`
`120`		`-#defineLC_VISCII_UPPER0xa3/* Vietnamese VISCII1.1 upper-case (not`
`121`		`- * supported) */`
	`140`	`+#defineLCPRV1_A0x9a`
	`141`	`+#defineLCPRV1_B0x9b`
	`142`	`+#defineIS_LCPRV1(c)((unsigned char)(c) == LCPRV1_A \|\| (unsigned char)(c) == LCPRV1_B)`
	`143`	`+`
	`144`	`+/*`
	`145`	`+ * Postgres-specific prefix bytes for "private" multibyte encodings`
	`146`	`+ * (According to the MULE docs, we should be using 0x9f for this)`
	`147`	`+ */`
	`148`	`+#defineLCPRV2_A0x9c`
	`149`	`+#defineLCPRV2_B0x9d`
	`150`	`+#defineIS_LCPRV2(c)((unsigned char)(c) == LCPRV2_A \|\| (unsigned char)(c) == LCPRV2_B)`
	`151`	`+`
	`152`	`+/*`
	`153`	`+ * Charset IDs for private single byte encodings (0xa0-0xef)`
	`154`	`+ */`
	`155`	`+#defineLC_SISHENG0xa0/* Chinese SiSheng characters for`
	`156`	`+ * PinYin/ZhuYin (not supported) */`
	`157`	`+#defineLC_IPA0xa1/* IPA (International Phonetic Association)`
	`158`	`+ * (not supported) */`
	`159`	`+#defineLC_VISCII_LOWER0xa2/* Vietnamese VISCII1.1 lower-case (not`
	`160`	`+ * supported) */`
	`161`	`+#defineLC_VISCII_UPPER0xa3/* Vietnamese VISCII1.1 upper-case (not`
	`162`	`+ * supported) */`
`122`	`163`	`#defineLC_ARABIC_DIGIT0xa4/* Arabic digit (not supported) */`
`123`	`164`	`#defineLC_ARABIC_1_COLUMN0xa5/* Arabic 1-column (not supported) */`
`124`	`165`	`#defineLC_ASCII_RIGHT_TO_LEFT0xa6/* ASCII (left half of ISO8859-1) with`
`125`	`166`	`* right-to-left direction (not`
`126`	`167`	`* supported) */`
`127`		`-#defineLC_LAO0xa7/* Lao characters (ISO10646 0E80..0EDF) (not`
`128`		`- * supported) */`
	`168`	`+#defineLC_LAO0xa7/* Lao characters (ISO10646 0E80..0EDF)`
	`169`	`+ * (not supported) */`
`129`	`170`	`#defineLC_ARABIC_2_COLUMN0xa8/* Arabic 1-column (not supported) */`
`130`	`171`
`131`	`172`	`/*`
`132`		`- *Private multibyte encodings (0xf0-0xff)`
	`173`	`+ *Charset IDs for private multibyte encodings (0xf0-0xff)`
`133`	`174`	`*/`
`134`		`-#defineLC_INDIAN_1_COLUMN0xf0/* Indian charset for 1-column width glypps`
`135`		`- * (not supported) */`
`136`		`-#defineLC_TIBETAN_1_COLUMN 0xf1/* Tibetan 1 column glyph (not supported) */`
	`175`	`+#defineLC_INDIAN_1_COLUMN0xf0/* Indian charset for 1-column width glyphs`
	`176`	`+ * (not supported) */`
	`177`	`+#defineLC_TIBETAN_1_COLUMN 0xf1/* Tibetan 1-column width glyphs`
	`178`	`+ * (not supported) */`
`137`	`179`	`#defineLC_ETHIOPIC0xf5/* Ethiopic characters (not supported) */`
`138`	`180`	`#defineLC_CNS11643_30xf6/* CNS 11643-1992 Plane 3 */`
`139`	`181`	`#defineLC_CNS11643_40xf7/* CNS 11643-1992 Plane 4 */`
`140`	`182`	`#defineLC_CNS11643_50xf8/* CNS 11643-1992 Plane 5 */`
`141`	`183`	`#defineLC_CNS11643_60xf9/* CNS 11643-1992 Plane 6 */`
`142`	`184`	`#defineLC_CNS11643_70xfa/* CNS 11643-1992 Plane 7 */`
`143`		`-#defineLC_INDIAN_2_COLUMN0xfb/* Indian charset for 2-column width glypps`
`144`		`- * (not supported) */`
	`185`	`+#defineLC_INDIAN_2_COLUMN0xfb/* Indian charset for 2-column width glyphs`
	`186`	`+ * (not supported) */`
`145`	`187`	`#defineLC_TIBETAN0xfc/* Tibetan (not supported) */`
`146`	`188`	`/* #define FREE0xfdfree (unused) */`
`147`	`189`	`/* #define FREE0xfefree (unused) */`
`148`	`190`	`/* #define FREE0xfffree (unused) */`
`149`	`191`
	`192`	`+/*----------------------------------------------------`
	`193`	`+ * end of MULE stuff`
	`194`	`+ *----------------------------------------------------`
	`195`	`+ */`
	`196`	`+`
`150`	`197`	`/*`
`151`	`198`	`* PostgreSQL encoding identifiers`
`152`	`199`	`*`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit09022de

File tree

3 files changed

3 files changed

`‎src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c‎`

`‎src/backend/utils/mb/wchar.c‎`

`‎src/include/mb/pg_wchar.h‎`

0 commit comments