11/*
22 * conversion functions between pg_wchar and multibyte streams.
33 * Tatsuo Ishii
4- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.52 2005/12/26 19:30:44 momjian Exp $
4+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.53 2006/02/10 00:39:04 momjian Exp $
55 *
66 * WIN1250 client encoding updated by Pavel Behal
77 *
2323 * for the particular encoding. Note that if the encoding is only
2424 * supported in the client, you don't need to define
2525 * mb2wchar_with_len() function (SJIS is the case).
26+ *
27+ * Note: for the display output of psql to work properly, the return values
28+ * of these functions must conform to the Unicode standard. In particular
29+ * the NUL character is zero width and control characters are generally
30+ * width -1. It is recommended that non-ASCII encodings refer their ASCII
31+ * subset to the ASCII routines to ensure consistency.
32+ *
2633 */
2734
2835/*
@@ -53,6 +60,11 @@ pg_ascii_mblen(const unsigned char *s)
/*
 * Display width of a single ASCII byte.
 *
 * s points at the byte to examine; only *s is read.
 *
 * Returns 0 for NUL, -1 for the other C0 control characters (below 0x20)
 * and for DEL (0x7f), and 1 for every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	if (*s == '\0')
		return 0;
	if (*s < 0x20 || *s == 0x7f)
		return -1;

	return 1;
}
5870
@@ -125,7 +137,7 @@ pg_euc_dsplen(const unsigned char *s)
125137else if (IS_HIGHBIT_SET (* s ))
126138len = 2 ;
127139else
128- len = 1 ;
140+ len = pg_ascii_dsplen ( s ) ;
129141return len ;
130142}
131143
@@ -156,7 +168,7 @@ pg_eucjp_dsplen(const unsigned char *s)
156168else if (IS_HIGHBIT_SET (* s ))
157169len = 2 ;
158170else
159- len = 1 ;
171+ len = pg_ascii_dsplen ( s ) ;
160172return len ;
161173}
162174
@@ -244,7 +256,7 @@ pg_euccn_dsplen(const unsigned char *s)
244256if (IS_HIGHBIT_SET (* s ))
245257len = 2 ;
246258else
247- len = 1 ;
259+ len = pg_ascii_dsplen ( s ) ;
248260return len ;
249261}
250262
@@ -304,7 +316,7 @@ pg_euctw_mblen(const unsigned char *s)
304316else if (IS_HIGHBIT_SET (* s ))
305317len = 2 ;
306318else
307- len = 1 ;
319+ len = pg_ascii_dsplen ( s ) ;
308320return len ;
309321}
310322
@@ -320,7 +332,7 @@ pg_euctw_dsplen(const unsigned char *s)
320332else if (IS_HIGHBIT_SET (* s ))
321333len = 2 ;
322334else
323- len = 1 ;
335+ len = pg_ascii_dsplen ( s ) ;
324336return len ;
325337}
326338
@@ -419,10 +431,179 @@ pg_utf_mblen(const unsigned char *s)
419431return len ;
420432}
421433
434+ /*
435+ * This is an implementation of wcwidth() and wcswidth() as defined in
436+ * "The Single UNIX Specification, Version 2, The Open Group, 1997"
437+ * <http://www.UNIX-systems.org/online.html>
438+ *
439+ * Markus Kuhn -- 2001-09-08 -- public domain
440+ *
441+ * customised for PostgreSQL
442+ *
443+ * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
444+ */
445+
/*
 * One closed interval [first, last] of code points, used as an element of
 * the sorted lookup tables searched by mbbisearch().
 *
 * The bounds are unsigned int rather than unsigned short so that a table
 * can also describe ranges above U+FFFF (the width code accepts code points
 * up to 0x10FFFF).
 */
struct mbinterval
{
	unsigned int first;			/* lowest code point in the interval */
	unsigned int last;			/* highest code point in the interval */
};
451+
452+ /* auxiliary function for binary search in interval table */
453+ static int
454+ mbbisearch (pg_wchar ucs ,const struct mbinterval * table ,int max )
455+ {
456+ int min = 0 ;
457+ int mid ;
458+
459+ if (ucs < table [0 ].first || ucs > table [max ].last )
460+ return 0 ;
461+ while (max >=min )
462+ {
463+ mid = (min + max ) /2 ;
464+ if (ucs > table [mid ].last )
465+ min = mid + 1 ;
466+ else if (ucs < table [mid ].first )
467+ max = mid - 1 ;
468+ else
469+ return 1 ;
470+ }
471+
472+ return 0 ;
473+ }
474+
475+
476+ /* The following functions define the column width of an ISO 10646
477+ * character as follows:
478+ *
479+ * - The null character (U+0000) has a column width of 0.
480+ *
481+ * - Other C0/C1 control characters and DEL will lead to a return
482+ *value of -1.
483+ *
484+ * - Non-spacing and enclosing combining characters (general
485+ *category code Mn or Me in the Unicode database) have a
486+ *column width of 0.
487+ *
488+ * - Other format characters (general category code Cf in the Unicode
489+ *database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
490+ *
491+ * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
492+ *have a column width of 0.
493+ *
494+ * - Spacing characters in the East Asian Wide (W) or East Asian
495+ *FullWidth (F) category as defined in Unicode Technical
496+ *Report #11 have a column width of 2.
497+ *
498+ * - All remaining characters (including all printable
499+ *ISO 8859-1 and WGL4 characters, Unicode control characters,
500+ *etc.) have a column width of 1.
501+ *
502+ * This implementation assumes that wchar_t characters are encoded
503+ * in ISO 10646.
504+ */
505+
506+ static int
507+ ucs_wcwidth (pg_wchar ucs )
508+ {
509+ /* sorted list of non-overlapping intervals of non-spacing characters */
510+ static const struct mbinterval combining []= {
511+ {0x0300 ,0x034E }, {0x0360 ,0x0362 }, {0x0483 ,0x0486 },
512+ {0x0488 ,0x0489 }, {0x0591 ,0x05A1 }, {0x05A3 ,0x05B9 },
513+ {0x05BB ,0x05BD }, {0x05BF ,0x05BF }, {0x05C1 ,0x05C2 },
514+ {0x05C4 ,0x05C4 }, {0x064B ,0x0655 }, {0x0670 ,0x0670 },
515+ {0x06D6 ,0x06E4 }, {0x06E7 ,0x06E8 }, {0x06EA ,0x06ED },
516+ {0x070F ,0x070F }, {0x0711 ,0x0711 }, {0x0730 ,0x074A },
517+ {0x07A6 ,0x07B0 }, {0x0901 ,0x0902 }, {0x093C ,0x093C },
518+ {0x0941 ,0x0948 }, {0x094D ,0x094D }, {0x0951 ,0x0954 },
519+ {0x0962 ,0x0963 }, {0x0981 ,0x0981 }, {0x09BC ,0x09BC },
520+ {0x09C1 ,0x09C4 }, {0x09CD ,0x09CD }, {0x09E2 ,0x09E3 },
521+ {0x0A02 ,0x0A02 }, {0x0A3C ,0x0A3C }, {0x0A41 ,0x0A42 },
522+ {0x0A47 ,0x0A48 }, {0x0A4B ,0x0A4D }, {0x0A70 ,0x0A71 },
523+ {0x0A81 ,0x0A82 }, {0x0ABC ,0x0ABC }, {0x0AC1 ,0x0AC5 },
524+ {0x0AC7 ,0x0AC8 }, {0x0ACD ,0x0ACD }, {0x0B01 ,0x0B01 },
525+ {0x0B3C ,0x0B3C }, {0x0B3F ,0x0B3F }, {0x0B41 ,0x0B43 },
526+ {0x0B4D ,0x0B4D }, {0x0B56 ,0x0B56 }, {0x0B82 ,0x0B82 },
527+ {0x0BC0 ,0x0BC0 }, {0x0BCD ,0x0BCD }, {0x0C3E ,0x0C40 },
528+ {0x0C46 ,0x0C48 }, {0x0C4A ,0x0C4D }, {0x0C55 ,0x0C56 },
529+ {0x0CBF ,0x0CBF }, {0x0CC6 ,0x0CC6 }, {0x0CCC ,0x0CCD },
530+ {0x0D41 ,0x0D43 }, {0x0D4D ,0x0D4D }, {0x0DCA ,0x0DCA },
531+ {0x0DD2 ,0x0DD4 }, {0x0DD6 ,0x0DD6 }, {0x0E31 ,0x0E31 },
532+ {0x0E34 ,0x0E3A }, {0x0E47 ,0x0E4E }, {0x0EB1 ,0x0EB1 },
533+ {0x0EB4 ,0x0EB9 }, {0x0EBB ,0x0EBC }, {0x0EC8 ,0x0ECD },
534+ {0x0F18 ,0x0F19 }, {0x0F35 ,0x0F35 }, {0x0F37 ,0x0F37 },
535+ {0x0F39 ,0x0F39 }, {0x0F71 ,0x0F7E }, {0x0F80 ,0x0F84 },
536+ {0x0F86 ,0x0F87 }, {0x0F90 ,0x0F97 }, {0x0F99 ,0x0FBC },
537+ {0x0FC6 ,0x0FC6 }, {0x102D ,0x1030 }, {0x1032 ,0x1032 },
538+ {0x1036 ,0x1037 }, {0x1039 ,0x1039 }, {0x1058 ,0x1059 },
539+ {0x1160 ,0x11FF }, {0x17B7 ,0x17BD }, {0x17C6 ,0x17C6 },
540+ {0x17C9 ,0x17D3 }, {0x180B ,0x180E }, {0x18A9 ,0x18A9 },
541+ {0x200B ,0x200F }, {0x202A ,0x202E }, {0x206A ,0x206F },
542+ {0x20D0 ,0x20E3 }, {0x302A ,0x302F }, {0x3099 ,0x309A },
543+ {0xFB1E ,0xFB1E }, {0xFE20 ,0xFE23 }, {0xFEFF ,0xFEFF },
544+ {0xFFF9 ,0xFFFB }
545+ };
546+
547+ /* test for 8-bit control characters */
548+ if (ucs == 0 )
549+ return 0 ;
550+
551+ if (ucs < 0x20 || (ucs >=0x7f && ucs < 0xa0 )|| ucs > 0x0010ffff )
552+ return -1 ;
553+
554+ /* binary search in table of non-spacing characters */
555+ if (mbbisearch (ucs ,combining ,
556+ sizeof (combining ) /sizeof (struct mbinterval )- 1 ))
557+ return 0 ;
558+
559+ /*
560+ * if we arrive here, ucs is not a combining or C0/C1 control character
561+ */
562+
563+ return 1 +
564+ (ucs >=0x1100 &&
565+ (ucs <=0x115f || /* Hangul Jamo init. consonants */
566+ (ucs >=0x2e80 && ucs <=0xa4cf && (ucs & ~0x0011 )!= 0x300a &&
567+ ucs != 0x303f )|| /* CJK ... Yi */
568+ (ucs >=0xac00 && ucs <=0xd7a3 )|| /* Hangul Syllables */
569+ (ucs >=0xf900 && ucs <=0xfaff )|| /* CJK Compatibility
570+ * Ideographs */
571+ (ucs >=0xfe30 && ucs <=0xfe6f )|| /* CJK Compatibility Forms */
572+ (ucs >=0xff00 && ucs <=0xff5f )|| /* Fullwidth Forms */
573+ (ucs >=0xffe0 && ucs <=0xffe6 )||
574+ (ucs >=0x20000 && ucs <=0x2ffff )));
575+ }
576+
577+ static pg_wchar
578+ utf2ucs (const unsignedchar * c )
579+ {
580+ /*
581+ * one char version of pg_utf2wchar_with_len. no control here, c must
582+ * point to a large enough string
583+ */
584+ if ((* c & 0x80 )== 0 )
585+ return (pg_wchar )c [0 ];
586+ else if ((* c & 0xe0 )== 0xc0 )
587+ return (pg_wchar ) (((c [0 ]& 0x1f ) <<6 ) |
588+ (c [1 ]& 0x3f ));
589+ else if ((* c & 0xf0 )== 0xe0 )
590+ return (pg_wchar ) (((c [0 ]& 0x0f ) <<12 ) |
591+ ((c [1 ]& 0x3f ) <<6 ) |
592+ (c [2 ]& 0x3f ));
593+ else if ((* c & 0xf0 )== 0xf0 )
594+ return (pg_wchar ) (((c [0 ]& 0x07 ) <<18 ) |
595+ ((c [1 ]& 0x3f ) <<12 ) |
596+ ((c [2 ]& 0x3f ) <<6 ) |
597+ (c [3 ]& 0x3f ));
598+ else
599+ /* that is an invalid code on purpose */
600+ return 0xffffffff ;
601+ }
602+
/*
 * Display width of the UTF-8 character starting at s.
 *
 * The sequence is decoded with utf2ucs() and the resulting value is passed
 * to ucs_wcwidth(), whose result is returned unchanged.  s must point to a
 * complete sequence; no length checking is done here.
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
	return ucs_wcwidth(utf2ucs(s));
}
427608
428609/*
@@ -499,7 +680,7 @@ pg_mule_mblen(const unsigned char *s)
/*
 * Display width of the character starting at s.
 *
 * Only the first byte is examined, using the ASCII rules of
 * pg_ascii_dsplen(); the result is therefore not yet correct for
 * multibyte characters (hence the XXX below).
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);	/* XXX fix me! */
}
504685
505686/*
@@ -529,7 +710,7 @@ pg_latin1_mblen(const unsigned char *s)
/*
 * Display width of the single-byte character at s.
 *
 * Delegates to pg_ascii_dsplen() so that NUL and control characters are
 * reported the same way as in the ASCII routines.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);
}
534715
535716/*
@@ -559,7 +740,7 @@ pg_sjis_dsplen(const unsigned char *s)
559740else if (IS_HIGHBIT_SET (* s ))
560741len = 2 ;/* kanji? */
561742else
562- len = 1 ;/* should be ASCII */
743+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
563744return len ;
564745}
565746
@@ -586,7 +767,7 @@ pg_big5_dsplen(const unsigned char *s)
586767if (IS_HIGHBIT_SET (* s ))
587768len = 2 ;/* kanji? */
588769else
589- len = 1 ;/* should be ASCII */
770+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
590771return len ;
591772}
592773
@@ -613,7 +794,7 @@ pg_gbk_dsplen(const unsigned char *s)
613794if (IS_HIGHBIT_SET (* s ))
614795len = 2 ;/* kanji? */
615796else
616- len = 1 ;/* should be ASCII */
797+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
617798return len ;
618799}
619800
@@ -640,7 +821,7 @@ pg_uhc_dsplen(const unsigned char *s)
640821if (IS_HIGHBIT_SET (* s ))
641822len = 2 ;/* 2byte? */
642823else
643- len = 1 ;/* should be ASCII */
824+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
644825return len ;
645826}
646827
@@ -672,10 +853,10 @@ pg_gb18030_dsplen(const unsigned char *s)
672853{
673854int len ;
674855
675- if (!IS_HIGHBIT_SET (* s ))
676- len = 1 ;/* ASCII */
677- else
856+ if (IS_HIGHBIT_SET (* s ))
678857len = 2 ;
858+ else
859+ len = pg_ascii_dsplen (s );/* ASCII */
679860return len ;
680861}
681862