1
1
/*
2
2
* conversion functions between pg_wchar and multibyte streams.
3
3
* Tatsuo Ishii
4
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.52 2005/12/26 19:30:44 momjian Exp $
4
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.53 2006/02/10 00:39:04 momjian Exp $
5
5
*
6
6
* WIN1250 client encoding updated by Pavel Behal
7
7
*
23
23
* for the particular encoding. Note that if the encoding is only
24
24
* supported in the client, you don't need to define
25
25
* mb2wchar_with_len() function (SJIS is the case).
26
+ *
27
+ * Note: for the display output of psql to work properly, the return values
28
+ * of these functions must conform to the Unicode standard. In particular
29
+ * the NUL character is zero width and control characters are generally
30
+ * width -1. It is recommended that non-ASCII encodings refer their ASCII
31
+ * subset to the ASCII routines to ensure consistency.
32
+ *
26
33
*/
27
34
28
35
/*
@@ -53,6 +60,11 @@ pg_ascii_mblen(const unsigned char *s)
53
60
/*
 * Display width of the single ASCII byte at *s.
 *
 * Returns 0 for NUL, -1 for the other control bytes (anything below 0x20,
 * plus DEL 0x7f), and 1 for every remaining byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	if (*s == '\0')
		return 0;
	if (*s < 0x20 || *s == 0x7f)
		return -1;

	return 1;
}
58
70
@@ -125,7 +137,7 @@ pg_euc_dsplen(const unsigned char *s)
125
137
else if (IS_HIGHBIT_SET (* s ))
126
138
len = 2 ;
127
139
else
128
- len = 1 ;
140
+ len = pg_ascii_dsplen ( s ) ;
129
141
return len ;
130
142
}
131
143
@@ -156,7 +168,7 @@ pg_eucjp_dsplen(const unsigned char *s)
156
168
else if (IS_HIGHBIT_SET (* s ))
157
169
len = 2 ;
158
170
else
159
- len = 1 ;
171
+ len = pg_ascii_dsplen ( s ) ;
160
172
return len ;
161
173
}
162
174
@@ -244,7 +256,7 @@ pg_euccn_dsplen(const unsigned char *s)
244
256
if (IS_HIGHBIT_SET (* s ))
245
257
len = 2 ;
246
258
else
247
- len = 1 ;
259
+ len = pg_ascii_dsplen ( s ) ;
248
260
return len ;
249
261
}
250
262
@@ -304,7 +316,7 @@ pg_euctw_mblen(const unsigned char *s)
304
316
else if (IS_HIGHBIT_SET (* s ))
305
317
len = 2 ;
306
318
else
307
- len = 1 ;
319
+ len = pg_ascii_dsplen ( s ) ;
308
320
return len ;
309
321
}
310
322
@@ -320,7 +332,7 @@ pg_euctw_dsplen(const unsigned char *s)
320
332
else if (IS_HIGHBIT_SET (* s ))
321
333
len = 2 ;
322
334
else
323
- len = 1 ;
335
+ len = pg_ascii_dsplen ( s ) ;
324
336
return len ;
325
337
}
326
338
@@ -419,10 +431,179 @@ pg_utf_mblen(const unsigned char *s)
419
431
return len ;
420
432
}
421
433
434
+ /*
435
+ * This is an implementation of wcwidth() and wcswidth() as defined in
436
+ * "The Single UNIX Specification, Version 2, The Open Group, 1997"
437
+ * <http://www.UNIX-systems.org/online.html>
438
+ *
439
+ * Markus Kuhn -- 2001-09-08 -- public domain
440
+ *
441
+ * customised for PostgreSQL
442
+ *
443
+ * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
444
+ */
445
+
446
/*
 * One closed range [first, last] of code points, used as an entry in a
 * sorted interval table.  The fields are 16 bits wide, so a table built
 * from this type can only describe code points up to 0xFFFF.
 */
struct mbinterval
{
	unsigned short first;		/* lowest code point in the range */
	unsigned short last;		/* highest code point in the range */
};
451
+
452
+ /* auxiliary function for binary search in interval table */
453
+ static int
454
+ mbbisearch (pg_wchar ucs ,const struct mbinterval * table ,int max )
455
+ {
456
+ int min = 0 ;
457
+ int mid ;
458
+
459
+ if (ucs < table [0 ].first || ucs > table [max ].last )
460
+ return 0 ;
461
+ while (max >=min )
462
+ {
463
+ mid = (min + max ) /2 ;
464
+ if (ucs > table [mid ].last )
465
+ min = mid + 1 ;
466
+ else if (ucs < table [mid ].first )
467
+ max = mid - 1 ;
468
+ else
469
+ return 1 ;
470
+ }
471
+
472
+ return 0 ;
473
+ }
474
+
475
+
476
+ /* The following functions define the column width of an ISO 10646
477
+ * character as follows:
478
+ *
479
+ * - The null character (U+0000) has a column width of 0.
480
+ *
481
+ * - Other C0/C1 control characters and DEL will lead to a return
482
+ *value of -1.
483
+ *
484
+ * - Non-spacing and enclosing combining characters (general
485
+ *category code Mn or Me in the Unicode database) have a
486
+ *column width of 0.
487
+ *
488
+ * - Other format characters (general category code Cf in the Unicode
489
+ *database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
490
+ *
491
+ * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
492
+ *have a column width of 0.
493
+ *
494
+ * - Spacing characters in the East Asian Wide (W) or East Asian
495
+ *FullWidth (F) category as defined in Unicode Technical
496
+ *Report #11 have a column width of 2.
497
+ *
498
+ * - All remaining characters (including all printable
499
+ *ISO 8859-1 and WGL4 characters, Unicode control characters,
500
+ *etc.) have a column width of 1.
501
+ *
502
+ * This implementation assumes that wchar_t characters are encoded
503
+ * in ISO 10646.
504
+ */
505
+
506
+ static int
507
+ ucs_wcwidth (pg_wchar ucs )
508
+ {
509
+ /* sorted list of non-overlapping intervals of non-spacing characters */
510
+ static const struct mbinterval combining []= {
511
+ {0x0300 ,0x034E }, {0x0360 ,0x0362 }, {0x0483 ,0x0486 },
512
+ {0x0488 ,0x0489 }, {0x0591 ,0x05A1 }, {0x05A3 ,0x05B9 },
513
+ {0x05BB ,0x05BD }, {0x05BF ,0x05BF }, {0x05C1 ,0x05C2 },
514
+ {0x05C4 ,0x05C4 }, {0x064B ,0x0655 }, {0x0670 ,0x0670 },
515
+ {0x06D6 ,0x06E4 }, {0x06E7 ,0x06E8 }, {0x06EA ,0x06ED },
516
+ {0x070F ,0x070F }, {0x0711 ,0x0711 }, {0x0730 ,0x074A },
517
+ {0x07A6 ,0x07B0 }, {0x0901 ,0x0902 }, {0x093C ,0x093C },
518
+ {0x0941 ,0x0948 }, {0x094D ,0x094D }, {0x0951 ,0x0954 },
519
+ {0x0962 ,0x0963 }, {0x0981 ,0x0981 }, {0x09BC ,0x09BC },
520
+ {0x09C1 ,0x09C4 }, {0x09CD ,0x09CD }, {0x09E2 ,0x09E3 },
521
+ {0x0A02 ,0x0A02 }, {0x0A3C ,0x0A3C }, {0x0A41 ,0x0A42 },
522
+ {0x0A47 ,0x0A48 }, {0x0A4B ,0x0A4D }, {0x0A70 ,0x0A71 },
523
+ {0x0A81 ,0x0A82 }, {0x0ABC ,0x0ABC }, {0x0AC1 ,0x0AC5 },
524
+ {0x0AC7 ,0x0AC8 }, {0x0ACD ,0x0ACD }, {0x0B01 ,0x0B01 },
525
+ {0x0B3C ,0x0B3C }, {0x0B3F ,0x0B3F }, {0x0B41 ,0x0B43 },
526
+ {0x0B4D ,0x0B4D }, {0x0B56 ,0x0B56 }, {0x0B82 ,0x0B82 },
527
+ {0x0BC0 ,0x0BC0 }, {0x0BCD ,0x0BCD }, {0x0C3E ,0x0C40 },
528
+ {0x0C46 ,0x0C48 }, {0x0C4A ,0x0C4D }, {0x0C55 ,0x0C56 },
529
+ {0x0CBF ,0x0CBF }, {0x0CC6 ,0x0CC6 }, {0x0CCC ,0x0CCD },
530
+ {0x0D41 ,0x0D43 }, {0x0D4D ,0x0D4D }, {0x0DCA ,0x0DCA },
531
+ {0x0DD2 ,0x0DD4 }, {0x0DD6 ,0x0DD6 }, {0x0E31 ,0x0E31 },
532
+ {0x0E34 ,0x0E3A }, {0x0E47 ,0x0E4E }, {0x0EB1 ,0x0EB1 },
533
+ {0x0EB4 ,0x0EB9 }, {0x0EBB ,0x0EBC }, {0x0EC8 ,0x0ECD },
534
+ {0x0F18 ,0x0F19 }, {0x0F35 ,0x0F35 }, {0x0F37 ,0x0F37 },
535
+ {0x0F39 ,0x0F39 }, {0x0F71 ,0x0F7E }, {0x0F80 ,0x0F84 },
536
+ {0x0F86 ,0x0F87 }, {0x0F90 ,0x0F97 }, {0x0F99 ,0x0FBC },
537
+ {0x0FC6 ,0x0FC6 }, {0x102D ,0x1030 }, {0x1032 ,0x1032 },
538
+ {0x1036 ,0x1037 }, {0x1039 ,0x1039 }, {0x1058 ,0x1059 },
539
+ {0x1160 ,0x11FF }, {0x17B7 ,0x17BD }, {0x17C6 ,0x17C6 },
540
+ {0x17C9 ,0x17D3 }, {0x180B ,0x180E }, {0x18A9 ,0x18A9 },
541
+ {0x200B ,0x200F }, {0x202A ,0x202E }, {0x206A ,0x206F },
542
+ {0x20D0 ,0x20E3 }, {0x302A ,0x302F }, {0x3099 ,0x309A },
543
+ {0xFB1E ,0xFB1E }, {0xFE20 ,0xFE23 }, {0xFEFF ,0xFEFF },
544
+ {0xFFF9 ,0xFFFB }
545
+ };
546
+
547
+ /* test for 8-bit control characters */
548
+ if (ucs == 0 )
549
+ return 0 ;
550
+
551
+ if (ucs < 0x20 || (ucs >=0x7f && ucs < 0xa0 )|| ucs > 0x0010ffff )
552
+ return -1 ;
553
+
554
+ /* binary search in table of non-spacing characters */
555
+ if (mbbisearch (ucs ,combining ,
556
+ sizeof (combining ) /sizeof (struct mbinterval )- 1 ))
557
+ return 0 ;
558
+
559
+ /*
560
+ * if we arrive here, ucs is not a combining or C0/C1 control character
561
+ */
562
+
563
+ return 1 +
564
+ (ucs >=0x1100 &&
565
+ (ucs <=0x115f || /* Hangul Jamo init. consonants */
566
+ (ucs >=0x2e80 && ucs <=0xa4cf && (ucs & ~0x0011 )!= 0x300a &&
567
+ ucs != 0x303f )|| /* CJK ... Yi */
568
+ (ucs >=0xac00 && ucs <=0xd7a3 )|| /* Hangul Syllables */
569
+ (ucs >=0xf900 && ucs <=0xfaff )|| /* CJK Compatibility
570
+ * Ideographs */
571
+ (ucs >=0xfe30 && ucs <=0xfe6f )|| /* CJK Compatibility Forms */
572
+ (ucs >=0xff00 && ucs <=0xff5f )|| /* Fullwidth Forms */
573
+ (ucs >=0xffe0 && ucs <=0xffe6 )||
574
+ (ucs >=0x20000 && ucs <=0x2ffff )));
575
+ }
576
+
577
+ static pg_wchar
578
+ utf2ucs (const unsignedchar * c )
579
+ {
580
+ /*
581
+ * one char version of pg_utf2wchar_with_len. no control here, c must
582
+ * point to a large enough string
583
+ */
584
+ if ((* c & 0x80 )== 0 )
585
+ return (pg_wchar )c [0 ];
586
+ else if ((* c & 0xe0 )== 0xc0 )
587
+ return (pg_wchar ) (((c [0 ]& 0x1f ) <<6 ) |
588
+ (c [1 ]& 0x3f ));
589
+ else if ((* c & 0xf0 )== 0xe0 )
590
+ return (pg_wchar ) (((c [0 ]& 0x0f ) <<12 ) |
591
+ ((c [1 ]& 0x3f ) <<6 ) |
592
+ (c [2 ]& 0x3f ));
593
+ else if ((* c & 0xf0 )== 0xf0 )
594
+ return (pg_wchar ) (((c [0 ]& 0x07 ) <<18 ) |
595
+ ((c [1 ]& 0x3f ) <<12 ) |
596
+ ((c [2 ]& 0x3f ) <<6 ) |
597
+ (c [3 ]& 0x3f ));
598
+ else
599
+ /* that is an invalid code on purpose */
600
+ return 0xffffffff ;
601
+ }
602
+
422
603
/*
 * Display width of the UTF-8 character starting at s: the sequence is
 * decoded with utf2ucs() and the resulting code point is passed to
 * ucs_wcwidth(), whose return value is returned unchanged.
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
	return ucs_wcwidth(utf2ucs(s));
}
427
608
428
609
/*
@@ -499,7 +680,7 @@ pg_mule_mblen(const unsigned char *s)
499
680
/*
 * Display width for the MULE internal encoding.
 *
 * Currently this just applies the ASCII rules to the first byte, whatever
 * that byte is; the XXX marker below is kept from the original.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);	/* XXX fix me! */
}
504
685
505
686
/*
@@ -529,7 +710,7 @@ pg_latin1_mblen(const unsigned char *s)
529
710
/*
 * Display width of a LATIN1 character: delegates to pg_ascii_dsplen() on
 * the byte at *s and returns its result unchanged.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);
}
534
715
535
716
/*
@@ -559,7 +740,7 @@ pg_sjis_dsplen(const unsigned char *s)
559
740
else if (IS_HIGHBIT_SET (* s ))
560
741
len = 2 ;/* kanji? */
561
742
else
562
- len = 1 ;/* should be ASCII */
743
+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
563
744
return len ;
564
745
}
565
746
@@ -586,7 +767,7 @@ pg_big5_dsplen(const unsigned char *s)
586
767
if (IS_HIGHBIT_SET (* s ))
587
768
len = 2 ;/* kanji? */
588
769
else
589
- len = 1 ;/* should be ASCII */
770
+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
590
771
return len ;
591
772
}
592
773
@@ -613,7 +794,7 @@ pg_gbk_dsplen(const unsigned char *s)
613
794
if (IS_HIGHBIT_SET (* s ))
614
795
len = 2 ;/* kanji? */
615
796
else
616
- len = 1 ;/* should be ASCII */
797
+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
617
798
return len ;
618
799
}
619
800
@@ -640,7 +821,7 @@ pg_uhc_dsplen(const unsigned char *s)
640
821
if (IS_HIGHBIT_SET (* s ))
641
822
len = 2 ;/* 2byte? */
642
823
else
643
- len = 1 ;/* should be ASCII */
824
+ len = pg_ascii_dsplen ( s ) ;/* should be ASCII */
644
825
return len ;
645
826
}
646
827
@@ -672,10 +853,10 @@ pg_gb18030_dsplen(const unsigned char *s)
672
853
{
673
854
int len ;
674
855
675
- if (!IS_HIGHBIT_SET (* s ))
676
- len = 1 ;/* ASCII */
677
- else
856
+ if (IS_HIGHBIT_SET (* s ))
678
857
len = 2 ;
858
+ else
859
+ len = pg_ascii_dsplen (s );/* ASCII */
679
860
return len ;
680
861
}
681
862