Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitbab9821

Browse files
committed
Update display widths as part of updating Unicode
The hardcoded "wide character" set in ucs_wcwidth() was last updated around the Unicode 5.0 era. This led to misalignment when printing emojis and other codepoints that have since been designated wide or full-width.

To fix and keep up to date, extend update-unicode to download the list of wide and full-width codepoints from the official sources.

In passing, remove some comments about non-spacing characters that haven't been accurate since we removed the former hardcoded logic.

Jacob Champion

Reported and reviewed by Pavel Stehule

Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com
1 parent1563ecb commitbab9821

File tree

5 files changed

+220
-27
lines changed

5 files changed

+220
-27
lines changed

‎src/common/unicode/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
# Downloaded files
55
/CompositionExclusions.txt
66
/DerivedNormalizationProps.txt
7+
/EastAsianWidth.txt
78
/NormalizationTest.txt
89
/UnicodeData.txt

‎src/common/unicode/Makefile

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
1818
# By default, do nothing.
1919
all:
2020

21-
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
21+
update-unicode: unicode_norm_table.h unicode_combining_table.hunicode_east_asian_fw_table.hunicode_normprops_table.h unicode_norm_hashfunc.h
2222
mv$^ ../../../src/include/common/
2323
$(MAKE) normalization-check
2424

2525
# These files are part of the Unicode Character Database. Download
2626
# them on demand. The dependency on Makefile.global is for
2727
# UNICODE_VERSION.
28-
UnicodeData.txtDerivedNormalizationProps.txtCompositionExclusions.txtNormalizationTest.txt:$(top_builddir)/src/Makefile.global
28+
UnicodeData.txtEastAsianWidth.txtDerivedNormalizationProps.txtCompositionExclusions.txtNormalizationTest.txt:$(top_builddir)/src/Makefile.global
2929
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3030

3131
# Generation of conversion tables used for string normalization with
@@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
3838
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
3939
$(PERL)$^>$@
4040

41+
unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
42+
$(PERL)$^>$@
43+
4144
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
4245
$(PERL)$^>$@
4346

@@ -64,6 +67,6 @@ clean:
6467
rm -f$(OBJS) norm_test norm_test.o
6568

6669
distclean: clean
67-
rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
70+
rm -f UnicodeData.txtEastAsianWidth.txtCompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
6871

6972
maintainer-clean: distclean
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/perl
2+
#
3+
# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
4+
# and East Asian Fullwidth (F) characters, using Unicode data files as input.
5+
# Pass EastAsianWidth.txt as argument. The output is on stdout.
6+
#
7+
# Copyright (c) 2019-2021, PostgreSQL Global Development Group
8+
9+
use strict;
10+
use warnings;
11+
12+
my$range_start =undef;
13+
my ($first,$last);
14+
my$prev_last;
15+
16+
print
17+
"/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
18+
19+
print"static const struct mbinterval east_asian_fw[] = {\n";
20+
21+
foreachmy$line (<ARGV>)
22+
{
23+
chomp$line;
24+
$line =~s/\s*#.*$//;
25+
nextif$lineeq'';
26+
my ($codepoint,$width) =split';',$line;
27+
28+
if ($codepoint =~/\.\./)
29+
{
30+
($first,$last) =split /\.\./,$codepoint;
31+
}
32+
else
33+
{
34+
$first =$last =$codepoint;
35+
}
36+
37+
($first,$last) =map(hex, ($first,$last));
38+
39+
if ($widtheq'F' ||$widtheq'W')
40+
{
41+
# fullwidth/wide characters
42+
if (!defined($range_start))
43+
{
44+
# save for start of range if one hasn't been started yet
45+
$range_start =$first;
46+
}
47+
elsif ($first !=$prev_last + 1)
48+
{
49+
# ranges aren't contiguous; emit the last and start a new one
50+
printf"\t{0x%04X, 0x%04X},\n",$range_start,$prev_last;
51+
$range_start =$first;
52+
}
53+
}
54+
else
55+
{
56+
# not wide characters, print out previous range if any
57+
if (defined($range_start))
58+
{
59+
printf"\t{0x%04X, 0x%04X},\n",$range_start,$prev_last;
60+
$range_start =undef;
61+
}
62+
}
63+
}
64+
continue
65+
{
66+
$prev_last =$last;
67+
}
68+
69+
# don't forget any ranges at the very end of the database (though there are none
70+
# as of Unicode 13.0)
71+
if (defined($range_start))
72+
{
73+
printf"\t{0x%04X, 0x%04X},\n",$range_start,$prev_last;
74+
}
75+
76+
print"};\n";

‎src/common/wchar.c

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -583,8 +583,8 @@ pg_utf_mblen(const unsigned char *s)
583583

584584
structmbinterval
585585
{
586-
unsignedshortfirst;
587-
unsignedshortlast;
586+
unsignedintfirst;
587+
unsignedintlast;
588588
};
589589

590590
/* auxiliary function for binary search in interval table */
@@ -623,12 +623,6 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
623623
*category code Mn or Me in the Unicode database) have a
624624
*column width of 0.
625625
*
626-
* - Other format characters (general category code Cf in the Unicode
627-
*database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
628-
*
629-
* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
630-
*have a column width of 0.
631-
*
632626
* - Spacing characters in the East Asian Wide (W) or East Asian
633627
*FullWidth (F) category as defined in Unicode Technical
634628
*Report #11 have a column width of 2.
@@ -645,6 +639,7 @@ static int
645639
ucs_wcwidth(pg_wcharucs)
646640
{
647641
#include"common/unicode_combining_table.h"
642+
#include"common/unicode_east_asian_fw_table.h"
648643

649644
/* test for 8-bit control characters */
650645
if (ucs==0)
@@ -653,27 +648,25 @@ ucs_wcwidth(pg_wchar ucs)
653648
if (ucs<0x20|| (ucs >=0x7f&&ucs<0xa0)||ucs>0x0010ffff)
654649
return-1;
655650

656-
/* binary search in table of non-spacing characters */
651+
/*
652+
* binary search in table of non-spacing characters
653+
*
654+
* XXX: In the official Unicode sources, it is possible for a character to
655+
* be described as both non-spacing and wide at the same time. As of
656+
* Unicode 13.0, treating the non-spacing property as the determining
657+
* factor for display width leads to the correct behavior, so do that
658+
* search first.
659+
*/
657660
if (mbbisearch(ucs,combining,
658661
sizeof(combining) /sizeof(structmbinterval)-1))
659662
return0;
660663

661-
/*
662-
* if we arrive here, ucs is not a combining or C0/C1 control character
663-
*/
664+
/* binary search in table of wide characters */
665+
if (mbbisearch(ucs,east_asian_fw,
666+
sizeof(east_asian_fw) /sizeof(structmbinterval)-1))
667+
return2;
664668

665-
return1+
666-
(ucs >=0x1100&&
667-
(ucs <=0x115f||/* Hangul Jamo init. consonants */
668-
(ucs >=0x2e80&&ucs <=0xa4cf&& (ucs& ~0x0011)!=0x300a&&
669-
ucs!=0x303f)||/* CJK ... Yi */
670-
(ucs >=0xac00&&ucs <=0xd7a3)||/* Hangul Syllables */
671-
(ucs >=0xf900&&ucs <=0xfaff)||/* CJK Compatibility
672-
* Ideographs */
673-
(ucs >=0xfe30&&ucs <=0xfe6f)||/* CJK Compatibility Forms */
674-
(ucs >=0xff00&&ucs <=0xff5f)||/* Fullwidth Forms */
675-
(ucs >=0xffe0&&ucs <=0xffe6)||
676-
(ucs >=0x20000&&ucs <=0x2ffff)));
669+
return1;
677670
}
678671

679672
/*
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */
2+
3+
staticconststructmbintervaleast_asian_fw[]= {
4+
{0x1100,0x115F},
5+
{0x231A,0x231B},
6+
{0x2329,0x232A},
7+
{0x23E9,0x23EC},
8+
{0x23F0,0x23F0},
9+
{0x23F3,0x23F3},
10+
{0x25FD,0x25FE},
11+
{0x2614,0x2615},
12+
{0x2648,0x2653},
13+
{0x267F,0x267F},
14+
{0x2693,0x2693},
15+
{0x26A1,0x26A1},
16+
{0x26AA,0x26AB},
17+
{0x26BD,0x26BE},
18+
{0x26C4,0x26C5},
19+
{0x26CE,0x26CE},
20+
{0x26D4,0x26D4},
21+
{0x26EA,0x26EA},
22+
{0x26F2,0x26F3},
23+
{0x26F5,0x26F5},
24+
{0x26FA,0x26FA},
25+
{0x26FD,0x26FD},
26+
{0x2705,0x2705},
27+
{0x270A,0x270B},
28+
{0x2728,0x2728},
29+
{0x274C,0x274C},
30+
{0x274E,0x274E},
31+
{0x2753,0x2755},
32+
{0x2757,0x2757},
33+
{0x2795,0x2797},
34+
{0x27B0,0x27B0},
35+
{0x27BF,0x27BF},
36+
{0x2B1B,0x2B1C},
37+
{0x2B50,0x2B50},
38+
{0x2B55,0x2B55},
39+
{0x2E80,0x2E99},
40+
{0x2E9B,0x2EF3},
41+
{0x2F00,0x2FD5},
42+
{0x2FF0,0x2FFB},
43+
{0x3000,0x303E},
44+
{0x3041,0x3096},
45+
{0x3099,0x30FF},
46+
{0x3105,0x312F},
47+
{0x3131,0x318E},
48+
{0x3190,0x31E3},
49+
{0x31F0,0x321E},
50+
{0x3220,0x3247},
51+
{0x3250,0x4DBF},
52+
{0x4E00,0xA48C},
53+
{0xA490,0xA4C6},
54+
{0xA960,0xA97C},
55+
{0xAC00,0xD7A3},
56+
{0xF900,0xFAFF},
57+
{0xFE10,0xFE19},
58+
{0xFE30,0xFE52},
59+
{0xFE54,0xFE66},
60+
{0xFE68,0xFE6B},
61+
{0xFF01,0xFF60},
62+
{0xFFE0,0xFFE6},
63+
{0x16FE0,0x16FE4},
64+
{0x16FF0,0x16FF1},
65+
{0x17000,0x187F7},
66+
{0x18800,0x18CD5},
67+
{0x18D00,0x18D08},
68+
{0x1B000,0x1B11E},
69+
{0x1B150,0x1B152},
70+
{0x1B164,0x1B167},
71+
{0x1B170,0x1B2FB},
72+
{0x1F004,0x1F004},
73+
{0x1F0CF,0x1F0CF},
74+
{0x1F18E,0x1F18E},
75+
{0x1F191,0x1F19A},
76+
{0x1F200,0x1F202},
77+
{0x1F210,0x1F23B},
78+
{0x1F240,0x1F248},
79+
{0x1F250,0x1F251},
80+
{0x1F260,0x1F265},
81+
{0x1F300,0x1F320},
82+
{0x1F32D,0x1F335},
83+
{0x1F337,0x1F37C},
84+
{0x1F37E,0x1F393},
85+
{0x1F3A0,0x1F3CA},
86+
{0x1F3CF,0x1F3D3},
87+
{0x1F3E0,0x1F3F0},
88+
{0x1F3F4,0x1F3F4},
89+
{0x1F3F8,0x1F43E},
90+
{0x1F440,0x1F440},
91+
{0x1F442,0x1F4FC},
92+
{0x1F4FF,0x1F53D},
93+
{0x1F54B,0x1F54E},
94+
{0x1F550,0x1F567},
95+
{0x1F57A,0x1F57A},
96+
{0x1F595,0x1F596},
97+
{0x1F5A4,0x1F5A4},
98+
{0x1F5FB,0x1F64F},
99+
{0x1F680,0x1F6C5},
100+
{0x1F6CC,0x1F6CC},
101+
{0x1F6D0,0x1F6D2},
102+
{0x1F6D5,0x1F6D7},
103+
{0x1F6EB,0x1F6EC},
104+
{0x1F6F4,0x1F6FC},
105+
{0x1F7E0,0x1F7EB},
106+
{0x1F90C,0x1F93A},
107+
{0x1F93C,0x1F945},
108+
{0x1F947,0x1F978},
109+
{0x1F97A,0x1F9CB},
110+
{0x1F9CD,0x1F9FF},
111+
{0x1FA70,0x1FA74},
112+
{0x1FA78,0x1FA7A},
113+
{0x1FA80,0x1FA86},
114+
{0x1FA90,0x1FAA8},
115+
{0x1FAB0,0x1FAB6},
116+
{0x1FAC0,0x1FAC2},
117+
{0x1FAD0,0x1FAD6},
118+
{0x20000,0x2FFFD},
119+
{0x30000,0x3FFFD},
120+
};

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp