Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit78d523b

Browse files
committed
Improve make_greater_string() with encoding-specific incrementers.
This infrastructure doesn't in any way guarantee that the character we produce will sort before the one we incremented; but it does at least make it much more likely that we'll end up with something that is a valid character, which improves our chances. Kyotaro Horiguchi, with various adjustments by me.
1 parent 51eba98 commit 78d523b

File tree

3 files changed

+297
-28
lines changed

3 files changed

+297
-28
lines changed

‎src/backend/utils/adt/selfuncs.c

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5665,6 +5665,19 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)
56655665
}
56665666

56675667

5668+
/*
5669+
* For bytea, the increment function need only increment the current byte
5670+
* (there are no multibyte characters to worry about).
5671+
*/
5672+
staticbool
5673+
byte_increment(unsignedchar*ptr,intlen)
5674+
{
5675+
if (*ptr >=255)
5676+
return false;
5677+
(*ptr)++;
5678+
return true;
5679+
}
5680+
56685681
/*
56695682
* Try to generate a string greater than the given string or any
56705683
* string it is a prefix of. If successful, return a palloc'd string
@@ -5704,6 +5717,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
57045717
intlen;
57055718
Datumcmpstr;
57065719
text*cmptxt=NULL;
5720+
mbcharacter_incrementercharinc;
57075721

57085722
/*
57095723
* Get a modifiable copy of the prefix string in C-string format, and set
@@ -5765,29 +5779,33 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
57655779
}
57665780
}
57675781

5782+
if (datatype==BYTEAOID)
5783+
charinc=&byte_increment;
5784+
else
5785+
charinc=pg_database_encoding_character_incrementer();
5786+
57685787
while (len>0)
57695788
{
5770-
unsignedchar*lastchar= (unsignedchar*) (workstr+len-1);
5771-
unsignedcharsavelastchar=*lastchar;
5789+
intcharlen;
5790+
unsignedchar*lastchar;
5791+
Const*workstr_const;
5792+
5793+
if (datatype==BYTEAOID)
5794+
charlen=1;
5795+
else
5796+
charlen=len-pg_mbcliplen(workstr,len,len-1);
5797+
lastchar= (unsignedchar*) (workstr+len-charlen);
57725798

57735799
/*
5774-
* Try to generate a larger string by incrementing the last byte.
5800+
* Try to generate a larger string by incrementing the last character
5801+
* (for BYTEA, we treat each byte as a character).
57755802
*/
5776-
while (*lastchar< (unsignedchar)255)
5803+
if (charinc(lastchar,charlen))
57775804
{
5778-
Const*workstr_const;
5779-
5780-
(*lastchar)++;
5781-
5782-
if (datatype!=BYTEAOID)
5783-
{
5784-
/* do not generate invalid encoding sequences */
5785-
if (!pg_verifymbstr(workstr,len, true))
5786-
continue;
5787-
workstr_const=string_to_const(workstr,datatype);
5788-
}
5789-
else
5805+
if (datatype==BYTEAOID)
57905806
workstr_const=string_to_bytea_const(workstr,len);
5807+
else
5808+
workstr_const=string_to_const(workstr,datatype);
57915809

57925810
if (DatumGetBool(FunctionCall2Coll(ltproc,
57935811
collation,
@@ -5806,20 +5824,11 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
58065824
pfree(workstr_const);
58075825
}
58085826

5809-
/* restore last byte so we don't confuse pg_mbcliplen */
5810-
*lastchar=savelastchar;
5811-
58125827
/*
5813-
* Truncate off the last character, which might be more than 1 byte,
5814-
* depending on the character encoding.
5828+
* Truncate off the last character or byte.
58155829
*/
5816-
if (datatype!=BYTEAOID&&pg_database_encoding_max_length()>1)
5817-
len=pg_mbcliplen(workstr,len,len-1);
5818-
else
5819-
len-=1;
5820-
5821-
if (datatype!=BYTEAOID)
5822-
workstr[len]='\0';
5830+
len-=charlen;
5831+
workstr[len]='\0';
58235832
}
58245833

58255834
/* Failed... */

‎src/backend/utils/mb/wchar.c

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,6 +1334,244 @@ pg_utf8_islegal(const unsigned char *source, int length)
13341334
return true;
13351335
}
13361336

1337+
#ifndefFRONTEND
1338+
1339+
/*
1340+
* Generic character increment function.
1341+
*
1342+
* Not knowing anything about the properties of the encoding in use, we just
1343+
* keep incrementing the last byte until pg_verifymbstr() likes the result,
1344+
* or we run out of values to try.
1345+
*
1346+
* Like all character-increment functions, we must restore the original input
1347+
* string on failure.
1348+
*/
1349+
staticbool
1350+
pg_generic_charinc(unsignedchar*charptr,intlen)
1351+
{
1352+
unsignedchar*lastchar= (unsignedchar*) (charptr+len-1);
1353+
unsignedcharsavelastchar=*lastchar;
1354+
constchar*const_charptr= (constchar*)charptr;
1355+
1356+
while (*lastchar< (unsignedchar)255)
1357+
{
1358+
(*lastchar)++;
1359+
if (!pg_verifymbstr(const_charptr,len, true))
1360+
continue;
1361+
return true;
1362+
}
1363+
1364+
*lastchar=savelastchar;
1365+
return false;
1366+
}
1367+
1368+
/*
1369+
* UTF-8 character increment function.
1370+
*
1371+
* For a one-byte character less than 0x7F, we just increment the byte.
1372+
*
1373+
* For a multibyte character, every byte but the first must fall between 0x80
1374+
* and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
1375+
* the last byte that's not already at its maximum value, and set any following
1376+
* bytes back to 0x80. If we can't find a byte that's less than the maximum
1377+
* allowable value, we simply fail. We also have some special-case logic to
1378+
* skip regions used for surrogate pair handling, as those should not occur in
1379+
* valid UTF-8.
1380+
*
1381+
* Like all character-increment functions, we must restore the original input
1382+
* string on failure.
1383+
*/
1384+
staticbool
1385+
pg_utf8_increment(unsignedchar*charptr,intlength)
1386+
{
1387+
unsignedchara;
1388+
unsignedcharbak[4];
1389+
unsignedcharlimit;
1390+
1391+
switch (length)
1392+
{
1393+
default:
1394+
/* reject lengths 5 and 6 for now */
1395+
return false;
1396+
case4:
1397+
bak[3]=charptr[3];
1398+
a=charptr[3];
1399+
if (a<0xBF)
1400+
{
1401+
charptr[3]++;
1402+
break;
1403+
}
1404+
charptr[3]=0x80;
1405+
/* FALL THRU */
1406+
case3:
1407+
bak[2]=charptr[2];
1408+
a=charptr[2];
1409+
if (a<0xBF)
1410+
{
1411+
charptr[2]++;
1412+
break;
1413+
}
1414+
charptr[2]=0x80;
1415+
/* FALL THRU */
1416+
case2:
1417+
bak[1]=charptr[1];
1418+
a=charptr[1];
1419+
switch (*charptr)
1420+
{
1421+
case0xED:
1422+
limit=0x9F;
1423+
break;
1424+
case0xF4:
1425+
limit=0x8F;
1426+
break;
1427+
default:
1428+
limit=0xBF;
1429+
break;
1430+
}
1431+
if (a<limit)
1432+
{
1433+
charptr[1]++;
1434+
break;
1435+
}
1436+
charptr[1]=0x80;
1437+
/* FALL THRU */
1438+
case1:
1439+
bak[0]=*charptr;
1440+
a=*charptr;
1441+
if (a==0x7F||a==0xDF||a==0xEF||a==0xF4)
1442+
{
1443+
/* Restore original string. */
1444+
memcpy(charptr,bak,length);
1445+
return false;
1446+
}
1447+
charptr[0]++;
1448+
break;
1449+
}
1450+
1451+
return true;
1452+
}
1453+
1454+
/*
1455+
* EUC-JP character increment function.
1456+
*
1457+
* If the sequence starts with SS2(0x8e), it must be a two-byte sequence
1458+
* representing JIS X 0201 characters with the second byte ranging between
1459+
* 0xa1 and 0xde. We just increment the last byte if it's less than 0xde,
1460+
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
1461+
*
1462+
* If the sequence starts with SS3(0x8f), it must be a three-byte sequence
1463+
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
1464+
* is incremented, carrying overflow to the second-to-last byte.
1465+
*
1466+
* If the sequence starts with a value other than the above and its MSB
1467+
* is set, it must be a two-byte sequence representing JIS X 0208 characters
1468+
* with both bytes ranging between 0xa1 and 0xfe. The last byte is incremented,
1469+
* carrying overflow to the second-to-last byte.
1470+
*
1471+
* Otherwise the sequence consists of a single byte representing ASCII
1472+
* characters. It is incremented up to 0x7f.
1473+
*
1474+
* Only three EUC-JP byte sequences shown below - which have no character
1475+
* allocated - make this function fail in spite of their validity: 0x7f,
1476+
* 0xfe 0xfe, 0x8f 0xfe 0xfe.
1477+
*/
1478+
staticbool
1479+
pg_eucjp_increment(unsignedchar*charptr,intlength)
1480+
{
1481+
unsignedcharbak[3];
1482+
unsignedcharc1,c2;
1483+
signedinti;
1484+
1485+
c1=*charptr;
1486+
1487+
switch (c1)
1488+
{
1489+
caseSS2:/* JIS X 0201 */
1490+
if (length!=2)
1491+
return false;
1492+
1493+
c2=charptr[1];
1494+
1495+
if (c2>0xde)
1496+
charptr[0]=charptr[1]=0xa1;
1497+
elseif (c2<0xa1)
1498+
charptr[1]=0xa1;
1499+
else
1500+
charptr[1]++;
1501+
1502+
break;
1503+
1504+
caseSS3:/* JIS X 0212 */
1505+
if (length!=3)
1506+
return false;
1507+
1508+
for (i=2;i>0;i--)
1509+
{
1510+
bak[i]=charptr[i];
1511+
c2=charptr[i];
1512+
if (c2<0xa1)
1513+
{
1514+
charptr[i]=0xa1;
1515+
return true;
1516+
}
1517+
elseif (c2<0xfe)
1518+
{
1519+
charptr[i]++;
1520+
break;
1521+
}
1522+
charptr[i]=0xa1;
1523+
}
1524+
1525+
if (i==0)/* Out of 3-byte code region */
1526+
{
1527+
charptr[1]=bak[1];
1528+
charptr[2]=bak[2];
1529+
return false;
1530+
}
1531+
break;
1532+
1533+
default:
1534+
if (IS_HIGHBIT_SET(c1))/* JIS X 0208? */
1535+
{
1536+
if (length!=2)
1537+
return false;
1538+
1539+
for (i=1 ;i >=0 ;i--)/* i must be signed */
1540+
{
1541+
bak[i]=charptr[i];
1542+
c2=charptr[i];
1543+
if (c2<0xa1)
1544+
{
1545+
charptr[i]=0xa1;
1546+
return true;
1547+
}
1548+
elseif (c2<0xfe)
1549+
{
1550+
charptr[i]++;
1551+
break;
1552+
}
1553+
charptr[i]=0xa1;
1554+
}
1555+
1556+
if (i<0)/* Out of 2 byte code region */
1557+
{
1558+
charptr[0]=bak[0];
1559+
charptr[1]=bak[1];
1560+
return false;
1561+
}
1562+
}
1563+
else
1564+
{/* ASCII, single byte */
1565+
if (c1>0x7e)
1566+
return false;
1567+
(*charptr)++;
1568+
}
1569+
}
1570+
1571+
return true;
1572+
}
1573+
#endif
1574+
13371575
/*
13381576
*-------------------------------------------------------------------
13391577
* encoding info table
@@ -1458,6 +1696,25 @@ pg_database_encoding_max_length(void)
14581696
returnpg_wchar_table[GetDatabaseEncoding()].maxmblen;
14591697
}
14601698

1699+
/*
1700+
* give the character incrementer for the encoding for the current database
1701+
*/
1702+
mbcharacter_incrementer
1703+
pg_database_encoding_character_incrementer(void)
1704+
{
1705+
switch (GetDatabaseEncoding())
1706+
{
1707+
casePG_UTF8:
1708+
returnpg_utf8_increment;
1709+
1710+
casePG_EUC_JP:
1711+
returnpg_eucjp_increment;
1712+
1713+
default:
1714+
returnpg_generic_charinc;
1715+
}
1716+
}
1717+
14611718
/*
14621719
* Verify mbstr to make sure that it is validly encoded in the current
14631720
* database encoding. Otherwise same as pg_verify_mbstr().

‎src/include/mb/pg_wchar.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr);
284284

285285
typedefint (*mbdisplaylen_converter) (constunsignedchar*mbstr);
286286

287+
typedefbool (*mbcharacter_incrementer) (unsignedchar*mbstr,intlen);
288+
287289
typedefint (*mbverifier) (constunsignedchar*mbstr,intlen);
288290

289291
typedefstruct
@@ -389,6 +391,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
389391
externintpg_mbcharcliplen(constchar*mbstr,intlen,intimit);
390392
externintpg_encoding_max_length(intencoding);
391393
externintpg_database_encoding_max_length(void);
394+
externmbcharacter_incrementerpg_database_encoding_character_incrementer(void);
392395

393396
externintPrepareClientEncoding(intencoding);
394397
externintSetClientEncoding(intencoding);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp