NotificationsYou must be signed in to change notification settings
Fork5
Star26

Commit 78d523b

committed

Improve make_greater_string() with encoding-specific incrementers.

This infrastructure doesn't in any way guarantee that the character we produce will sort before the one we incremented; but it does at least make it much more likely that we'll end up with something that is a valid character, which improves our chances.

Kyotaro Horiguchi, with various adjustments by me.

1 parent 51eba98 commit 78d523b Copy full SHA for 78d523b

File tree

3 files changed

+297

-28

lines changed

src
- backend/utils
  - adt
    - selfuncs.c
  - mb
    - wchar.c
- include/mb
  - pg_wchar.h

3 files changed

+297

-28

lines changed

`‎src/backend/utils/adt/selfuncs.c`

Lines changed: 37 additions & 28 deletions

Original file line number	Diff line number	Diff line change
`@@ -5665,6 +5665,19 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)`
`5665`	`5665`	`}`
`5666`	`5666`
`5667`	`5667`
	`5668`	`+/*`
	`5669`	`+ * For bytea, the increment function need only increment the current byte`
	`5670`	`+ * (there are no multibyte characters to worry about).`
	`5671`	`+ */`
	`5672`	`+staticbool`
	`5673`	`+byte_increment(unsignedchar*ptr,intlen)`
	`5674`	`+{`
	`5675`	`+if (*ptr >=255)`
	`5676`	`+return false;`
	`5677`	`+(*ptr)++;`
	`5678`	`+return true;`
	`5679`	`+}`
	`5680`	`+`
`5668`	`5681`	`/*`
`5669`	`5682`	`* Try to generate a string greater than the given string or any`
`5670`	`5683`	`* string it is a prefix of. If successful, return a palloc'd string`
`@@ -5704,6 +5717,7 @@ make_greater_string(const Const str_const, FmgrInfo ltproc, Oid collation)`
`5704`	`5717`	`intlen;`
`5705`	`5718`	`Datumcmpstr;`
`5706`	`5719`	`text*cmptxt=NULL;`
	`5720`	`+mbcharacter_incrementercharinc;`
`5707`	`5721`
`5708`	`5722`	`/*`
`5709`	`5723`	`* Get a modifiable copy of the prefix string in C-string format, and set`
`@@ -5765,29 +5779,33 @@ make_greater_string(const Const str_const, FmgrInfo ltproc, Oid collation)`
`5765`	`5779`	`}`
`5766`	`5780`	`}`
`5767`	`5781`
	`5782`	`+if (datatype==BYTEAOID)`
	`5783`	`+charinc=&byte_increment;`
	`5784`	`+else`
	`5785`	`+charinc=pg_database_encoding_character_incrementer();`
	`5786`	`+`
`5768`	`5787`	`while (len>0)`
`5769`	`5788`	`{`
`5770`		`-unsignedcharlastchar= (unsignedchar) (workstr+len-1);`
`5771`		`-unsignedcharsavelastchar=*lastchar;`
	`5789`	`+intcharlen;`
	`5790`	`+unsignedchar*lastchar;`
	`5791`	`+Const*workstr_const;`
	`5792`	`+`
	`5793`	`+if (datatype==BYTEAOID)`
	`5794`	`+charlen=1;`
	`5795`	`+else`
	`5796`	`+charlen=len-pg_mbcliplen(workstr,len,len-1);`
	`5797`	`+lastchar= (unsignedchar*) (workstr+len-charlen);`
`5772`	`5798`
`5773`	`5799`	`/*`
`5774`		`- * Try to generate a larger string by incrementing the last byte.`
	`5800`	`+ * Try to generate a larger string by incrementing the last character`
	`5801`	`+ * (for BYTEA, we treat each byte as a character).`
`5775`	`5802`	`*/`
`5776`		`-while (*lastchar< (unsignedchar)255)`
	`5803`	`+if (charinc(lastchar,charlen))`
`5777`	`5804`	`{`
`5778`		`-Const*workstr_const;`
`5779`		`-`
`5780`		`-(*lastchar)++;`
`5781`		`-`
`5782`		`-if (datatype!=BYTEAOID)`
`5783`		`-{`
`5784`		`-/* do not generate invalid encoding sequences */`
`5785`		`-if (!pg_verifymbstr(workstr,len, true))`
`5786`		`-continue;`
`5787`		`-workstr_const=string_to_const(workstr,datatype);`
`5788`		`-}`
`5789`		`-else`
	`5805`	`+if (datatype==BYTEAOID)`
`5790`	`5806`	`workstr_const=string_to_bytea_const(workstr,len);`
	`5807`	`+else`
	`5808`	`+workstr_const=string_to_const(workstr,datatype);`
`5791`	`5809`
`5792`	`5810`	`if (DatumGetBool(FunctionCall2Coll(ltproc,`
`5793`	`5811`	`collation,`
`@@ -5806,20 +5824,11 @@ make_greater_string(const Const str_const, FmgrInfo ltproc, Oid collation)`
`5806`	`5824`	`pfree(workstr_const);`
`5807`	`5825`	`}`
`5808`	`5826`
`5809`		`-/* restore last byte so we don't confuse pg_mbcliplen */`
`5810`		`-*lastchar=savelastchar;`
`5811`		`-`
`5812`	`5827`	`/*`
`5813`		`- * Truncate off the last character, which might be more than 1 byte,`
`5814`		`- * depending on the character encoding.`
	`5828`	`+ * Truncate off the last character or byte.`
`5815`	`5829`	`*/`
`5816`		`-if (datatype!=BYTEAOID&&pg_database_encoding_max_length()>1)`
`5817`		`-len=pg_mbcliplen(workstr,len,len-1);`
`5818`		`-else`
`5819`		`-len-=1;`
`5820`		`-`
`5821`		`-if (datatype!=BYTEAOID)`
`5822`		`-workstr[len]='\0';`
	`5830`	`+len-=charlen;`
	`5831`	`+workstr[len]='\0';`
`5823`	`5832`	`}`
`5824`	`5833`
`5825`	`5834`	`/* Failed... */`

`‎src/backend/utils/mb/wchar.c`

Lines changed: 257 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -1334,6 +1334,244 @@ pg_utf8_islegal(const unsigned char *source, int length)`
`1334`	`1334`	`return true;`
`1335`	`1335`	`}`
`1336`	`1336`
	`1337`	`+#ifndefFRONTEND`
	`1338`	`+`
	`1339`	`+/*`
	`1340`	`+ * Generic character increment function.`
	`1341`	`+ *`
	`1342`	`+ * Not knowing anything about the properties of the encoding in use, we just`
	`1343`	`+ * keep incrementing the last byte until pg_verifymbstr() likes the result,`
	`1344`	`+ * or we run out of values to try.`
	`1345`	`+ *`
	`1346`	`+ * Like all character-increment functions, we must restore the original input`
	`1347`	`+ * string on failure.`
	`1348`	`+ */`
	`1349`	`+staticbool`
	`1350`	`+pg_generic_charinc(unsignedchar*charptr,intlen)`
	`1351`	`+{`
	`1352`	`+unsignedcharlastchar= (unsignedchar) (charptr+len-1);`
	`1353`	`+unsignedcharsavelastchar=*lastchar;`
	`1354`	`+constcharconst_charptr= (constchar)charptr;`
	`1355`	`+`
	`1356`	`+while (*lastchar< (unsignedchar)255)`
	`1357`	`+ {`
	`1358`	`+ (*lastchar)++;`
	`1359`	`+if (!pg_verifymbstr(const_charptr,len, true))`
	`1360`	`+continue;`
	`1361`	`+return true;`
	`1362`	`+ }`
	`1363`	`+`
	`1364`	`+*lastchar=savelastchar;`
	`1365`	`+return false;`
	`1366`	`+}`
	`1367`	`+`
	`1368`	`+/*`
	`1369`	`+ * UTF-8 character increment function.`
	`1370`	`+ *`
	`1371`	`+ * For a one-byte character less than 0x7F, we just increment the byte.`
	`1372`	`+ *`
	`1373`	`+ * For a multibyte character, every byte but the first must fall between 0x80`
	`1374`	`+ * and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment`
	`1375`	`+ * the last byte that's not already at its maximum value, and set any following`
	`1376`	`+ * bytes back to 0x80. If we can't find a byte that's less than the maximum`
	`1377`	`+ * allowable value, we simply fail. We also have some special-case logic to`
	`1378`	`+ * skip regions used for surrogate pair handling, as those should not occur in`
	`1379`	`+ * valid UTF-8.`
	`1380`	`+ *`
	`1381`	`+ * Like all character-increment functions, we must restore the original input`
	`1382`	`+ * string on failure.`
	`1383`	`+ */`
	`1384`	`+staticbool`
	`1385`	`+pg_utf8_increment(unsignedchar*charptr,intlength)`
	`1386`	`+{`
	`1387`	`+unsignedchara;`
	`1388`	`+unsignedcharbak[4];`
	`1389`	`+unsignedcharlimit;`
	`1390`	`+`
	`1391`	`+switch (length)`
	`1392`	`+ {`
	`1393`	`+default:`
	`1394`	`+/* reject lengths 5 and 6 for now */`
	`1395`	`+return false;`
	`1396`	`+case4:`
	`1397`	`+bak[3]=charptr[3];`
	`1398`	`+a=charptr[3];`
	`1399`	`+if (a<0xBF)`
	`1400`	`+ {`
	`1401`	`+charptr[3]++;`
	`1402`	`+break;`
	`1403`	`+ }`
	`1404`	`+charptr[3]=0x80;`
	`1405`	`+/* FALL THRU */`
	`1406`	`+case3:`
	`1407`	`+bak[2]=charptr[2];`
	`1408`	`+a=charptr[2];`
	`1409`	`+if (a<0xBF)`
	`1410`	`+ {`
	`1411`	`+charptr[2]++;`
	`1412`	`+break;`
	`1413`	`+ }`
	`1414`	`+charptr[2]=0x80;`
	`1415`	`+/* FALL THRU */`
	`1416`	`+case2:`
	`1417`	`+bak[1]=charptr[1];`
	`1418`	`+a=charptr[1];`
	`1419`	`+switch (*charptr)`
	`1420`	`+{`
	`1421`	`+case0xED:`
	`1422`	`+limit=0x9F;`
	`1423`	`+break;`
	`1424`	`+case0xF4:`
	`1425`	`+limit=0x8F;`
	`1426`	`+break;`
	`1427`	`+default:`
	`1428`	`+limit=0xBF;`
	`1429`	`+break;`
	`1430`	`+}`
	`1431`	`+if (a<limit)`
	`1432`	`+{`
	`1433`	`+charptr[1]++;`
	`1434`	`+break;`
	`1435`	`+ }`
	`1436`	`+charptr[1]=0x80;`
	`1437`	`+/* FALL THRU */`
	`1438`	`+case1:`
	`1439`	`+bak[0]=*charptr;`
	`1440`	`+a=*charptr;`
	`1441`	`+if (a==0x7F\|\|a==0xDF\|\|a==0xEF\|\|a==0xF4)`
	`1442`	`+{`
	`1443`	`+/* Restore original string. */`
	`1444`	`+memcpy(charptr,bak,length);`
	`1445`	`+return false;`
	`1446`	`+ }`
	`1447`	`+charptr[0]++;`
	`1448`	`+break;`
	`1449`	`+ }`
	`1450`	`+`
	`1451`	`+return true;`
	`1452`	`+}`
	`1453`	`+`
	`1454`	`+/*`
	`1455`	`+ * EUC-JP character increment function.`
	`1456`	`+ *`
	`1457`	`+ * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence`
	`1458`	`+ * representing JIS X 0201 characters, with the second byte ranging between`
	`1459`	`+ * 0xa1 and 0xde. We just increment the last byte if it's less than 0xde,`
	`1460`	`+ * and otherwise rewrite the whole sequence to 0xa1 0xa1.`
	`1461`	`+ *`
	`1462`	`+ * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence`
	`1463`	`+ * in which the last two bytes range between 0xa1 and 0xfe. The last byte`
	`1464`	`+ * is incremented, carrying overflow to the second-to-last byte.`
	`1465`	`+ *`
	`1466`	`+ * If the sequence starts with a value other than the above and its MSB`
	`1467`	`+ * is set, it must be a two-byte sequence representing JIS X 0208 characters`
	`1468`	`+ * with both bytes ranging between 0xa1 and 0xfe. The last byte is incremented,`
	`1469`	`+ * carrying overflow to the second-to-last byte.`
	`1470`	`+ *`
	`1471`	`+ * Otherwise the sequence consists of a single byte representing an ASCII`
	`1472`	`+ * character. It is incremented up to 0x7f.`
	`1473`	`+ *`
	`1474`	`+ * Only the three EUC-JP byte sequences shown below - which have no character`
	`1475`	`+ * allocated - cause this function to fail in spite of their validity: 0x7f,`
	`1476`	`+ * 0xfe 0xfe, 0x8f 0xfe 0xfe.`
	`1477`	`+ */`
	`1478`	`+staticbool`
	`1479`	`+pg_eucjp_increment(unsignedchar*charptr,intlength)`
	`1480`	`+{`
	`1481`	`+unsignedcharbak[3];`
	`1482`	`+unsignedcharc1,c2;`
	`1483`	`+signedinti;`
	`1484`	`+`
	`1485`	`+c1=*charptr;`
	`1486`	`+`
	`1487`	`+switch (c1)`
	`1488`	`+ {`
	`1489`	`+caseSS2:/* JIS X 0201 */`
	`1490`	`+if (length!=2)`
	`1491`	`+return false;`
	`1492`	`+`
	`1493`	`+c2=charptr[1];`
	`1494`	`+`
	`1495`	`+if (c2>0xde)`
	`1496`	`+charptr[0]=charptr[1]=0xa1;`
	`1497`	`+elseif (c2<0xa1)`
	`1498`	`+charptr[1]=0xa1;`
	`1499`	`+else`
	`1500`	`+charptr[1]++;`
	`1501`	`+`
	`1502`	`+break;`
	`1503`	`+`
	`1504`	`+caseSS3:/* JIS X 0212 */`
	`1505`	`+if (length!=3)`
	`1506`	`+return false;`
	`1507`	`+`
	`1508`	`+for (i=2;i>0;i--)`
	`1509`	`+ {`
	`1510`	`+bak[i]=charptr[i];`
	`1511`	`+c2=charptr[i];`
	`1512`	`+if (c2<0xa1)`
	`1513`	`+ {`
	`1514`	`+charptr[i]=0xa1;`
	`1515`	`+return true;`
	`1516`	`+ }`
	`1517`	`+elseif (c2<0xfe)`
	`1518`	`+ {`
	`1519`	`+charptr[i]++;`
	`1520`	`+break;`
	`1521`	`+ }`
	`1522`	`+charptr[i]=0xa1;`
	`1523`	`+ }`
	`1524`	`+`
	`1525`	`+if (i==0)/* Out of 3-byte code region */`
	`1526`	`+ {`
	`1527`	`+charptr[1]=bak[1];`
	`1528`	`+charptr[2]=bak[2];`
	`1529`	`+return false;`
	`1530`	`+ }`
	`1531`	`+break;`
	`1532`	`+`
	`1533`	`+default:`
	`1534`	`+if (IS_HIGHBIT_SET(c1))/* JIS X 0208? */`
	`1535`	`+ {`
	`1536`	`+if (length!=2)`
	`1537`	`+return false;`
	`1538`	`+`
	`1539`	`+for (i=1 ;i >=0 ;i--)/* i must be signed */`
	`1540`	`+ {`
	`1541`	`+bak[i]=charptr[i];`
	`1542`	`+c2=charptr[i];`
	`1543`	`+if (c2<0xa1)`
	`1544`	`+ {`
	`1545`	`+charptr[i]=0xa1;`
	`1546`	`+return true;`
	`1547`	`+ }`
	`1548`	`+elseif (c2<0xfe)`
	`1549`	`+ {`
	`1550`	`+charptr[i]++;`
	`1551`	`+break;`
	`1552`	`+ }`
	`1553`	`+charptr[i]=0xa1;`
	`1554`	`+ }`
	`1555`	`+`
	`1556`	`+if (i<0)/* Out of 2 byte code region */`
	`1557`	`+ {`
	`1558`	`+charptr[0]=bak[0];`
	`1559`	`+charptr[1]=bak[1];`
	`1560`	`+return false;`
	`1561`	`+ }`
	`1562`	`+ }`
	`1563`	`+else`
	`1564`	`+ {/* ASCII, single byte */`
	`1565`	`+if (c1>0x7e)`
	`1566`	`+return false;`
	`1567`	`+ (*charptr)++;`
	`1568`	`+ }`
	`1569`	`+ }`
	`1570`	`+`
	`1571`	`+return true;`
	`1572`	`+}`
	`1573`	`+#endif`
	`1574`	`+`
`1337`	`1575`	`/*`
`1338`	`1576`	`*-------------------------------------------------------------------`
`1339`	`1577`	`* encoding info table`
`@@ -1458,6 +1696,25 @@ pg_database_encoding_max_length(void)`
`1458`	`1696`	`returnpg_wchar_table[GetDatabaseEncoding()].maxmblen;`
`1459`	`1697`	`}`
`1460`	`1698`
	`1699`	`+/*`
	`1700`	`+ * Return the character incrementer for the encoding of the current database.`
	`1701`	`+ */`
	`1702`	`+mbcharacter_incrementer`
	`1703`	`+pg_database_encoding_character_incrementer(void)`
	`1704`	`+{`
	`1705`	`+switch (GetDatabaseEncoding())`
	`1706`	`+{`
	`1707`	`+casePG_UTF8:`
	`1708`	`+returnpg_utf8_increment;`
	`1709`	`+`
	`1710`	`+casePG_EUC_JP:`
	`1711`	`+returnpg_eucjp_increment;`
	`1712`	`+`
	`1713`	`+default:`
	`1714`	`+returnpg_generic_charinc;`
	`1715`	`+}`
	`1716`	`+}`
	`1717`	`+`
`1461`	`1718`	`/*`
`1462`	`1719`	`* Verify mbstr to make sure that it is validly encoded in the current`
`1463`	`1720`	`* database encoding. Otherwise same as pg_verify_mbstr().`

`‎src/include/mb/pg_wchar.h`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -284,6 +284,8 @@ typedef int (mblen_converter) (const unsigned char mbstr);`
`284`	`284`
`285`	`285`	`typedefint (mbdisplaylen_converter) (constunsignedcharmbstr);`
`286`	`286`
	`287`	`+typedefbool (mbcharacter_incrementer) (unsignedcharmbstr,intlen);`
	`288`	`+`
`287`	`289`	`typedefint (mbverifier) (constunsignedcharmbstr,intlen);`
`288`	`290`
`289`	`291`	`typedefstruct`
`@@ -389,6 +391,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,`
`389`	`391`	`externintpg_mbcharcliplen(constchar*mbstr,intlen,intimit);`
`390`	`392`	`externintpg_encoding_max_length(intencoding);`
`391`	`393`	`externintpg_database_encoding_max_length(void);`
	`394`	`+externmbcharacter_incrementerpg_database_encoding_character_incrementer(void);`
`392`	`395`
`393`	`396`	`externintPrepareClientEncoding(intencoding);`
`394`	`397`	`externintSetClientEncoding(intencoding);`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 78d523b

File tree

3 files changed

3 files changed

`‎src/backend/utils/adt/selfuncs.c`

`‎src/backend/utils/mb/wchar.c`

`‎src/include/mb/pg_wchar.h`

0 commit comments