Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 2c265ad

Browse files
committed
Modify the built-in text search parser to handle URLs more nearly according
to RFC 3986. In particular, these characters now terminate the path part of a URL: '"', '<', '>', '\', '^', '`', '{', '|', '}'. The previous behavior was inconsistent and depended on whether a "?" was present in the path. Per gripe from Donald Fraser and spec research by Kevin Grittner. This is a pre-existing bug, but not back-patching since the risks of breaking existing applications seem to outweigh the benefits.
1 parent d64b110 commit 2c265ad

File tree

3 files changed

+86
-17
lines changed

3 files changed

+86
-17
lines changed

‎src/backend/tsearch/wparser_def.c

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.29 2010/04/26 17:10:18 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.30 2010/04/28 02:04:16 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -583,6 +583,35 @@ p_isasclet(TParser *prs)
583583
return (p_isascii(prs)&&p_isalpha(prs)) ?1 :0;
584584
}
585585

586+
staticint
587+
p_isurlchar(TParser*prs)
588+
{
589+
charch;
590+
591+
/* no non-ASCII need apply */
592+
if (prs->state->charlen!=1)
593+
return0;
594+
ch=*(prs->str+prs->state->posbyte);
595+
/* no spaces or control characters */
596+
if (ch <=0x20||ch >=0x7F)
597+
return0;
598+
/* reject characters disallowed by RFC 3986 */
599+
switch (ch)
600+
{
601+
case'"':
602+
case'<':
603+
case'>':
604+
case'\\':
605+
case'^':
606+
case'`':
607+
case'{':
608+
case'|':
609+
case'}':
610+
return0;
611+
}
612+
return1;
613+
}
614+
586615

587616
/* deliberately suppress unused-function complaints for the above */
588617
void_make_compiler_happy(void);
@@ -707,9 +736,9 @@ p_isURLPath(TParser *prs)
707736
intres=0;
708737

709738
tmpprs->state=newTParserPosition(tmpprs->state);
710-
tmpprs->state->state=TPS_InFileFirst;
739+
tmpprs->state->state=TPS_InURLPathFirst;
711740

712-
if (TParserGet(tmpprs)&&(tmpprs->type==URLPATH||tmpprs->type==FILEPATH))
741+
if (TParserGet(tmpprs)&&tmpprs->type==URLPATH)
713742
{
714743
prs->state->posbyte+=tmpprs->lenbytetoken;
715744
prs->state->poschar+=tmpprs->lenchartoken;
@@ -1441,7 +1470,6 @@ static const TParserStateActionItem actionTPS_InFileFirst[] = {
14411470
{p_isdigit,0,A_NEXT,TPS_InFile,0,NULL},
14421471
{p_iseqC,'.',A_NEXT,TPS_InPathFirst,0,NULL},
14431472
{p_iseqC,'_',A_NEXT,TPS_InFile,0,NULL},
1444-
{p_iseqC,'?',A_PUSH,TPS_InURLPathFirst,0,NULL},
14451473
{p_iseqC,'~',A_PUSH,TPS_InFileTwiddle,0,NULL},
14461474
{NULL,0,A_POP,TPS_Null,0,NULL}
14471475
};
@@ -1488,7 +1516,6 @@ static const TParserStateActionItem actionTPS_InFile[] = {
14881516
{p_iseqC,'_',A_NEXT,TPS_InFile,0,NULL},
14891517
{p_iseqC,'-',A_NEXT,TPS_InFile,0,NULL},
14901518
{p_iseqC,'/',A_PUSH,TPS_InFileFirst,0,NULL},
1491-
{p_iseqC,'?',A_PUSH,TPS_InURLPathFirst,0,NULL},
14921519
{NULL,0,A_BINGO,TPS_Base,FILEPATH,NULL}
14931520
};
14941521

@@ -1502,9 +1529,7 @@ static const TParserStateActionItem actionTPS_InFileNext[] = {
15021529

15031530
staticconstTParserStateActionItemactionTPS_InURLPathFirst[]= {
15041531
{p_isEOF,0,A_POP,TPS_Null,0,NULL},
1505-
{p_iseqC,'"',A_POP,TPS_Null,0,NULL},
1506-
{p_iseqC,'\'',A_POP,TPS_Null,0,NULL},
1507-
{p_isnotspace,0,A_CLEAR,TPS_InURLPath,0,NULL},
1532+
{p_isurlchar,0,A_NEXT,TPS_InURLPath,0,NULL},
15081533
{NULL,0,A_POP,TPS_Null,0,NULL},
15091534
};
15101535

@@ -1514,9 +1539,7 @@ static const TParserStateActionItem actionTPS_InURLPathStart[] = {
15141539

15151540
staticconstTParserStateActionItemactionTPS_InURLPath[]= {
15161541
{p_isEOF,0,A_BINGO,TPS_Base,URLPATH,NULL},
1517-
{p_iseqC,'"',A_BINGO,TPS_Base,URLPATH,NULL},
1518-
{p_iseqC,'\'',A_BINGO,TPS_Base,URLPATH,NULL},
1519-
{p_isnotspace,0,A_NEXT,TPS_InURLPath,0,NULL},
1542+
{p_isurlchar,0,A_NEXT,TPS_InURLPath,0,NULL},
15201543
{NULL,0,A_BINGO,TPS_Base,URLPATH,NULL}
15211544
};
15221545

‎src/test/regress/expected/tsearch.out

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,10 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
287287
6 | 4aew.werc.ewr
288288
12 |
289289
14 | http://
290+
5 | 5aew.werc.ewr:8100/?
290291
6 | 5aew.werc.ewr:8100
291-
12 | /?
292+
18 | /?
293+
12 |
292294
1 | ad
293295
12 | =
294296
1 | qwe
@@ -391,22 +393,22 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
391393
12 |
392394
12 | <>
393395
1 | qwerty
394-
(131 rows)
396+
(133 rows)
395397

396398
SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
397399
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
398400
<i <b> wow < jqw <> qwerty');
399-
to_tsvector
400-
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
401-
'+4.0e-10':26 '-4.2':58,60 '/?ad=qwe&dw':7,10,14,22 '/?ad=qwe&dw=%20%32':25 '/awdf/dwqe/4325':46 '/usr/local/fff':45 '/wqe-324/ewr':49 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':61 '234.435':30 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':54,55,56 '455':31 '4aew.werc.ewr':15 '5.005':32 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100':24 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23 'ad':17 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':37 'dw':19 'efd.r':3 'ewr1':43 'ewri2':44 'gist.c':52 'gist.h':50 'gist.h.c':51 'hjwer':42 'jf':39 'jqw':64 'qwe':2,18,27,28,35 'qwe-wer':34 'qwer':38 'qwerti':65 'qwqwe':29 'readlin':53,57,59 'rewt/ewr':47 'sdjk':40 'teodor@stack.net':33 'wefjn':48 'wer':36 'wow':63 'www.com':4
401+
to_tsvector
402+
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
403+
'+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47 '/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39 'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37 'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 'teodor@stack.net':35 'wefjn':50 'wer':38 'wow':65 'www.com':4
402404
(1 row)
403405

404406
SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
405407
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
406408
<i <b> wow < jqw <> qwerty'));
407409
length
408410
--------
409-
51
411+
53
410412
(1 row)
411413

412414
-- ts_debug
@@ -424,6 +426,44 @@ SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def&#xa9;ghi
424426
tag | XML tag | </myns:foo-bar_baz.blurfl> | {} | |
425427
(9 rows)
426428

429+
-- check parsing of URLs
430+
SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
431+
alias | description | token | dictionaries | dictionary | lexemes
432+
----------+---------------+----------------------------------------+--------------+------------+------------------------------------------
433+
protocol | Protocol head | http:// | {} | |
434+
url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx}
435+
host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk}
436+
url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx}
437+
tag | XML tag | </span> | {} | |
438+
(5 rows)
439+
440+
SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
441+
alias | description | token | dictionaries | dictionary | lexemes
442+
----------+---------------+----------------------------+--------------+------------+------------------------------
443+
protocol | Protocol head | http:// | {} | |
444+
url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw}
445+
host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr}
446+
url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw}
447+
tag | XML tag | <span> | {} | |
448+
(5 rows)
449+
450+
SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
451+
alias | description | token | dictionaries | dictionary | lexemes
452+
----------+---------------+----------------------+--------------+------------+------------------------
453+
protocol | Protocol head | http:// | {} | |
454+
url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?}
455+
host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
456+
url_path | URL path | /? | {simple} | simple | {/?}
457+
(4 rows)
458+
459+
SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
460+
alias | description | token | dictionaries | dictionary | lexemes
461+
----------+-------------+------------------------+--------------+------------+--------------------------
462+
url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx}
463+
host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
464+
url_path | URL path | /?xx | {simple} | simple | {/?xx}
465+
(3 rows)
466+
427467
-- to_tsquery
428468
SELECT to_tsquery('english', 'qwe & sKies ');
429469
to_tsquery

‎src/test/regress/sql/tsearch.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://ae
105105

106106
SELECT*from ts_debug('english','<myns:foo-bar_baz.blurfl>abc&nm1;def&#xa9;ghi&#245;jkl</myns:foo-bar_baz.blurfl>');
107107

108+
-- check parsing of URLs
109+
SELECT*from ts_debug('english','http://www.harewoodsolutions.co.uk/press.aspx</span>');
110+
SELECT*from ts_debug('english','http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
111+
SELECT*from ts_debug('english','http://5aew.werc.ewr:8100/?');
112+
SELECT*from ts_debug('english','5aew.werc.ewr:8100/?xx');
113+
108114
-- to_tsquery
109115

110116
SELECT to_tsquery('english','qwe & sKies');

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp