Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 73e6f9d

Browse files
committed
Change text search parsing rules for hyphenated words so that digit strings
containing decimal points aren't considered part of a hyphenated word. Sync the hyphenated-word lookahead states with the subsequent part-by-part reparsing states so that we don't get different answers about how much text is part of the hyphenated word. Per my gripe of a few days ago.
1 parent 1aaf39b commit 73e6f9d

File tree

2 files changed

+21
-82
lines changed

2 files changed

+21
-82
lines changed

‎src/backend/tsearch/wparser_def.c

Lines changed: 13 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.6 2007/10/27 17:53:15 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -181,19 +181,13 @@ typedef enum
181181
TPS_InHyphenWord,
182182
TPS_InHyphenNumWordFirst,
183183
TPS_InHyphenNumWord,
184-
TPS_InHyphenValueFirst,
185-
TPS_InHyphenValue,
186-
TPS_InHyphenValueExact,
184+
TPS_InHyphenDigitLookahead,
187185
TPS_InParseHyphen,
188186
TPS_InParseHyphenHyphen,
189187
TPS_InHyphenWordPart,
190188
TPS_InHyphenAsciiWordPart,
191189
TPS_InHyphenNumWordPart,
192190
TPS_InHyphenUnsignedInt,
193-
TPS_InHDecimalPartFirst,
194-
TPS_InHDecimalPart,
195-
TPS_InHVersionPartFirst,
196-
TPS_InHVersionPart,
197191
TPS_Null/* last state (fake value) */
198192
}TParserState;
199193

@@ -1147,8 +1141,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
11471141
{p_isEOF,0,A_POP,TPS_Null,0,NULL},
11481142
{p_isasclet,0,A_NEXT,TPS_InHyphenAsciiWord,0,NULL},
11491143
{p_isalpha,0,A_NEXT,TPS_InHyphenWord,0,NULL},
1150-
{p_isdigit,0,A_NEXT,TPS_InHyphenValue,0,NULL},
1151-
{p_isdigit,0,A_NEXT,TPS_InHyphenNumWord,0,NULL},
1144+
{p_isdigit,0,A_NEXT,TPS_InHyphenDigitLookahead,0,NULL},
11521145
{NULL,0,A_POP,TPS_Null,0,NULL}
11531146
};
11541147

@@ -1164,8 +1157,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
11641157
staticconstTParserStateActionItemactionTPS_InHyphenWordFirst[]= {
11651158
{p_isEOF,0,A_POP,TPS_Null,0,NULL},
11661159
{p_isalpha,0,A_NEXT,TPS_InHyphenWord,0,NULL},
1167-
{p_isdigit,0,A_NEXT,TPS_InHyphenValue,0,NULL},
1168-
{p_isdigit,0,A_NEXT,TPS_InHyphenNumWord,0,NULL},
1160+
{p_isdigit,0,A_NEXT,TPS_InHyphenDigitLookahead,0,NULL},
11691161
{NULL,0,A_POP,TPS_Null,0,NULL}
11701162
};
11711163

@@ -1179,8 +1171,8 @@ static const TParserStateActionItem actionTPS_InHyphenWord[] = {
11791171

11801172
staticconstTParserStateActionItemactionTPS_InHyphenNumWordFirst[]= {
11811173
{p_isEOF,0,A_POP,TPS_Null,0,NULL},
1182-
{p_isdigit,0,A_NEXT,TPS_InHyphenValue,0,NULL},
11831174
{p_isalpha,0,A_NEXT,TPS_InHyphenNumWord,0,NULL},
1175+
{p_isdigit,0,A_NEXT,TPS_InHyphenDigitLookahead,0,NULL},
11841176
{NULL,0,A_POP,TPS_Null,0,NULL}
11851177
};
11861178

@@ -1191,34 +1183,18 @@ static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
11911183
{NULL,0,A_BINGO |A_CLRALL,TPS_InParseHyphen,NUMHWORD,SpecialHyphen}
11921184
};
11931185

1194-
staticconstTParserStateActionItemactionTPS_InHyphenValueFirst[]= {
1186+
staticconstTParserStateActionItemactionTPS_InHyphenDigitLookahead[]= {
11951187
{p_isEOF,0,A_POP,TPS_Null,0,NULL},
1196-
{p_isdigit,0,A_NEXT,TPS_InHyphenValueExact,0,NULL},
1197-
{NULL,0,A_POP,TPS_Null,0,NULL}
1198-
};
1199-
1200-
staticconstTParserStateActionItemactionTPS_InHyphenValue[]= {
1201-
{p_isEOF,0,A_BINGO |A_CLRALL,TPS_InParseHyphen,NUMHWORD,SpecialHyphen},
1202-
{p_isdigit,0,A_NEXT,TPS_InHyphenValue,0,NULL},
1203-
{p_iseqC,'.',A_PUSH,TPS_InHyphenValueFirst,0,NULL},
1204-
{p_iseqC,'-',A_PUSH,TPS_InHyphenNumWordFirst,0,NULL},
1188+
{p_isdigit,0,A_NEXT,TPS_InHyphenDigitLookahead,0,NULL},
12051189
{p_isalpha,0,A_NEXT,TPS_InHyphenNumWord,0,NULL},
1206-
{NULL,0,A_BINGO |A_CLRALL,TPS_InParseHyphen,NUMHWORD,SpecialHyphen}
1207-
};
1208-
1209-
staticconstTParserStateActionItemactionTPS_InHyphenValueExact[]= {
1210-
{p_isEOF,0,A_BINGO |A_CLRALL,TPS_InParseHyphen,NUMHWORD,SpecialHyphen},
1211-
{p_isdigit,0,A_NEXT,TPS_InHyphenValueExact,0,NULL},
1212-
{p_iseqC,'.',A_PUSH,TPS_InHyphenValueFirst,0,NULL},
1213-
{p_iseqC,'-',A_PUSH,TPS_InHyphenNumWordFirst,0,NULL},
1214-
{NULL,0,A_BINGO |A_CLRALL,TPS_InParseHyphen,NUMHWORD,SpecialHyphen}
1190+
{NULL,0,A_POP,TPS_Null,0,NULL}
12151191
};
12161192

12171193
staticconstTParserStateActionItemactionTPS_InParseHyphen[]= {
12181194
{p_isEOF,0,A_RERUN,TPS_Base,0,NULL},
12191195
{p_isasclet,0,A_NEXT,TPS_InHyphenAsciiWordPart,0,NULL},
12201196
{p_isalpha,0,A_NEXT,TPS_InHyphenWordPart,0,NULL},
1221-
{p_isdigit,0,A_NEXT,TPS_InHyphenUnsignedInt,0,NULL},
1197+
{p_isdigit,0,A_PUSH,TPS_InHyphenUnsignedInt,0,NULL},
12221198
{p_iseqC,'-',A_PUSH,TPS_InParseHyphenHyphen,0,NULL},
12231199
{NULL,0,A_RERUN,TPS_Base,0,NULL}
12241200
};
@@ -1251,39 +1227,12 @@ static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
12511227
};
12521228

12531229
staticconstTParserStateActionItemactionTPS_InHyphenUnsignedInt[]= {
1254-
{p_isEOF,0,A_BINGO,TPS_Base,UNSIGNEDINT,NULL},
1255-
{p_isdigit,0,A_NEXT,TPS_InHyphenUnsignedInt,0,NULL},
1256-
{p_isalpha,0,A_NEXT,TPS_InHyphenNumWordPart,0,NULL},
1257-
{p_iseqC,'.',A_PUSH,TPS_InHDecimalPartFirst,0,NULL},
1258-
{NULL,0,A_BINGO,TPS_InParseHyphen,UNSIGNEDINT,NULL}
1259-
};
1260-
1261-
staticconstTParserStateActionItemactionTPS_InHDecimalPartFirst[]= {
1262-
{p_isEOF,0,A_POP,TPS_Null,0,NULL},
1263-
{p_isdigit,0,A_CLEAR,TPS_InHDecimalPart,0,NULL},
1264-
{NULL,0,A_POP,TPS_Null,0,NULL}
1265-
};
1266-
1267-
staticconstTParserStateActionItemactionTPS_InHDecimalPart[]= {
1268-
{p_isEOF,0,A_BINGO,TPS_Base,DECIMAL,NULL},
1269-
{p_isdigit,0,A_NEXT,TPS_InHDecimalPart,0,NULL},
1270-
{p_iseqC,'.',A_PUSH,TPS_InHVersionPartFirst,0,NULL},
1271-
{NULL,0,A_BINGO,TPS_InParseHyphen,DECIMAL,NULL}
1272-
};
1273-
1274-
staticconstTParserStateActionItemactionTPS_InHVersionPartFirst[]= {
12751230
{p_isEOF,0,A_POP,TPS_Null,0,NULL},
1276-
{p_isdigit,0,A_CLEAR,TPS_InHVersionPart,0,NULL},
1231+
{p_isdigit,0,A_NEXT,TPS_Null,0,NULL},
1232+
{p_isalpha,0,A_CLEAR,TPS_InHyphenNumWordPart,0,NULL},
12771233
{NULL,0,A_POP,TPS_Null,0,NULL}
12781234
};
12791235

1280-
staticconstTParserStateActionItemactionTPS_InHVersionPart[]= {
1281-
{p_isEOF,0,A_BINGO,TPS_Base,VERSIONNUMBER,NULL},
1282-
{p_isdigit,0,A_NEXT,TPS_InHVersionPart,0,NULL},
1283-
{p_iseqC,'.',A_PUSH,TPS_InHVersionPartFirst,0,NULL},
1284-
{NULL,0,A_BINGO,TPS_InParseHyphen,VERSIONNUMBER,NULL}
1285-
};
1286-
12871236

12881237
/*
12891238
* main table of per-state parser actions
@@ -1378,19 +1327,13 @@ static const TParserStateAction Actions[] = {
13781327
TPARSERSTATEACTION(TPS_InHyphenWord),
13791328
TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
13801329
TPARSERSTATEACTION(TPS_InHyphenNumWord),
1381-
TPARSERSTATEACTION(TPS_InHyphenValueFirst),
1382-
TPARSERSTATEACTION(TPS_InHyphenValue),
1383-
TPARSERSTATEACTION(TPS_InHyphenValueExact),
1330+
TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
13841331
TPARSERSTATEACTION(TPS_InParseHyphen),
13851332
TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
13861333
TPARSERSTATEACTION(TPS_InHyphenWordPart),
13871334
TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
13881335
TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1389-
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt),
1390-
TPARSERSTATEACTION(TPS_InHDecimalPartFirst),
1391-
TPARSERSTATEACTION(TPS_InHDecimalPart),
1392-
TPARSERSTATEACTION(TPS_InHVersionPartFirst),
1393-
TPARSERSTATEACTION(TPS_InHVersionPart)
1336+
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
13941337
};
13951338

13961339

‎src/test/regress/expected/tsearch.out

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -352,15 +352,11 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
352352
12 | .
353353
20 | 4.2
354354
12 | ,
355-
15 | readline-4.2
356-
11 | readline
357-
12 | -
358-
20 | 4.2
355+
1 | readline
356+
20 | -4.2
359357
12 |
360-
15 | readline-4.2
361-
11 | readline
362-
12 | -
363-
20 | 4.2
358+
1 | readline
359+
20 | -4.2
364360
12 | .
365361
22 | 234
366362
12 |
@@ -377,14 +373,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
377373
12 |
378374
12 | <>
379375
1 | qwerty
380-
(135 rows)
376+
(131 rows)
381377

382378
SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
383379
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
384380
<i <b> wow < jqw <> qwerty');
385-
to_tsvector
386-
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
387-
'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
381+
to_tsvector
382+
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
383+
'ad':17 'dw':19 'jf':39 '234':61 '345':1 '4.2':54,55,56 '455':31 'jqw':64 'qwe':2,18,27,28,35 'wer':36 'wow':63 '-4.2':58,60 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':65 '234.435':30 'qwe-wer':34 'readlin':53,57,59 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
388384
(1 row)
389385

390386
SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp