1- <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.58 2010/08/20 13:59:45 tgl Exp $ -->
1+ <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.59 2010/08/25 21:42:55 tgl Exp $ -->
22
33<chapter id="textsearch">
44 <title>Full Text Search</title>
112112 as a sorted array of normalized lexemes. Along with the lexemes it is
113113 often desirable to store positional information to use for
114114 <firstterm>proximity ranking</firstterm>, so that a document that
115- contains a more <quote>dense</> region of query words is
115+ contains a more <quote>dense</> region of query words is
116116 assigned a higher rank than one with scattered query words.
117117 </para>
118118 </listitem>
@@ -1151,13 +1151,13 @@ MaxFragments=0, FragmentDelimiter=" ... "
11511151<screen>
11521152SELECT ts_headline('english',
11531153 'The most common type of search
1154- is to find all documents containing given query terms
1154+ is to find all documents containing given query terms
11551155and return them in order of their similarity to the
11561156query.',
11571157 to_tsquery('query & similarity'));
11581158 ts_headline
11591159------------------------------------------------------------
1160- containing given <b>query</b> terms
1160+ containing given <b>query</b> terms
11611161 and return them in order of their <b>similarity</b> to the
11621162 <b>query</b>.
11631163
@@ -1166,7 +1166,7 @@ SELECT ts_headline('english',
11661166is to find all documents containing given query terms
11671167and return them in order of their similarity to the
11681168query.',
1169- to_tsquery('query & similarity'),
1169+ to_tsquery('query & similarity'),
11701170 'StartSel = <, StopSel = >');
11711171 ts_headline
11721172-------------------------------------------------------
@@ -2064,6 +2064,14 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h
20642064 (notice that one token can produce more than one lexeme)
20652065 </para>
20662066 </listitem>
2067+ <listitem>
2068+ <para>
2069+ a single lexeme with the <literal>TSL_FILTER</> flag set, to replace
2070+ the original token with a new token to be passed to subsequent
2071+ dictionaries (a dictionary that does this is called a
2072+ <firstterm>filtering dictionary</>)
2073+ </para>
2074+ </listitem>
20672075 <listitem>
20682076 <para>
20692077 an empty array if the dictionary knows the token, but it is a stop word
@@ -2096,6 +2104,13 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h
20962104 until some dictionary recognizes it as a known word. If it is identified
20972105 as a stop word, or if no dictionary recognizes the token, it will be
20982106 discarded and not indexed or searched for.
2107+ Normally, the first dictionary that returns a non-<literal>NULL</>
2108+ output determines the result, and any remaining dictionaries are not
2109+ consulted; but a filtering dictionary can replace the given word
2110+ with a modified word, which is then passed to subsequent dictionaries.
2111+ </para>
2112+
2113+ <para>
20992114 The general rule for configuring a list of dictionaries
21002115 is to place first the most narrow, most specific dictionary, then the more
21012116 general dictionaries, finishing with a very general dictionary, like
@@ -2112,6 +2127,16 @@ ALTER TEXT SEARCH CONFIGURATION astro_en
21122127</programlisting>
21132128 </para>
21142129
2130+ <para>
2131+ A filtering dictionary can be placed anywhere in the list, except at the
2132+ end, where it would be useless. Filtering dictionaries are useful to partially
2133+ normalize words to simplify the task of later dictionaries. For example,
2134+ a filtering dictionary could be used to remove accents from accented
2135+ letters, as is done by the
2136+ <link linkend="unaccent"><filename>contrib/unaccent</></link>
2137+ extension module.
2138+ </para>
2139+
21152140 <sect2 id="textsearch-stopwords">
21162141 <title>Stop Words</title>
21172142
@@ -2184,7 +2209,7 @@ CREATE TEXT SEARCH DICTIONARY public.simple_dict (
21842209 Here, <literal>english</literal> is the base name of a file of stop words.
21852210 The file's full name will be
21862211 <filename>$SHAREDIR/tsearch_data/english.stop</>,
2187- where <literal>$SHAREDIR</> means the
2212+ where <literal>$SHAREDIR</> means the
21882213 <productname>PostgreSQL</productname> installation's shared-data directory,
21892214 often <filename>/usr/local/share/postgresql</> (use <command>pg_config
21902215 --sharedir</> to determine it if you're not sure).
@@ -2295,85 +2320,82 @@ SELECT * FROM ts_debug('english', 'Paris');
22952320 asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
22962321</screen>
22972322 </para>
2298-
2323+
22992324 <para>
2300- An asterisk (<literal>*</literal>) at the end of definition word indicates
2301- that definition word is a prefix, and <function>to_tsquery()</function>
2302- function will transform that definition to the prefix search format (see
2303- <xref linkend="textsearch-parsing-queries">).
2304- Notice that it is ignored in <function>to_tsvector()</function>.
2325+ The only parameter required by the <literal>synonym</> template is
2326+ <literal>SYNONYMS</>, which is the base name of its configuration file
2327+ — <literal>my_synonyms</> in the above example.
2328+ The file's full name will be
2329+ <filename>$SHAREDIR/tsearch_data/my_synonyms.syn</>
2330+ (where <literal>$SHAREDIR</> means the
2331+ <productname>PostgreSQL</> installation's shared-data directory).
2332+ The file format is just one line
2333+ per word to be substituted, with the word followed by its synonym,
2334+ separated by white space. Blank lines and trailing spaces are ignored.
2335+ </para>
2336+
2337+ <para>
2338+ The <literal>synonym</> template also has an optional parameter
2339+ <literal>CaseSensitive</>, which defaults to <literal>false</>. When
2340+ <literal>CaseSensitive</> is <literal>false</>, words in the synonym file
2341+ are folded to lower case, as are input tokens. When it is
2342+ <literal>true</>, words and tokens are not folded to lower case,
2343+ but are compared as-is.
23052344 </para>
23062345
23072346 <para>
2308- Contents of <filename>$SHAREDIR/tsearch_data/synonym_sample.syn</>:
2347+ An asterisk (<literal>*</literal>) can be placed at the end of a synonym
2348+ in the configuration file. This indicates that the synonym is a prefix.
2349+ The asterisk is ignored when the entry is used in
2350+ <function>to_tsvector()</function>, but when it is used in
2351+ <function>to_tsquery()</function>, the result will be a query item with
2352+ the prefix match marker (see
2353+ <xref linkend="textsearch-parsing-queries">).
2354+ For example, suppose we have these entries in
2355+ <filename>$SHAREDIR/tsearch_data/synonym_sample.syn</>:
23092356<programlisting>
23102357postgres pgsql
23112358postgresql pgsql
23122359postgre pgsql
23132360gogle googl
23142361indices index*
23152362</programlisting>
2316- </para>
2317-
2318- <para>
2319- Results:
2363+ Then we will get these results:
23202364<screen>
2321- =# CREATE TEXT SEARCH DICTIONARY syn (template=synonym, synonyms='synonym_sample');
2322- =# SELECT ts_lexize('syn','indices');
2365+ mydb=# CREATE TEXT SEARCH DICTIONARY syn (template=synonym, synonyms='synonym_sample');
2366+ mydb=# SELECT ts_lexize('syn','indices');
23232367 ts_lexize
23242368-----------
23252369 {index}
23262370(1 row)
23272371
2328- =# CREATE TEXT SEARCH CONFIGURATION tst (copy=simple);
2329- =# ALTER TEXT SEARCH CONFIGURATION tst ALTER MAPPING FOR asciiword WITH syn;
2330- =# SELECT to_tsquery('tst','indices');
2372+ mydb=# CREATE TEXT SEARCH CONFIGURATION tst (copy=simple);
2373+ mydb=# ALTER TEXT SEARCH CONFIGURATION tst ALTER MAPPING FOR asciiword WITH syn;
2374+ mydb=# SELECT to_tsvector('tst','indices');
2375+ to_tsvector
2376+ -------------
2377+ 'index':1
2378+ (1 row)
2379+
2380+ mydb=# SELECT to_tsquery('tst','indices');
23312381 to_tsquery
23322382------------
23332383 'index':*
23342384(1 row)
23352385
2336- =# SELECT 'indexes are very useful'::tsvector;
2386+ mydb=# SELECT 'indexes are very useful'::tsvector;
23372387 tsvector
23382388---------------------------------
23392389 'are' 'indexes' 'useful' 'very'
23402390(1 row)
23412391
2342- =# SELECT 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
2392+ mydb=# SELECT 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
23432393 ?column?
23442394----------
23452395 t
23462396(1 row)
2347-
2348- =# SELECT to_tsvector('tst','indices');
2349- to_tsvector
2350- -------------
2351- 'index':1
2352- (1 row)
23532397</screen>
23542398 </para>
2355-
2356- <para>
2357- The only parameter required by the <literal>synonym</> template is
2358- <literal>SYNONYMS</>, which is the base name of its configuration file
2359- — <literal>my_synonyms</> in the above example.
2360- The file's full name will be
2361- <filename>$SHAREDIR/tsearch_data/my_synonyms.syn</>
2362- (where <literal>$SHAREDIR</> means the
2363- <productname>PostgreSQL</> installation's shared-data directory).
2364- The file format is just one line
2365- per word to be substituted, with the word followed by its synonym,
2366- separated by white space. Blank lines and trailing spaces are ignored.
2367- </para>
2368-
2369- <para>
2370- The <literal>synonym</> template also has an optional parameter
2371- <literal>CaseSensitive</>, which defaults to <literal>false</>. When
2372- <literal>CaseSensitive</> is <literal>false</>, words in the synonym file
2373- are folded to lower case, as are input tokens. When it is
2374- <literal>true</>, words and tokens are not folded to lower case,
2375- but are compared as-is.
2376- </para>
23772399 </sect2>
23782400
23792401 <sect2 id="textsearch-thesaurus">