1- <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.22 2007/10/22 03:37:04 tgl Exp $ -->
1+ <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.23 2007/10/22 20:13:37 tgl Exp $ -->
22
33<chapter id="textsearch">
44 <title id="textsearch-title">Full Text Search</title>
@@ -1699,18 +1699,18 @@ ON messages FOR EACH ROW EXECUTE PROCEDURE messages_trigger();
16991699 <itemizedlist spacing="compact" mark="bullet">
17001700 <listitem>
17011701 <para>
1702- <structname>word</> <type>text</> — the value of a lexeme
1702+ <replaceable>word</> <type>text</> — the value of a lexeme
17031703 </para>
17041704 </listitem>
17051705 <listitem>
17061706 <para>
1707- <structname>ndoc</> <type>integer</> — number of documents
1707+ <replaceable>ndoc</> <type>integer</> — number of documents
17081708 (<type>tsvector</>s) the word occurred in
17091709 </para>
17101710 </listitem>
17111711 <listitem>
17121712 <para>
1713- <structname>nentry</> <type>integer</> — total number of
1713+ <replaceable>nentry</> <type>integer</> — total number of
17141714 occurrences of the word
17151715 </para>
17161716 </listitem>
@@ -1901,8 +1901,8 @@ LIMIT 10;
19011901 as the entire word and as each component:
19021902
19031903<programlisting>
1904- SELECT "Alias", "Description", "Token" FROM ts_debug('foo-bar-beta1');
1905- Alias | Description | Token
1904+ SELECT alias, description, token FROM ts_debug('foo-bar-beta1');
1905+ alias | description | token
19061906-------------+-------------------------------+---------------
19071907 hword | Hyphenated word | foo-bar-beta1
19081908 lpart_hword | Latin part of hyphenated word | foo
@@ -1917,8 +1917,8 @@ SELECT "Alias", "Description", "Token" FROM ts_debug('foo-bar-beta1');
19171917 instructive example:
19181918
19191919<programlisting>
1920- SELECT "Alias", "Description", "Token" FROM ts_debug('http://foo.com/stuff/index.html');
1921- Alias | Description | Token
1920+ SELECT alias, description, token FROM ts_debug('http://foo.com/stuff/index.html');
1921+ alias | description | token
19221922----------+---------------+--------------------------
19231923 protocol | Protocol head | http://
19241924 url | URL | foo.com/stuff/index.html
@@ -2186,25 +2186,23 @@ SELECT ts_lexize('public.simple_dict','The');
21862186 synonym dictionary and put it before the <literal>english_stem</> dictionary:
21872187
21882188<programlisting>
2189- SELECT * FROM ts_debug('english','Paris');
2190- Alias | Description | Token | Dictionaries | Lexized token
2191- -------+-------------+-------+----------------+----------------------
2192- lword | Latin word | Paris | {english_stem} | english_stem: {pari}
2193- (1 row)
2189+ SELECT * FROM ts_debug('english', 'Paris');
2190+ alias | description | token | dictionaries | dictionary | lexemes
2191+ -------+-------------+-------+----------------+--------------+---------
2192+ lword | Latin word | Paris | {english_stem} | english_stem | {pari}
21942193
2195- CREATE TEXT SEARCH DICTIONARY synonym (
2194+ CREATE TEXT SEARCH DICTIONARY my_synonym (
21962195 TEMPLATE = synonym,
21972196 SYNONYMS = my_synonyms
21982197);
21992198
22002199ALTER TEXT SEARCH CONFIGURATION english
2201- ALTER MAPPING FOR lword WITH synonym, english_stem;
2200+ ALTER MAPPING FOR lword WITH my_synonym, english_stem;
22022201
2203- SELECT * FROM ts_debug('english','Paris');
2204- Alias | Description | Token | Dictionaries | Lexized token
2205- -------+-------------+-------+------------------------+------------------
2206- lword | Latin word | Paris | {synonym,english_stem} | synonym: {paris}
2207- (1 row)
2202+ SELECT * FROM ts_debug('english', 'Paris');
2203+ alias | description | token | dictionaries | dictionary | lexemes
2204+ -------+-------------+-------+---------------------------+------------+---------
2205+ lword | Latin word | Paris | {my_synonym,english_stem} | my_synonym | {paris}
22082206</programlisting>
22092207 </para>
22102208
@@ -2711,7 +2709,14 @@ SHOW default_text_search_config;
27112709 </indexterm>
27122710
27132711 <synopsis>
2714- ts_debug(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>) returns <type>setof ts_debug</>
2712+ ts_debug(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>,
2713+ OUT <replaceable class="PARAMETER">alias</> <type>text</>,
2714+ OUT <replaceable class="PARAMETER">description</> <type>text</>,
2715+ OUT <replaceable class="PARAMETER">token</> <type>text</>,
2716+ OUT <replaceable class="PARAMETER">dictionaries</> <type>regdictionary[]</>,
2717+ OUT <replaceable class="PARAMETER">dictionary</> <type>regdictionary</>,
2718+ OUT <replaceable class="PARAMETER">lexemes</> <type>text[]</>)
2719+ returns setof record
27152720 </synopsis>
27162721
27172722 <para>
@@ -2725,57 +2730,80 @@ SHOW default_text_search_config;
27252730 </para>
27262731
27272732 <para>
2728- <function>ts_debug</>'s result row type is defined as:
2733+ <function>ts_debug</> returns one row for each token identified in the text
2734+ by the parser. The columns returned are
27292735
2730- <programlisting>
2731- CREATE TYPE ts_debug AS (
2732- "Alias" text,
2733- "Description" text,
2734- "Token" text,
2735- "Dictionaries" regdictionary[],
2736- "Lexized token" text
2737- );
2738- </programlisting>
2739-
2740- One row is produced for each token identified by the parser.
2741- The first three columns describe the token, and the fourth lists
2742- the dictionaries selected by the configuration for that token's type.
2743- The last column shows the result of dictionary processing: which
2744- dictionary (if any) recognized the token, and what it produced.
2736+ <itemizedlist spacing="compact" mark="bullet">
2737+ <listitem>
2738+ <para>
2739+ <replaceable>alias</> <type>text</> — short name of the token type
2740+ </para>
2741+ </listitem>
2742+ <listitem>
2743+ <para>
2744+ <replaceable>description</> <type>text</> — description of the
2745+ token type
2746+ </para>
2747+ </listitem>
2748+ <listitem>
2749+ <para>
2750+ <replaceable>token</> <type>text</> — text of the token
2751+ </para>
2752+ </listitem>
2753+ <listitem>
2754+ <para>
2755+ <replaceable>dictionaries</> <type>regdictionary[]</> — the
2756+ dictionaries selected by the configuration for this token type
2757+ </para>
2758+ </listitem>
2759+ <listitem>
2760+ <para>
2761+ <replaceable>dictionary</> <type>regdictionary</> — the dictionary
2762+ that recognized the token, or <literal>NULL</> if none did
2763+ </para>
2764+ </listitem>
2765+ <listitem>
2766+ <para>
2767+ <replaceable>lexemes</> <type>text[]</> — the lexeme(s) produced
2768+ by the dictionary that recognized the token, or <literal>NULL</> if
2769+ none did; an empty array (<literal>{}</>) means it was recognized as a
2770+ stop word
2771+ </para>
2772+ </listitem>
2773+ </itemizedlist>
27452774 </para>
27462775
27472776 <para>
27482777 Here is a simple example:
27492778
27502779<programlisting>
27512780SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats');
2752- Alias | Description | Token | Dictionaries | Lexized token
2753- -------+---------------+-------+--------------+----------------
2754- lword | Latin word | a | {english} | english: {}
2755- blank | Space symbols | | |
2756- lword | Latin word | fat | {english} | english: {fat}
2757- blank | Space symbols | | |
2758- lword | Latin word | cat | {english} | english: {cat}
2759- blank | Space symbols | | |
2760- lword | Latin word | sat | {english} | english: {sat}
2761- blank | Space symbols | | |
2762- lword | Latin word | on | {english} | english: {}
2763- blank | Space symbols | | |
2764- lword | Latin word | a | {english} | english: {}
2765- blank | Space symbols | | |
2766- lword | Latin word | mat | {english} | english: {mat}
2767- blank | Space symbols | | |
2768- blank | Space symbols | - | |
2769- lword | Latin word | it | {english} | english: {}
2770- blank | Space symbols | | |
2771- lword | Latin word | ate | {english} | english: {ate}
2772- blank | Space symbols | | |
2773- lword | Latin word | a | {english} | english: {}
2774- blank | Space symbols | | |
2775- lword | Latin word | fat | {english} | english: {fat}
2776- blank | Space symbols | | |
2777- lword | Latin word | rats | {english} | english: {rat}
2778- (24 rows)
2781+ alias | description | token | dictionaries | dictionary | lexemes
2782+ -------+---------------+-------+----------------+--------------+---------
2783+ lword | Latin word | a | {english_stem} | english_stem | {}
2784+ blank | Space symbols | | {} | |
2785+ lword | Latin word | fat | {english_stem} | english_stem | {fat}
2786+ blank | Space symbols | | {} | |
2787+ lword | Latin word | cat | {english_stem} | english_stem | {cat}
2788+ blank | Space symbols | | {} | |
2789+ lword | Latin word | sat | {english_stem} | english_stem | {sat}
2790+ blank | Space symbols | | {} | |
2791+ lword | Latin word | on | {english_stem} | english_stem | {}
2792+ blank | Space symbols | | {} | |
2793+ lword | Latin word | a | {english_stem} | english_stem | {}
2794+ blank | Space symbols | | {} | |
2795+ lword | Latin word | mat | {english_stem} | english_stem | {mat}
2796+ blank | Space symbols | | {} | |
2797+ blank | Space symbols | - | {} | |
2798+ lword | Latin word | it | {english_stem} | english_stem | {}
2799+ blank | Space symbols | | {} | |
2800+ lword | Latin word | ate | {english_stem} | english_stem | {ate}
2801+ blank | Space symbols | | {} | |
2802+ lword | Latin word | a | {english_stem} | english_stem | {}
2803+ blank | Space symbols | | {} | |
2804+ lword | Latin word | fat | {english_stem} | english_stem | {fat}
2805+ blank | Space symbols | | {} | |
2806+ lword | Latin word | rats | {english_stem} | english_stem | {rat}
27792807</programlisting>
27802808 </para>
27812809
@@ -2801,34 +2829,33 @@ ALTER TEXT SEARCH CONFIGURATION public.english
28012829
28022830<programlisting>
28032831SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
2804- Alias | Description | Token | Dictionaries | Lexized token
2805- -------+---------------+-------------+-------------------------------------------------+-------------------------------------
2806- lword | Latin word | The | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {}
2807- blank | Space symbols | | |
2808- lword | Latin word | Brightest | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {bright}
2809- blank | Space symbols | | |
2810- lword | Latin word | supernovaes | {public.english_ispell,pg_catalog.english_stem} | pg_catalog.english_stem: {supernova}
2811- (5 rows)
2832+ alias | description | token | dictionaries | dictionary | lexemes
2833+ -------+---------------+-------------+-------------------------------+----------------+-------------
2834+ lword | Latin word | The | {english_ispell,english_stem} | english_ispell | {}
2835+ blank | Space symbols | | {} | |
2836+ lword | Latin word | Brightest | {english_ispell,english_stem} | english_ispell | {bright}
2837+ blank | Space symbols | | {} | |
2838+ lword | Latin word | supernovaes | {english_ispell,english_stem} | english_stem | {supernova}
28122839</programlisting>
28132840
28142841 <para>
28152842 In this example, the word <literal>Brightest</> was recognized by the
28162843 parser as a <literal>Latin word</literal> (alias <literal>lword</literal>).
28172844 For this token type the dictionary list is
2818- <literal>public.english_ispell</> and
2819- <literal>pg_catalog.english_stem</literal>. The word was recognized by
2820- <literal>public.english_ispell</literal>, which reduced it to the noun
2845+ <literal>english_ispell</> and
2846+ <literal>english_stem</literal>. The word was recognized by
2847+ <literal>english_ispell</literal>, which reduced it to the noun
28212848 <literal>bright</literal>. The word <literal>supernovaes</literal> is
2822- unknown to the <literal>public.english_ispell</literal> dictionary so it
2849+ unknown to the <literal>english_ispell</literal> dictionary so it
28232850 was passed to the next dictionary, and, fortunately, was recognized (in
2824- fact, <literal>public.english_stem</literal> is a Snowball dictionary which
2851+ fact, <literal>english_stem</literal> is a Snowball dictionary which
28252852 recognizes everything; that is why it was placed at the end of the
28262853 dictionary list).
28272854 </para>
28282855
28292856 <para>
28302857 The word <literal>The</literal> was recognized by the
2831- <literal>public.english_ispell</literal> dictionary as a stop word (<xref
2858+ <literal>english_ispell</literal> dictionary as a stop word (<xref
28322859 linkend="textsearch-stopwords">) and will not be indexed.
28332860 The spaces are discarded too, since the configuration provides no
28342861 dictionaries at all for them.
@@ -2839,16 +2866,15 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
28392866 you want to see:
28402867
28412868<programlisting>
2842- SELECT "Alias", "Token", "Lexized token"
2869+ SELECT alias, token, dictionary, lexemes
28432870FROM ts_debug('public.english','The Brightest supernovaes');
2844- Alias | Token | Lexized token
2845- -------+-------------+--------------------------------------
2846- lword | The | public.english_ispell: {}
2847- blank | |
2848- lword | Brightest | public.english_ispell: {bright}
2849- blank | |
2850- lword | supernovaes | pg_catalog.english_stem: {supernova}
2851- (5 rows)
2871+ alias | token | dictionary | lexemes
2872+ -------+-------------+----------------+-------------
2873+ lword | The | english_ispell | {}
2874+ blank | | |
2875+ lword | Brightest | english_ispell | {bright}
2876+ blank | | |
2877+ lword | supernovaes | english_stem | {supernova}
28522878</programlisting>
28532879 </para>
28542880