77 *
88 *
99 * IDENTIFICATION
10- * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.10 2007/11/15 22:25:16 momjian Exp $
10+ * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
5050#define DECIMAL 20
5151#define SIGNEDINT 21
5252#define UNSIGNEDINT 22
53- #define HTMLENTITY 23
53+ #define XMLENTITY 23
5454
5555#define LASTNUM 23
5656
@@ -95,7 +95,7 @@ static const char *const lex_descr[] = {
9595"Hyphenated word part, all letters" ,
9696"Hyphenated word part, all ASCII" ,
9797"Space symbols" ,
98- "HTML tag" ,
98+ "XML tag" ,
9999"Protocol head" ,
100100"Hyphenated word, letters and digits" ,
101101"Hyphenated word, all ASCII" ,
@@ -105,7 +105,7 @@ static const char *const lex_descr[] = {
105105"Decimal notation" ,
106106"Signed integer" ,
107107"Unsigned integer" ,
108- "HTML entity"
108+ "XML entity"
109109};
110110
111111
@@ -132,11 +132,13 @@ typedef enum
132132TPS_InMantissaFirst ,
133133TPS_InMantissaSign ,
134134TPS_InMantissa ,
135- TPS_InHTMLEntityFirst ,
136- TPS_InHTMLEntity ,
137- TPS_InHTMLEntityNumFirst ,
138- TPS_InHTMLEntityNum ,
139- TPS_InHTMLEntityEnd ,
135+ TPS_InXMLEntityFirst ,
136+ TPS_InXMLEntity ,
137+ TPS_InXMLEntityNumFirst ,
138+ TPS_InXMLEntityNum ,
139+ TPS_InXMLEntityHexNumFirst ,
140+ TPS_InXMLEntityHexNum ,
141+ TPS_InXMLEntityEnd ,
140142TPS_InTagFirst ,
141143TPS_InXMLBegin ,
142144TPS_InTagCloseFirst ,
@@ -653,7 +655,7 @@ static const TParserStateActionItem actionTPS_Base[] = {
653655{p_isdigit ,0 ,A_NEXT ,TPS_InUnsignedInt ,0 ,NULL },
654656{p_iseqC ,'-' ,A_PUSH ,TPS_InSignedIntFirst ,0 ,NULL },
655657{p_iseqC ,'+' ,A_PUSH ,TPS_InSignedIntFirst ,0 ,NULL },
656- {p_iseqC ,'&' ,A_PUSH ,TPS_InHTMLEntityFirst ,0 ,NULL },
658+ {p_iseqC ,'&' ,A_PUSH ,TPS_InXMLEntityFirst ,0 ,NULL },
657659{p_iseqC ,'~' ,A_PUSH ,TPS_InFileTwiddle ,0 ,NULL },
658660{p_iseqC ,'/' ,A_PUSH ,TPS_InFileFirst ,0 ,NULL },
659661{p_iseqC ,'.' ,A_PUSH ,TPS_InPathFirstFirst ,0 ,NULL },
@@ -811,35 +813,56 @@ static const TParserStateActionItem actionTPS_InMantissa[] = {
811813{NULL ,0 ,A_BINGO ,TPS_Base ,SCIENTIFIC ,NULL }
812814};
813815
814- static const TParserStateActionItem actionTPS_InHTMLEntityFirst []= {
816+ static const TParserStateActionItem actionTPS_InXMLEntityFirst []= {
815817{p_isEOF ,0 ,A_POP ,TPS_Null ,0 ,NULL },
816- {p_iseqC ,'#' ,A_NEXT ,TPS_InHTMLEntityNumFirst ,0 ,NULL },
817- {p_isasclet ,0 ,A_NEXT ,TPS_InHTMLEntity ,0 ,NULL },
818+ {p_iseqC ,'#' ,A_NEXT ,TPS_InXMLEntityNumFirst ,0 ,NULL },
819+ {p_isasclet ,0 ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
820+ {p_iseqC ,':' ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
821+ {p_iseqC ,'_' ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
818822{NULL ,0 ,A_POP ,TPS_Null ,0 ,NULL }
819823};
820824
821- static const TParserStateActionItem actionTPS_InHTMLEntity []= {
825+ static const TParserStateActionItem actionTPS_InXMLEntity []= {
822826{p_isEOF ,0 ,A_POP ,TPS_Null ,0 ,NULL },
823- {p_isasclet ,0 ,A_NEXT ,TPS_InHTMLEntity ,0 ,NULL },
824- {p_iseqC ,';' ,A_NEXT ,TPS_InHTMLEntityEnd ,0 ,NULL },
827+ {p_isalnum ,0 ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
828+ {p_iseqC ,':' ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
829+ {p_iseqC ,'_' ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
830+ {p_iseqC ,':' ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
831+ {p_iseqC ,'.' ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
832+ {p_iseqC ,'-' ,A_NEXT ,TPS_InXMLEntity ,0 ,NULL },
833+ {p_iseqC ,';' ,A_NEXT ,TPS_InXMLEntityEnd ,0 ,NULL },
825834{NULL ,0 ,A_POP ,TPS_Null ,0 ,NULL }
826835};
827836
828- static const TParserStateActionItem actionTPS_InHTMLEntityNumFirst []= {
837+ static const TParserStateActionItem actionTPS_InXMLEntityNumFirst []= {
829838{p_isEOF ,0 ,A_POP ,TPS_Null ,0 ,NULL },
830- {p_isdigit ,0 ,A_NEXT ,TPS_InHTMLEntityNum ,0 ,NULL },
839+ {p_iseqC ,'x' ,A_NEXT ,TPS_InXMLEntityHexNumFirst ,0 ,NULL },
840+ {p_isdigit ,0 ,A_NEXT ,TPS_InXMLEntityNum ,0 ,NULL },
831841{NULL ,0 ,A_POP ,TPS_Null ,0 ,NULL }
832842};
833843
834- static const TParserStateActionItem actionTPS_InHTMLEntityNum []= {
844+ static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst []= {
835845{p_isEOF ,0 ,A_POP ,TPS_Null ,0 ,NULL },
836- {p_isdigit ,0 ,A_NEXT ,TPS_InHTMLEntityNum ,0 ,NULL },
837- {p_iseqC ,';' ,A_NEXT ,TPS_InHTMLEntityEnd ,0 ,NULL },
846+ {p_isxdigit ,0 ,A_NEXT ,TPS_InXMLEntityHexNum ,0 ,NULL },
838847{NULL ,0 ,A_POP ,TPS_Null ,0 ,NULL }
839848};
840849
841- static const TParserStateActionItem actionTPS_InHTMLEntityEnd []= {
842- {NULL ,0 ,A_BINGO |A_CLEAR ,TPS_Base ,HTMLENTITY ,NULL }
850+ static const TParserStateActionItem actionTPS_InXMLEntityNum []= {
851+ {p_isEOF ,0 ,A_POP ,TPS_Null ,0 ,NULL },
852+ {p_isdigit ,0 ,A_NEXT ,TPS_InXMLEntityNum ,0 ,NULL },
853+ {p_iseqC ,';' ,A_NEXT ,TPS_InXMLEntityEnd ,0 ,NULL },
854+ {NULL ,0 ,A_POP ,TPS_Null ,0 ,NULL }
855+ };
856+
857+ static const TParserStateActionItem actionTPS_InXMLEntityHexNum []= {
858+ {p_isEOF ,0 ,A_POP ,TPS_Null ,0 ,NULL },
859+ {p_isxdigit ,0 ,A_NEXT ,TPS_InXMLEntityHexNum ,0 ,NULL },
860+ {p_iseqC ,';' ,A_NEXT ,TPS_InXMLEntityEnd ,0 ,NULL },
861+ {NULL ,0 ,A_POP ,TPS_Null ,0 ,NULL }
862+ };
863+
864+ static const TParserStateActionItem actionTPS_InXMLEntityEnd []= {
865+ {NULL ,0 ,A_BINGO |A_CLEAR ,TPS_Base ,XMLENTITY ,NULL }
843866};
844867
845868static const TParserStateActionItem actionTPS_InTagFirst []= {
@@ -854,8 +877,8 @@ static const TParserStateActionItem actionTPS_InTagFirst[] = {
854877static const TParserStateActionItem actionTPS_InXMLBegin []= {
855878{p_isEOF ,0 ,A_POP ,TPS_Null ,0 ,NULL },
856879/* <?xml ... */
880+ /* XXX do we want states for the m and l? Right now this accepts <?xZ */
857881{p_iseqC ,'x' ,A_NEXT ,TPS_InTag ,0 ,NULL },
858- {p_iseqC ,'X' ,A_NEXT ,TPS_InTag ,0 ,NULL },
859882{NULL ,0 ,A_POP ,TPS_Null ,0 ,NULL }
860883};
861884
@@ -1278,11 +1301,13 @@ static const TParserStateAction Actions[] = {
12781301TPARSERSTATEACTION (TPS_InMantissaFirst ),
12791302TPARSERSTATEACTION (TPS_InMantissaSign ),
12801303TPARSERSTATEACTION (TPS_InMantissa ),
1281- TPARSERSTATEACTION (TPS_InHTMLEntityFirst ),
1282- TPARSERSTATEACTION (TPS_InHTMLEntity ),
1283- TPARSERSTATEACTION (TPS_InHTMLEntityNumFirst ),
1284- TPARSERSTATEACTION (TPS_InHTMLEntityNum ),
1285- TPARSERSTATEACTION (TPS_InHTMLEntityEnd ),
1304+ TPARSERSTATEACTION (TPS_InXMLEntityFirst ),
1305+ TPARSERSTATEACTION (TPS_InXMLEntity ),
1306+ TPARSERSTATEACTION (TPS_InXMLEntityNumFirst ),
1307+ TPARSERSTATEACTION (TPS_InXMLEntityNum ),
1308+ TPARSERSTATEACTION (TPS_InXMLEntityHexNumFirst ),
1309+ TPARSERSTATEACTION (TPS_InXMLEntityHexNum ),
1310+ TPARSERSTATEACTION (TPS_InXMLEntityEnd ),
12861311TPARSERSTATEACTION (TPS_InTagFirst ),
12871312TPARSERSTATEACTION (TPS_InXMLBegin ),
12881313TPARSERSTATEACTION (TPS_InTagCloseFirst ),
@@ -1556,9 +1581,9 @@ prsd_end(PG_FUNCTION_ARGS)
15561581#define COMPLEXTOKEN (x ) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
15571582#define ENDPUNCTOKEN (x ) ( (x)==SPACE )
15581583
1559- #define TS_IDIGNORE (x ) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
1584+ #define TS_IDIGNORE (x ) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
15601585#define HLIDIGNORE (x ) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1561- #define HTMLHLIDIGNORE (x ) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
1586+ #define XMLHLIDIGNORE (x ) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
15621587#define NONWORDTOKEN (x ) ( (x)==SPACE || HLIDIGNORE(x) )
15631588#define NOENDTOKEN (x )( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
15641589
@@ -1839,7 +1864,7 @@ prsd_headline(PG_FUNCTION_ARGS)
18391864}
18401865else
18411866{
1842- if (HTMLHLIDIGNORE (prs -> words [i ].type ))
1867+ if (XMLHLIDIGNORE (prs -> words [i ].type ))
18431868prs -> words [i ].replace = 1 ;
18441869}
18451870