Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commite6a310b

Browse files
committed
Reimplement text_position and related functions to use Boyer-Moore-Horspool
searching instead of naive matching. In the worst case this has the sameO(M*N) complexity as the naive method, but the worst case is hard to hit,and the average case is very fast, especially with longer patterns.David Rowley
1 parent2cf3f66 commite6a310b

File tree

1 file changed

+185
-30
lines changed

1 file changed

+185
-30
lines changed

‎src/backend/utils/adt/varlena.c

Lines changed: 185 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.167 2008/05/27 00:13:09 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.168 2008/09/07 04:20:00 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -39,6 +39,9 @@ typedef struct
3939
pg_wchar*wstr2;/* note: these are palloc'd */
4040
intlen1;/* string lengths in logical characters */
4141
intlen2;
42+
/* Skip table for Boyer-Moore-Horspool search algorithm: */
43+
intskiptablemask;/* mask for ANDing with skiptable subscripts */
44+
intskiptable[256];/* skip distance for given mismatched char */
4245
}TextPositionState;
4346

4447
#defineDatumGetUnknownP(X)((unknown *) PG_DETOAST_DATUM(X))
@@ -753,7 +756,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
753756
* If we're working with an untoasted source, no need to do an extra
754757
* copying step.
755758
*/
756-
if (VARATT_IS_COMPRESSED(DatumGetPointer(str))||
759+
if (VARATT_IS_COMPRESSED(DatumGetPointer(str))||
757760
VARATT_IS_EXTERNAL(DatumGetPointer(str)))
758761
slice=DatumGetTextPSlice(str,slice_start,slice_size);
759762
else
@@ -866,6 +869,7 @@ text_position(text *t1, text *t2)
866869
returnresult;
867870
}
868871

872+
869873
/*
870874
* text_position_setup, text_position_next, text_position_cleanup -
871875
*Component steps of text_position()
@@ -909,64 +913,215 @@ text_position_setup(text *t1, text *t2, TextPositionState *state)
909913
state->len1=len1;
910914
state->len2=len2;
911915
}
916+
917+
/*
918+
* Prepare the skip table for Boyer-Moore-Horspool searching. In these
919+
* notes we use the terminology that the "haystack" is the string to be
920+
* searched (t1) and the "needle" is the pattern being sought (t2).
921+
*
922+
* If the needle is empty or bigger than the haystack then there is no
923+
* point in wasting cycles initializing the table. We also choose not
924+
* to use B-M-H for needles of length 1, since the skip table can't
925+
* possibly save anything in that case.
926+
*/
927+
if (len1 >=len2&&len2>1)
928+
{
929+
intsearchlength=len1-len2;
930+
intskiptablemask;
931+
intlast;
932+
inti;
933+
934+
/*
935+
* First we must determine how much of the skip table to use. The
936+
* declaration of TextPositionState allows up to 256 elements, but for
937+
* short search problems we don't really want to have to initialize so
938+
* many elements --- it would take too long in comparison to the
939+
* actual search time. So we choose a useful skip table size based on
940+
* the haystack length minus the needle length. The closer the needle
941+
* length is to the haystack length the less useful skipping becomes.
942+
*
943+
* Note: since we use bit-masking to select table elements, the skip
944+
* table size MUST be a power of 2, and so the mask must be 2^N-1.
945+
*/
946+
if (searchlength<16)
947+
skiptablemask=3;
948+
elseif (searchlength<64)
949+
skiptablemask=7;
950+
elseif (searchlength<128)
951+
skiptablemask=15;
952+
elseif (searchlength<512)
953+
skiptablemask=31;
954+
elseif (searchlength<2048)
955+
skiptablemask=63;
956+
elseif (searchlength<4096)
957+
skiptablemask=127;
958+
else
959+
skiptablemask=255;
960+
state->skiptablemask=skiptablemask;
961+
962+
/*
963+
* Initialize the skip table. We set all elements to the needle
964+
* length, since this is the correct skip distance for any character
965+
* not found in the needle.
966+
*/
967+
for (i=0;i <=skiptablemask;i++)
968+
state->skiptable[i]=len2;
969+
970+
/*
971+
* Now examine the needle. For each character except the last one,
972+
* set the corresponding table element to the appropriate skip
973+
* distance. Note that when two characters share the same skip table
974+
* entry, the one later in the needle must determine the skip distance.
975+
*/
976+
last=len2-1;
977+
978+
if (!state->use_wchar)
979+
{
980+
constchar*str2=state->str2;
981+
982+
for (i=0;i<last;i++)
983+
state->skiptable[(unsignedchar)str2[i]&skiptablemask]=last-i;
984+
}
985+
else
986+
{
987+
constpg_wchar*wstr2=state->wstr2;
988+
989+
for (i=0;i<last;i++)
990+
state->skiptable[wstr2[i]&skiptablemask]=last-i;
991+
}
992+
}
912993
}
913994

914995
staticint
915996
text_position_next(intstart_pos,TextPositionState*state)
916997
{
917-
intpos=0,
918-
p,
919-
px;
998+
inthaystack_len=state->len1;
999+
intneedle_len=state->len2;
1000+
intskiptablemask=state->skiptablemask;
9201001

9211002
Assert(start_pos>0);/* else caller error */
9221003

923-
if (state->len2 <=0)
1004+
if (needle_len <=0)
9241005
returnstart_pos;/* result for empty pattern */
9251006

1007+
start_pos--;/* adjust for zero based arrays */
1008+
1009+
/* Done if the needle can't possibly fit */
1010+
if (haystack_len<start_pos+needle_len)
1011+
return0;
1012+
9261013
if (!state->use_wchar)
9271014
{
9281015
/* simple case - single byte encoding */
929-
char*p1=state->str1;
930-
char*p2=state->str2;
1016+
constchar*haystack=state->str1;
1017+
constchar*needle=state->str2;
1018+
constchar*haystack_end=&haystack[haystack_len];
1019+
constchar*hptr;
9311020

932-
/* no use in searching str past point where search_str will fit */
933-
px= (state->len1-state->len2);
934-
935-
p1+=start_pos-1;
1021+
if (needle_len==1)
1022+
{
1023+
/* No point in using B-M-H for a one-character needle */
1024+
charnchar=*needle;
9361025

937-
for (p=start_pos-1;p <=px;p++)
1026+
hptr=&haystack[start_pos];
1027+
while (hptr<haystack_end)
1028+
{
1029+
if (*hptr==nchar)
1030+
returnhptr-haystack+1;
1031+
hptr++;
1032+
}
1033+
}
1034+
else
9381035
{
939-
if ((*p1==*p2)&& (strncmp(p1,p2,state->len2)==0))
1036+
constchar*needle_last=&needle[needle_len-1];
1037+
1038+
/* Start at startpos plus the length of the needle */
1039+
hptr=&haystack[start_pos+needle_len-1];
1040+
while (hptr<haystack_end)
9401041
{
941-
pos=p+1;
942-
break;
1042+
/* Match the needle scanning *backward* */
1043+
constchar*nptr;
1044+
constchar*p;
1045+
1046+
nptr=needle_last;
1047+
p=hptr;
1048+
while (*nptr==*p)
1049+
{
1050+
/* Matched it all? If so, return 1-based position */
1051+
if (nptr==needle)
1052+
returnp-haystack+1;
1053+
nptr--,p--;
1054+
}
1055+
/*
1056+
* No match, so use the haystack char at hptr to decide how
1057+
* far to advance. If the needle had any occurrence of that
1058+
* character (or more precisely, one sharing the same
1059+
* skiptable entry) before its last character, then we advance
1060+
* far enough to align the last such needle character with
1061+
* that haystack position. Otherwise we can advance by the
1062+
* whole needle length.
1063+
*/
1064+
hptr+=state->skiptable[(unsignedchar)*hptr&skiptablemask];
9431065
}
944-
p1++;
9451066
}
9461067
}
9471068
else
9481069
{
949-
/* not as simple - multibyte encoding */
950-
pg_wchar*p1=state->wstr1;
951-
pg_wchar*p2=state->wstr2;
1070+
/* The multibyte char version. This works exactly the same way. */
1071+
constpg_wchar*haystack=state->wstr1;
1072+
constpg_wchar*needle=state->wstr2;
1073+
constpg_wchar*haystack_end=&haystack[haystack_len];
1074+
constpg_wchar*hptr;
9521075

953-
/* no use in searching str past point where search_str will fit */
954-
px= (state->len1-state->len2);
955-
956-
p1+=start_pos-1;
1076+
if (needle_len==1)
1077+
{
1078+
/* No point in using B-M-H for a one-character needle */
1079+
pg_wcharnchar=*needle;
9571080

958-
for (p=start_pos-1;p <=px;p++)
1081+
hptr=&haystack[start_pos];
1082+
while (hptr<haystack_end)
1083+
{
1084+
if (*hptr==nchar)
1085+
returnhptr-haystack+1;
1086+
hptr++;
1087+
}
1088+
}
1089+
else
9591090
{
960-
if ((*p1==*p2)&& (pg_wchar_strncmp(p1,p2,state->len2)==0))
1091+
constpg_wchar*needle_last=&needle[needle_len-1];
1092+
1093+
/* Start at startpos plus the length of the needle */
1094+
hptr=&haystack[start_pos+needle_len-1];
1095+
while (hptr<haystack_end)
9611096
{
962-
pos=p+1;
963-
break;
1097+
/* Match the needle scanning *backward* */
1098+
constpg_wchar*nptr;
1099+
constpg_wchar*p;
1100+
1101+
nptr=needle_last;
1102+
p=hptr;
1103+
while (*nptr==*p)
1104+
{
1105+
/* Matched it all? If so, return 1-based position */
1106+
if (nptr==needle)
1107+
returnp-haystack+1;
1108+
nptr--,p--;
1109+
}
1110+
/*
1111+
* No match, so use the haystack char at hptr to decide how
1112+
* far to advance. If the needle had any occurrence of that
1113+
* character (or more precisely, one sharing the same
1114+
* skiptable entry) before its last character, then we advance
1115+
* far enough to align the last such needle character with
1116+
* that haystack position. Otherwise we can advance by the
1117+
* whole needle length.
1118+
*/
1119+
hptr+=state->skiptable[*hptr&skiptablemask];
9641120
}
965-
p1++;
9661121
}
9671122
}
9681123

969-
returnpos;
1124+
return0;/* not found */
9701125
}
9711126

9721127
staticvoid

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp