|
8 | 8 | *
|
9 | 9 | *
|
10 | 10 | * IDENTIFICATION
|
11 |
| - * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.167 2008/05/27 00:13:09 tgl Exp $ |
| 11 | + * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.168 2008/09/07 04:20:00 tgl Exp $ |
12 | 12 | *
|
13 | 13 | *-------------------------------------------------------------------------
|
14 | 14 | */
|
@@ -39,6 +39,9 @@ typedef struct
|
39 | 39 | pg_wchar*wstr2;/* note: these are palloc'd */
|
40 | 40 | intlen1;/* string lengths in logical characters */
|
41 | 41 | intlen2;
|
| 42 | +/* Skip table for Boyer-Moore-Horspool search algorithm: */ |
| 43 | +intskiptablemask;/* mask for ANDing with skiptable subscripts */ |
| 44 | +intskiptable[256];/* skip distance for given mismatched char */ |
42 | 45 | }TextPositionState;
|
43 | 46 |
|
44 | 47 | #defineDatumGetUnknownP(X)((unknown *) PG_DETOAST_DATUM(X))
|
@@ -753,7 +756,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
753 | 756 | * If we're working with an untoasted source, no need to do an extra
|
754 | 757 | * copying step.
|
755 | 758 | */
|
756 |
| -if (VARATT_IS_COMPRESSED(DatumGetPointer(str))|| |
| 759 | +if (VARATT_IS_COMPRESSED(DatumGetPointer(str))|| |
757 | 760 | VARATT_IS_EXTERNAL(DatumGetPointer(str)))
|
758 | 761 | slice=DatumGetTextPSlice(str,slice_start,slice_size);
|
759 | 762 | else
|
@@ -866,6 +869,7 @@ text_position(text *t1, text *t2)
|
866 | 869 | returnresult;
|
867 | 870 | }
|
868 | 871 |
|
| 872 | + |
869 | 873 | /*
|
870 | 874 | * text_position_setup, text_position_next, text_position_cleanup -
|
871 | 875 | *Component steps of text_position()
|
@@ -909,64 +913,215 @@ text_position_setup(text *t1, text *t2, TextPositionState *state)
|
909 | 913 | state->len1=len1;
|
910 | 914 | state->len2=len2;
|
911 | 915 | }
|
| 916 | + |
| 917 | +/* |
| 918 | + * Prepare the skip table for Boyer-Moore-Horspool searching. In these |
| 919 | + * notes we use the terminology that the "haystack" is the string to be |
| 920 | + * searched (t1) and the "needle" is the pattern being sought (t2). |
| 921 | + * |
| 922 | + * If the needle is empty or bigger than the haystack then there is no |
| 923 | + * point in wasting cycles initializing the table. We also choose not |
| 924 | + * to use B-M-H for needles of length 1, since the skip table can't |
| 925 | + * possibly save anything in that case. |
| 926 | + */ |
| 927 | +if (len1 >=len2&&len2>1) |
| 928 | +{ |
| 929 | +intsearchlength=len1-len2; |
| 930 | +intskiptablemask; |
| 931 | +intlast; |
| 932 | +inti; |
| 933 | + |
| 934 | +/* |
| 935 | + * First we must determine how much of the skip table to use. The |
| 936 | + * declaration of TextPositionState allows up to 256 elements, but for |
| 937 | + * short search problems we don't really want to have to initialize so |
| 938 | + * many elements --- it would take too long in comparison to the |
| 939 | + * actual search time. So we choose a useful skip table size based on |
| 940 | + * the haystack length minus the needle length. The closer the needle |
| 941 | + * length is to the haystack length the less useful skipping becomes. |
| 942 | + * |
| 943 | + * Note: since we use bit-masking to select table elements, the skip |
| 944 | + * table size MUST be a power of 2, and so the mask must be 2^N-1. |
| 945 | + */ |
| 946 | +if (searchlength<16) |
| 947 | +skiptablemask=3; |
| 948 | +elseif (searchlength<64) |
| 949 | +skiptablemask=7; |
| 950 | +elseif (searchlength<128) |
| 951 | +skiptablemask=15; |
| 952 | +elseif (searchlength<512) |
| 953 | +skiptablemask=31; |
| 954 | +elseif (searchlength<2048) |
| 955 | +skiptablemask=63; |
| 956 | +elseif (searchlength<4096) |
| 957 | +skiptablemask=127; |
| 958 | +else |
| 959 | +skiptablemask=255; |
| 960 | +state->skiptablemask=skiptablemask; |
| 961 | + |
| 962 | +/* |
| 963 | + * Initialize the skip table. We set all elements to the needle |
| 964 | + * length, since this is the correct skip distance for any character |
| 965 | + * not found in the needle. |
| 966 | + */ |
| 967 | +for (i=0;i <=skiptablemask;i++) |
| 968 | +state->skiptable[i]=len2; |
| 969 | + |
| 970 | +/* |
| 971 | + * Now examine the needle. For each character except the last one, |
| 972 | + * set the corresponding table element to the appropriate skip |
| 973 | + * distance. Note that when two characters share the same skip table |
| 974 | + * entry, the one later in the needle must determine the skip distance. |
| 975 | + */ |
| 976 | +last=len2-1; |
| 977 | + |
| 978 | +if (!state->use_wchar) |
| 979 | +{ |
| 980 | +constchar*str2=state->str2; |
| 981 | + |
| 982 | +for (i=0;i<last;i++) |
| 983 | +state->skiptable[(unsignedchar)str2[i]&skiptablemask]=last-i; |
| 984 | +} |
| 985 | +else |
| 986 | +{ |
| 987 | +constpg_wchar*wstr2=state->wstr2; |
| 988 | + |
| 989 | +for (i=0;i<last;i++) |
| 990 | +state->skiptable[wstr2[i]&skiptablemask]=last-i; |
| 991 | +} |
| 992 | +} |
912 | 993 | }
|
913 | 994 |
|
914 | 995 | staticint
|
915 | 996 | text_position_next(intstart_pos,TextPositionState*state)
|
916 | 997 | {
|
917 |
| -intpos=0, |
918 |
| -p, |
919 |
| -px; |
| 998 | +inthaystack_len=state->len1; |
| 999 | +intneedle_len=state->len2; |
| 1000 | +intskiptablemask=state->skiptablemask; |
920 | 1001 |
|
921 | 1002 | Assert(start_pos>0);/* else caller error */
|
922 | 1003 |
|
923 |
| -if (state->len2 <=0) |
| 1004 | +if (needle_len <=0) |
924 | 1005 | returnstart_pos;/* result for empty pattern */
|
925 | 1006 |
|
| 1007 | +start_pos--;/* adjust for zero based arrays */ |
| 1008 | + |
| 1009 | +/* Done if the needle can't possibly fit */ |
| 1010 | +if (haystack_len<start_pos+needle_len) |
| 1011 | +return0; |
| 1012 | + |
926 | 1013 | if (!state->use_wchar)
|
927 | 1014 | {
|
928 | 1015 | /* simple case - single byte encoding */
|
929 |
| -char*p1=state->str1; |
930 |
| -char*p2=state->str2; |
| 1016 | +constchar*haystack=state->str1; |
| 1017 | +constchar*needle=state->str2; |
| 1018 | +constchar*haystack_end=&haystack[haystack_len]; |
| 1019 | +constchar*hptr; |
931 | 1020 |
|
932 |
| -/* no use in searching str past point where search_str will fit */ |
933 |
| -px= (state->len1-state->len2); |
934 |
| - |
935 |
| -p1+=start_pos-1; |
| 1021 | +if (needle_len==1) |
| 1022 | +{ |
| 1023 | +/* No point in using B-M-H for a one-character needle */ |
| 1024 | +charnchar=*needle; |
936 | 1025 |
|
937 |
| -for (p=start_pos-1;p <=px;p++) |
| 1026 | +hptr=&haystack[start_pos]; |
| 1027 | +while (hptr<haystack_end) |
| 1028 | +{ |
| 1029 | +if (*hptr==nchar) |
| 1030 | +returnhptr-haystack+1; |
| 1031 | +hptr++; |
| 1032 | +} |
| 1033 | +} |
| 1034 | +else |
938 | 1035 | {
|
939 |
| -if ((*p1==*p2)&& (strncmp(p1,p2,state->len2)==0)) |
| 1036 | +constchar*needle_last=&needle[needle_len-1]; |
| 1037 | + |
| 1038 | +/* Start at startpos plus the length of the needle */ |
| 1039 | +hptr=&haystack[start_pos+needle_len-1]; |
| 1040 | +while (hptr<haystack_end) |
940 | 1041 | {
|
941 |
| -pos=p+1; |
942 |
| -break; |
| 1042 | +/* Match the needle scanning *backward* */ |
| 1043 | +constchar*nptr; |
| 1044 | +constchar*p; |
| 1045 | + |
| 1046 | +nptr=needle_last; |
| 1047 | +p=hptr; |
| 1048 | +while (*nptr==*p) |
| 1049 | +{ |
| 1050 | +/* Matched it all? If so, return 1-based position */ |
| 1051 | +if (nptr==needle) |
| 1052 | +returnp-haystack+1; |
| 1053 | +nptr--,p--; |
| 1054 | +} |
| 1055 | +/* |
| 1056 | + * No match, so use the haystack char at hptr to decide how |
| 1057 | + * far to advance. If the needle had any occurrence of that |
| 1058 | + * character (or more precisely, one sharing the same |
| 1059 | + * skiptable entry) before its last character, then we advance |
| 1060 | + * far enough to align the last such needle character with |
| 1061 | + * that haystack position. Otherwise we can advance by the |
| 1062 | + * whole needle length. |
| 1063 | + */ |
| 1064 | +hptr+=state->skiptable[(unsignedchar)*hptr&skiptablemask]; |
943 | 1065 | }
|
944 |
| -p1++; |
945 | 1066 | }
|
946 | 1067 | }
|
947 | 1068 | else
|
948 | 1069 | {
|
949 |
| -/* not as simple - multibyte encoding */ |
950 |
| -pg_wchar*p1=state->wstr1; |
951 |
| -pg_wchar*p2=state->wstr2; |
| 1070 | +/* The multibyte char version. This works exactly the same way. */ |
| 1071 | +constpg_wchar*haystack=state->wstr1; |
| 1072 | +constpg_wchar*needle=state->wstr2; |
| 1073 | +constpg_wchar*haystack_end=&haystack[haystack_len]; |
| 1074 | +constpg_wchar*hptr; |
952 | 1075 |
|
953 |
| -/* no use in searching str past point where search_str will fit */ |
954 |
| -px= (state->len1-state->len2); |
955 |
| - |
956 |
| -p1+=start_pos-1; |
| 1076 | +if (needle_len==1) |
| 1077 | +{ |
| 1078 | +/* No point in using B-M-H for a one-character needle */ |
| 1079 | +pg_wcharnchar=*needle; |
957 | 1080 |
|
958 |
| -for (p=start_pos-1;p <=px;p++) |
| 1081 | +hptr=&haystack[start_pos]; |
| 1082 | +while (hptr<haystack_end) |
| 1083 | +{ |
| 1084 | +if (*hptr==nchar) |
| 1085 | +returnhptr-haystack+1; |
| 1086 | +hptr++; |
| 1087 | +} |
| 1088 | +} |
| 1089 | +else |
959 | 1090 | {
|
960 |
| -if ((*p1==*p2)&& (pg_wchar_strncmp(p1,p2,state->len2)==0)) |
| 1091 | +constpg_wchar*needle_last=&needle[needle_len-1]; |
| 1092 | + |
| 1093 | +/* Start at startpos plus the length of the needle */ |
| 1094 | +hptr=&haystack[start_pos+needle_len-1]; |
| 1095 | +while (hptr<haystack_end) |
961 | 1096 | {
|
962 |
| -pos=p+1; |
963 |
| -break; |
| 1097 | +/* Match the needle scanning *backward* */ |
| 1098 | +constpg_wchar*nptr; |
| 1099 | +constpg_wchar*p; |
| 1100 | + |
| 1101 | +nptr=needle_last; |
| 1102 | +p=hptr; |
| 1103 | +while (*nptr==*p) |
| 1104 | +{ |
| 1105 | +/* Matched it all? If so, return 1-based position */ |
| 1106 | +if (nptr==needle) |
| 1107 | +returnp-haystack+1; |
| 1108 | +nptr--,p--; |
| 1109 | +} |
| 1110 | +/* |
| 1111 | + * No match, so use the haystack char at hptr to decide how |
| 1112 | + * far to advance. If the needle had any occurrence of that |
| 1113 | + * character (or more precisely, one sharing the same |
| 1114 | + * skiptable entry) before its last character, then we advance |
| 1115 | + * far enough to align the last such needle character with |
| 1116 | + * that haystack position. Otherwise we can advance by the |
| 1117 | + * whole needle length. |
| 1118 | + */ |
| 1119 | +hptr+=state->skiptable[*hptr&skiptablemask]; |
964 | 1120 | }
|
965 |
| -p1++; |
966 | 1121 | }
|
967 | 1122 | }
|
968 | 1123 |
|
969 |
| -returnpos; |
| 1124 | +return0;/* not found */ |
970 | 1125 | }
|
971 | 1126 |
|
972 | 1127 | staticvoid
|
|