Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b464e51

Browse files
committed
Update to latest Snowball sources.
It's been some time since we did this, partly because the upstream snowball project hasn't formally tagged a new release since 2021. The main motivation for doing it now is to absorb a bug fix (their commit e322673a841d9abd69994ae8cd20e191090b6ef4), which prevents a null pointer dereference crash if SN_create_env() gets a malloc failure at just the wrong point. We'll patch the back branches with only that change, but we might as well do the full sync dance on HEAD.
Aside from a bunch of mostly-minor tweaks to existing stemmers, this update adds a new stemmer for Estonian. It also removes the existing stemmer for Romanian using ISO-8859-2 encoding. Upstream apparently concluded that ISO-8859-2 doesn't provide an adequate representation of some Romanian characters, and the UTF-8 implementation should be used instead.
While at it, update the README's instructions for doing a sync, which have not been adjusted during the addition of meson tooling.
Thanks to Maksim Korotkov for discovering the null-pointer bug and submitting the fix to upstream snowball.
Reported-by: Maksim Korotkov <m.korotkov@postgrespro.ru>
Discussion: https://postgr.es/m/1d1a46-67ab1000-21-80c451@83151435
1 parent 71d02dc commit b464e51

File tree

61 files changed

+4970
-4578
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+4970
-4578
lines changed

‎doc/src/sgml/textsearch.sgml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3852,6 +3852,7 @@ Parser: "pg_catalog.default"
38523852
pg_catalog | danish_stem | snowball stemmer for danish language
38533853
pg_catalog | dutch_stem | snowball stemmer for dutch language
38543854
pg_catalog | english_stem | snowball stemmer for english language
3855+
pg_catalog | estonian_stem | snowball stemmer for estonian language
38553856
pg_catalog | finnish_stem | snowball stemmer for finnish language
38563857
pg_catalog | french_stem | snowball stemmer for french language
38573858
pg_catalog | german_stem | snowball stemmer for german language

‎src/backend/snowball/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ OBJS += \
4040
stem_ISO_8859_1_spanish.o\
4141
stem_ISO_8859_1_swedish.o\
4242
stem_ISO_8859_2_hungarian.o\
43-
stem_ISO_8859_2_romanian.o\
4443
stem_KOI8_R_russian.o\
4544
stem_UTF_8_arabic.o\
4645
stem_UTF_8_armenian.o\
@@ -49,6 +48,7 @@ OBJS += \
4948
stem_UTF_8_danish.o\
5049
stem_UTF_8_dutch.o\
5150
stem_UTF_8_english.o\
51+
stem_UTF_8_estonian.o\
5252
stem_UTF_8_finnish.o\
5353
stem_UTF_8_french.o\
5454
stem_UTF_8_german.o\

‎src/backend/snowball/README

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ We choose to include the derived files in the PostgreSQL distribution
2929
because most installations will not have the Snowball compiler available.
3030

3131
We are currently synced with the Snowball git commit
32-
48a67a2831005f49c48ec29a5837640e23e54e6b (tag v2.2.0)
33-
of 2021-11-10.
32+
d19326ac6c1b9a417fc872f7c2f845265a5e9ece
33+
of 2025-02-19.
3434

3535
To update the PostgreSQL sources from a new Snowball version:
3636

@@ -44,8 +44,8 @@ do
4444
sed 's|\.\./runtime/header\.h|header.h|' $f >libstemmer/`basename $f`
4545
done
4646

47-
Do not copy stemmers that are listed in libstemmer/modules.txt as
48-
nonstandard, such as "german2" or "lovins".
47+
Do not copy stemmers that are listed in their libstemmer/modules.txt as
48+
nonstandard, such as "kraaij_pohlmann" or "lovins".
4949

5050
2. Copy the *.c files in snowball/runtime/ to
5151
src/backend/snowball/libstemmer, and edit them to remove direct inclusions
@@ -55,14 +55,18 @@ is sensitive to largefile compilation options.)
5555

5656
3. Copy the *.h files in snowball/src_c/ and snowball/runtime/
5757
to src/include/snowball/libstemmer. At this writing the header files
58-
do not require any changes.
58+
do not require any changes. Again, omit the *.h files for nonstandard
59+
stemmers.
5960

6061
4. Check whether any stemmer modules have been added or removed. If so, edit
61-
the OBJS list in Makefile, the list of #include's in dict_snowball.c, and the
62-
stemmer_modules[] table in dict_snowball.c, as well as the list in the
63-
documentation in textsearch.sgml. You might also need to change
64-
the LANGUAGES list in Makefile and tsearch_config_languages in initdb.c.
62+
the OBJS list in Makefile, the dict_snowball_sources list in meson.build,
63+
the list of #include's and the stemmer_modules[] table in dict_snowball.c,
64+
and the sample \dFd output in the documentation in textsearch.sgml.
65+
You might also need to change the @languages array in snowball_create.pl
66+
and the tsearch_config_languages[] table in initdb.c.
6567

6668
5. The various stopword files in stopwords/ must be downloaded
6769
individually from pages on the snowballstem.org website.
6870
Be careful that these files must be stored in UTF-8 encoding.
71+
Update the stop_files list in Makefile if any are added or removed
72+
(the meson tooling does not require adjustment for that, though).

‎src/backend/snowball/dict_snowball.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
#include"snowball/libstemmer/stem_ISO_8859_1_spanish.h"
4646
#include"snowball/libstemmer/stem_ISO_8859_1_swedish.h"
4747
#include"snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
48-
#include"snowball/libstemmer/stem_ISO_8859_2_romanian.h"
4948
#include"snowball/libstemmer/stem_KOI8_R_russian.h"
5049
#include"snowball/libstemmer/stem_UTF_8_arabic.h"
5150
#include"snowball/libstemmer/stem_UTF_8_armenian.h"
@@ -54,6 +53,7 @@
5453
#include"snowball/libstemmer/stem_UTF_8_danish.h"
5554
#include"snowball/libstemmer/stem_UTF_8_dutch.h"
5655
#include"snowball/libstemmer/stem_UTF_8_english.h"
56+
#include"snowball/libstemmer/stem_UTF_8_estonian.h"
5757
#include"snowball/libstemmer/stem_UTF_8_finnish.h"
5858
#include"snowball/libstemmer/stem_UTF_8_french.h"
5959
#include"snowball/libstemmer/stem_UTF_8_german.h"
@@ -119,7 +119,6 @@ static const stemmer_module stemmer_modules[] =
119119
STEMMER_MODULE(spanish,PG_LATIN1,ISO_8859_1),
120120
STEMMER_MODULE(swedish,PG_LATIN1,ISO_8859_1),
121121
STEMMER_MODULE(hungarian,PG_LATIN2,ISO_8859_2),
122-
STEMMER_MODULE(romanian,PG_LATIN2,ISO_8859_2),
123122
STEMMER_MODULE(russian,PG_KOI8R,KOI8_R),
124123
STEMMER_MODULE(arabic,PG_UTF8,UTF_8),
125124
STEMMER_MODULE(armenian,PG_UTF8,UTF_8),
@@ -128,6 +127,7 @@ static const stemmer_module stemmer_modules[] =
128127
STEMMER_MODULE(danish,PG_UTF8,UTF_8),
129128
STEMMER_MODULE(dutch,PG_UTF8,UTF_8),
130129
STEMMER_MODULE(english,PG_UTF8,UTF_8),
130+
STEMMER_MODULE(estonian,PG_UTF8,UTF_8),
131131
STEMMER_MODULE(finnish,PG_UTF8,UTF_8),
132132
STEMMER_MODULE(french,PG_UTF8,UTF_8),
133133
STEMMER_MODULE(german,PG_UTF8,UTF_8),

‎src/backend/snowball/libstemmer/api.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ extern struct SN_env * SN_create_env(int S_size, int I_size)
3434
externvoidSN_close_env(structSN_env*z,intS_size)
3535
{
3636
if (z==NULL)return;
37-
if (S_size)
37+
if (z->S)
3838
{
3939
inti;
4040
for (i=0;i<S_size;i++)

‎src/backend/snowball/libstemmer/stem_ISO_8859_1_basque.c

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -912,7 +912,8 @@ static int r_mark_regions(struct SN_env * z) {
912912
if (in_grouping(z,g_v,97,117,0)) gotolab2;
913913
{intc3=z->c;
914914
if (out_grouping(z,g_v,97,117,0)) gotolab4;
915-
{
915+
916+
{
916917
intret=out_grouping(z,g_v,97,117,1);
917918
if (ret<0) gotolab4;
918919
z->c+=ret;
@@ -921,7 +922,8 @@ static int r_mark_regions(struct SN_env * z) {
921922
lab4:
922923
z->c=c3;
923924
if (in_grouping(z,g_v,97,117,0)) gotolab2;
924-
{
925+
926+
{
925927
intret=in_grouping(z,g_v,97,117,1);
926928
if (ret<0) gotolab2;
927929
z->c+=ret;
@@ -934,7 +936,8 @@ static int r_mark_regions(struct SN_env * z) {
934936
if (out_grouping(z,g_v,97,117,0)) gotolab0;
935937
{intc4=z->c;
936938
if (out_grouping(z,g_v,97,117,0)) gotolab6;
937-
{
939+
940+
{
938941
intret=out_grouping(z,g_v,97,117,1);
939942
if (ret<0) gotolab6;
940943
z->c+=ret;
@@ -955,23 +958,27 @@ static int r_mark_regions(struct SN_env * z) {
955958
z->c=c1;
956959
}
957960
{intc5=z->c;
958-
{
961+
962+
{
959963
intret=out_grouping(z,g_v,97,117,1);
960964
if (ret<0) gotolab7;
961965
z->c+=ret;
962966
}
963-
{
967+
968+
{
964969
intret=in_grouping(z,g_v,97,117,1);
965970
if (ret<0) gotolab7;
966971
z->c+=ret;
967972
}
968973
z->I[1]=z->c;
969-
{
974+
975+
{
970976
intret=out_grouping(z,g_v,97,117,1);
971977
if (ret<0) gotolab7;
972978
z->c+=ret;
973979
}
974-
{
980+
981+
{
975982
intret=in_grouping(z,g_v,97,117,1);
976983
if (ret<0) gotolab7;
977984
z->c+=ret;
@@ -984,26 +991,23 @@ static int r_mark_regions(struct SN_env * z) {
984991
}
985992

986993
staticintr_RV(structSN_env*z) {
987-
if (!(z->I[2] <=z->c))return0;
988-
return1;
994+
returnz->I[2] <=z->c;
989995
}
990996

991997
staticintr_R2(structSN_env*z) {
992-
if (!(z->I[0] <=z->c))return0;
993-
return1;
998+
returnz->I[0] <=z->c;
994999
}
9951000

9961001
staticintr_R1(structSN_env*z) {
997-
if (!(z->I[1] <=z->c))return0;
998-
return1;
1002+
returnz->I[1] <=z->c;
9991003
}
10001004

10011005
staticintr_aditzak(structSN_env*z) {
10021006
intamong_var;
10031007
z->ket=z->c;
10041008
if (z->c-1 <=z->lb||z->p[z->c-1] >>5!=3|| !((70566434 >> (z->p[z->c-1]&0x1f))&1))return0;
10051009
among_var=find_among_b(z,a_0,109);
1006-
if (!(among_var))return0;
1010+
if (!among_var)return0;
10071011
z->bra=z->c;
10081012
switch (among_var) {
10091013
case1:
@@ -1046,7 +1050,7 @@ static int r_izenak(struct SN_env * z) {
10461050
z->ket=z->c;
10471051
if (z->c <=z->lb||z->p[z->c-1] >>5!=3|| !((71162402 >> (z->p[z->c-1]&0x1f))&1))return0;
10481052
among_var=find_among_b(z,a_1,295);
1049-
if (!(among_var))return0;
1053+
if (!among_var)return0;
10501054
z->bra=z->c;
10511055
switch (among_var) {
10521056
case1:
@@ -1117,7 +1121,7 @@ static int r_adjetiboak(struct SN_env * z) {
11171121
z->ket=z->c;
11181122
if (z->c-1 <=z->lb||z->p[z->c-1] >>5!=3|| !((35362 >> (z->p[z->c-1]&0x1f))&1))return0;
11191123
among_var=find_among_b(z,a_2,19);
1120-
if (!(among_var))return0;
1124+
if (!among_var)return0;
11211125
z->bra=z->c;
11221126
switch (among_var) {
11231127
case1:
@@ -1138,7 +1142,7 @@ static int r_adjetiboak(struct SN_env * z) {
11381142
}
11391143

11401144
externintbasque_ISO_8859_1_stem(structSN_env*z) {
1141-
1145+
11421146
{intret=r_mark_regions(z);
11431147
if (ret<0)returnret;
11441148
}

‎src/backend/snowball/libstemmer/stem_ISO_8859_1_catalan.c

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,23 +1184,27 @@ static int r_mark_regions(struct SN_env * z) {
11841184
z->I[1]=z->l;
11851185
z->I[0]=z->l;
11861186
{intc1=z->c;
1187-
{
1187+
1188+
{
11881189
intret=out_grouping(z,g_v,97,252,1);
11891190
if (ret<0) gotolab0;
11901191
z->c+=ret;
11911192
}
1192-
{
1193+
1194+
{
11931195
intret=in_grouping(z,g_v,97,252,1);
11941196
if (ret<0) gotolab0;
11951197
z->c+=ret;
11961198
}
11971199
z->I[1]=z->c;
1198-
{
1200+
1201+
{
11991202
intret=out_grouping(z,g_v,97,252,1);
12001203
if (ret<0) gotolab0;
12011204
z->c+=ret;
12021205
}
1203-
{
1206+
1207+
{
12041208
intret=in_grouping(z,g_v,97,252,1);
12051209
if (ret<0) gotolab0;
12061210
z->c+=ret;
@@ -1218,7 +1222,6 @@ static int r_cleaning(struct SN_env * z) {
12181222
intc1=z->c;
12191223
z->bra=z->c;
12201224
among_var=find_among(z,a_0,13);
1221-
if (!(among_var)) gotolab0;
12221225
z->ket=z->c;
12231226
switch (among_var) {
12241227
case1:
@@ -1265,19 +1268,17 @@ static int r_cleaning(struct SN_env * z) {
12651268
}
12661269

12671270
staticintr_R1(structSN_env*z) {
1268-
if (!(z->I[1] <=z->c))return0;
1269-
return1;
1271+
returnz->I[1] <=z->c;
12701272
}
12711273

12721274
staticintr_R2(structSN_env*z) {
1273-
if (!(z->I[0] <=z->c))return0;
1274-
return1;
1275+
returnz->I[0] <=z->c;
12751276
}
12761277

12771278
staticintr_attached_pronoun(structSN_env*z) {
12781279
z->ket=z->c;
12791280
if (z->c-1 <=z->lb||z->p[z->c-1] >>5!=3|| !((1634850 >> (z->p[z->c-1]&0x1f))&1))return0;
1280-
if (!(find_among_b(z,a_1,39)))return0;
1281+
if (!find_among_b(z,a_1,39))return0;
12811282
z->bra=z->c;
12821283
{intret=r_R1(z);
12831284
if (ret <=0)returnret;
@@ -1292,7 +1293,7 @@ static int r_standard_suffix(struct SN_env * z) {
12921293
intamong_var;
12931294
z->ket=z->c;
12941295
among_var=find_among_b(z,a_2,200);
1295-
if (!(among_var))return0;
1296+
if (!among_var)return0;
12961297
z->bra=z->c;
12971298
switch (among_var) {
12981299
case1:
@@ -1343,7 +1344,7 @@ static int r_verb_suffix(struct SN_env * z) {
13431344
intamong_var;
13441345
z->ket=z->c;
13451346
among_var=find_among_b(z,a_3,283);
1346-
if (!(among_var))return0;
1347+
if (!among_var)return0;
13471348
z->bra=z->c;
13481349
switch (among_var) {
13491350
case1:
@@ -1370,7 +1371,7 @@ static int r_residual_suffix(struct SN_env * z) {
13701371
intamong_var;
13711372
z->ket=z->c;
13721373
among_var=find_among_b(z,a_4,22);
1373-
if (!(among_var))return0;
1374+
if (!among_var)return0;
13741375
z->bra=z->c;
13751376
switch (among_var) {
13761377
case1:
@@ -1394,7 +1395,7 @@ static int r_residual_suffix(struct SN_env * z) {
13941395
}
13951396

13961397
externintcatalan_ISO_8859_1_stem(structSN_env*z) {
1397-
1398+
13981399
{intret=r_mark_regions(z);
13991400
if (ret<0)returnret;
14001401
}

‎src/backend/snowball/libstemmer/stem_ISO_8859_1_danish.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,17 @@ z->c = z->c + 3;
141141
z->I[0]=z->c;
142142
z->c=c_test1;
143143
}
144+
144145
if (out_grouping(z,g_v,97,248,1)<0)return0;
145-
{
146+
147+
{
146148
intret=in_grouping(z,g_v,97,248,1);
147149
if (ret<0)return0;
148150
z->c+=ret;
149151
}
150152
z->I[1]=z->c;
151-
152-
if (!(z->I[1]<z->I[0])) gotolab0;
153+
154+
if (z->I[1]>=z->I[0]) gotolab0;
153155
z->I[1]=z->I[0];
154156
lab0:
155157
return1;
@@ -164,7 +166,7 @@ static int r_main_suffix(struct SN_env * z) {
164166
z->ket=z->c;
165167
if (z->c <=z->lb||z->p[z->c-1] >>5!=3|| !((1851440 >> (z->p[z->c-1]&0x1f))&1)) {z->lb=mlimit1;return0; }
166168
among_var=find_among_b(z,a_0,32);
167-
if (!(among_var)) {z->lb=mlimit1;return0; }
169+
if (!among_var) {z->lb=mlimit1;return0; }
168170
z->bra=z->c;
169171
z->lb=mlimit1;
170172
}
@@ -192,7 +194,7 @@ static int r_consonant_pair(struct SN_env * z) {
192194
mlimit2=z->lb;z->lb=z->I[1];
193195
z->ket=z->c;
194196
if (z->c-1 <=z->lb|| (z->p[z->c-1]!=100&&z->p[z->c-1]!=116)) {z->lb=mlimit2;return0; }
195-
if (!(find_among_b(z,a_1,4))) {z->lb=mlimit2;return0; }
197+
if (!find_among_b(z,a_1,4)) {z->lb=mlimit2;return0; }
196198
z->bra=z->c;
197199
z->lb=mlimit2;
198200
}
@@ -227,7 +229,7 @@ static int r_other_suffix(struct SN_env * z) {
227229
z->ket=z->c;
228230
if (z->c-1 <=z->lb||z->p[z->c-1] >>5!=3|| !((1572992 >> (z->p[z->c-1]&0x1f))&1)) {z->lb=mlimit2;return0; }
229231
among_var=find_among_b(z,a_2,5);
230-
if (!(among_var)) {z->lb=mlimit2;return0; }
232+
if (!among_var) {z->lb=mlimit2;return0; }
231233
z->bra=z->c;
232234
z->lb=mlimit2;
233235
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp