Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commite826ac0

Browse files
author
Nick Galbreath
committed
sql cleanups
1 parentbbadd12 commite826ac0

File tree

4 files changed

+155
-20
lines changed

4 files changed

+155
-20
lines changed
Lines changed: 120 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,124 @@
11

2+
-- delete empty and small words
3+
DELETEFROM enwordswhere LENGTH(word)<4;
4+
25
-- delete this tag. If word has other tags, it survives.
3-
DELETEFROMcategorylinksWHEREcl_to="English_terms_with_obsolete_senses";
6+
--DELETE FROMenwords WHEREtag = "English_terms_with_obsolete_senses";
47

58
-- delete all words completely if tag matches
6-
DELETE tmp.*
7-
FROM categorylinks tmp
8-
WHERE cl_fromIN (
9-
SELECT cl_fromFROM (
10-
SELECT DISTINCT(cl_from)from categorylinks
11-
WHERE (cl_to="English_archaic_forms")
12-
OR (cl_to="English_abbreviations")
13-
OR (cl_to="English_misspellings")
14-
OR (cl_toLIKE"English_obsolete%")
15-
) x );
9+
CREATE TEMPORARY TABLE IF NOT EXISTS junk1AS (
10+
SELECT DISTINCT(id)from enwords
11+
WHERE tagIN (
12+
"English_archaic_forms",
13+
"English_abbreviations",
14+
"English_misspellings",
15+
"English_dated_forms",
16+
"English_obsolete_terms",
17+
"English_obsolete_forms",
18+
"English_initialisms",
19+
"English_terms_with_obsolete_senses"
20+
)
21+
);
22+
DELETEFROM enwords
23+
USING enwords, junk1
24+
WHEREenwords.id=junk1.id;
25+
26+
-- delete words that are errors in wikitionary, or
27+
-- likely to be a misspelling anyways
28+
CREATE TEMPORARY TABLE IF NOT EXISTS junk2AS (
29+
SELECT DISTINCT(id)from enwords
30+
WHERE wordIN (
31+
"TATH",
32+
"AGIN",
33+
"ACCIDENTLY",
34+
"ABUTTS",
35+
"ADN",
36+
"ALOT",
37+
"ANNOINT",
38+
"ANNOINTED",
39+
"ANNOINTING",
40+
"ANNOINTS",
41+
"ANSWERES",
42+
"BELIEFES",
43+
"BELIVE",
44+
"BELIVES",
45+
"CANCELLS",
46+
"CHINEES",
47+
"CANNISTERS",
48+
"CRACKES",
49+
"NINTY",
50+
"DEVASTED",
51+
"DISPENCED",
52+
"DISPENCING",
53+
"DRINKES",
54+
"CIGARETS",
55+
"OFFRED",
56+
"EUROPIAN",
57+
"PROOVED",
58+
"INSTRUCTER",
59+
"VESSELLS",
60+
"INSTRUCTERS",
61+
"INSTALS",
62+
"WICH",
63+
"EXPENCES",
64+
"EVERYTING",
65+
"EXPELLS",
66+
"VITAMINES",
67+
"STRAT",
68+
"STONG",
69+
"HUNDERD",
70+
"ECT",
71+
"TEH",
72+
"DOUB",
73+
"CHAMBRE",
74+
"CHAMBRES",
75+
"CLAS",
76+
"HALP",
77+
"CONTINUOS",
78+
"FACIST",
79+
"THRID",
80+
"TALLENTS",
81+
"TALLETS",
82+
"TEACHED",
83+
"SURVIVERS",
84+
"SURVIVOURS",
85+
"SURPRIZE",
86+
"SURPRIZED",
87+
"STREAMES",
88+
"STUPIDY",
89+
"STREAMES",
90+
"STILUS",
91+
"SUCCEDED",
92+
"SUCCEDES",
93+
"RISED",
94+
"SAUGHT",
95+
"QUITTED",
96+
"RAELISM",
97+
"PYRAMIDES",
98+
"PROPPER",
99+
"PERSUED",
100+
"PERSUING",
101+
"PAYED",
102+
"PATTENED",
103+
"NUCULAR",
104+
"NESTIN",
105+
"MEDICINS",
106+
"MAKED",
107+
"MACHINS",
108+
"LENGTHLY",
109+
"FLACONS",
110+
"FOUTH",
111+
"FROME",
112+
"GALATIC",
113+
"HEARED",
114+
"HARRASES",
115+
"HARRASSED",
116+
"INMIGRANT",
117+
"INMIGRANTS",
118+
"RABBITTS"
119+
)
120+
);
121+
122+
DELETEFROM enwords
123+
USING enwords, junk2
124+
WHEREenwords.id=junk2.id;
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- drops the crazy indexes in wikitionary
2+
dropindex cl_sortkeyon categorylinks;
3+
dropindex cl_fromon categorylinks;
4+
dropindex cl_sortkeyon categorylinks;
5+
dropindex cl_timestampon categorylinks;
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
2+
-- simplified schema
3+
DROPTABLE IF EXISTS enwords;
4+
CREATETABLE `enwords` (
5+
`id`int(8) unsignedNOT NULL DEFAULT'0',
6+
`tag` varbinary(255)NOT NULL DEFAULT'',
7+
`word` varbinary(230)NOT NULL DEFAULT'',
8+
UNIQUE KEY`cl_from` (`id`,`tag`),
9+
KEY`tag` (`tag`),
10+
KEY`word` (`word`)
11+
) ENGINE=InnoDB DEFAULT CHARSET=binary ROW_FORMAT=DYNAMIC;
12+
13+
-- sometimes the English word is in cl_sortkey and other times it's in cl_sortkey_prefix
14+
-- this tries to pick the right one
15+
-- Eliminate all hyphanted words and words with apostrophe.
16+
-- misspell doesn't work with those.
17+
-- Eliminate all non-ASCII words --
18+
-- * all sorts of emojis and symbols
19+
-- * braille forms
20+
-- * don't care about accented words
21+
-- NOTE: sortkey regexp includes a newline. For unknown reasons
22+
-- sortkey is often in the form of "FOO\nFOO"
23+
--
24+
INSERT INTO enwords (id, tag, word)
25+
SELECT cl_fromAS id, cl_toas tag, IF(cl_sortkey_prefix="", cl_sortkey, cl_sortkey_prefix)AS word
26+
FROM categorylinks
27+
WHERE (cl_toLIKE"English_%"OR cl_toLIKE"en:%")
28+
AND cl_sortkey REGEXP"^[A-Z\n]*$"
29+
AND cl_sortkey_prefix REGEXP"^[A-Z]*$";
Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1 @@
1-
SELECT DISTINCT(word)FROM
2-
(
3-
SELECT IF(cl_sortkey_prefix="", cl_sortkey, cl_sortkey_prefix)AS word
4-
FROM categorylinks
5-
WHERE (cl_sortkey notlike"%'%")
6-
AND (cl_tolike"English_%"OR cl_tolike"en:%")
7-
AND (cl_to<>"English_misspellings")
8-
)AS TMP
9-
WHERE word REGEXP"^[A-Z ]+$";
1+
SELECT DISTINCT(word)FROM enwords;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp