Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitc26b25e

Browse files
committed
Add an Ssurgeon operation which adds an (English only) lemma to text
1 parentd302c63 commitc26b25e

File tree

3 files changed

+130
-0
lines changed

3 files changed

+130
-0
lines changed
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
packageedu.stanford.nlp.semgraph.semgrex.ssurgeon;
2+
3+
importjava.util.Objects;
4+
5+
importedu.stanford.nlp.international.Language;
6+
importedu.stanford.nlp.ling.CoreAnnotations;
7+
importedu.stanford.nlp.ling.IndexedWord;
8+
importedu.stanford.nlp.process.Morphology;
9+
importedu.stanford.nlp.semgraph.SemanticGraph;
10+
importedu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
11+
12+
/**
13+
* Add the output of the English lemmatizer to the word in question.
14+
* Currently this only supports English! You can add a known lemma
15+
* for a different language by using EditNode and setting the lemma
16+
* attribute
17+
*
18+
* @author John Bauer
19+
*/
20+
publicclassLemmatizeextendsSsurgeonEdit {
21+
publicstaticfinalStringLABEL ="lemmatize";
22+
23+
finalStringnodeName;
24+
finalMorphologymorphology;
25+
finalLanguagelanguage;
26+
27+
publicLemmatize(StringnodeName,Languagelanguage) {
28+
if (nodeName ==null) {
29+
thrownewSsurgeonParseException("Cannot make a Lemmatize with no nodeName");
30+
}
31+
this.nodeName =nodeName;
32+
33+
if (language ==Language.UniversalEnglish ||language ==Language.English) {
34+
this.language =Language.English;
35+
}elseif (language ==Language.Unknown) {
36+
// log something here?
37+
this.language =Language.English;
38+
}else {
39+
thrownewSsurgeonParseException("Lemmatizing " +language +" is not supported");
40+
}
41+
42+
this.morphology =newMorphology();
43+
}
44+
45+
@Override
46+
publicStringtoEditString() {
47+
StringBuilderbuf =newStringBuilder();
48+
buf.append(LABEL);buf.append("\t");
49+
buf.append(Ssurgeon.NODENAME_ARG);buf.append(" ");
50+
buf.append(nodeName);
51+
returnbuf.toString();
52+
}
53+
54+
publicbooleanevaluate(SemanticGraphsg,SemgrexMatchersm) {
55+
IndexedWordword =sm.getNode(nodeName);
56+
if (word ==null)
57+
returnfalse;
58+
59+
StringoldLemma =word.lemma();
60+
morphology.stem(word.backingLabel(),CoreAnnotations.LemmaAnnotation.class);
61+
StringnewLemma =word.lemma();
62+
booleanchanged = !Objects.equals(oldLemma,newLemma);
63+
returnchanged;
64+
}
65+
}

‎src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java‎

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
* <li> {@code reattachNamedEdge -edge edgename -gov gov -dep dep}
8484
* <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
8585
* <li> {@code editNode -node node ...attributes...}
86+
* <li> {@code lemmatize -node node}
8687
* <li> {@code combineMWT -node node -word word}
8788
* <li> {@code setRoots n1 (n2 n3 ...)}
8889
* <li> {@code mergeNodes n1 n2}
@@ -137,6 +138,10 @@
137138
* needs the ability to add or remove features without resetting the entire features map,
138139
* please file an issue on github.
139140
*</p><p>
141+
* {@code lemmatize} will put a lemma on a word.
142+
* {@code -node} is the node to edit.
143+
* This only works on English text; for other languages, set the lemma attribute directly with {@code editNode}.
144+
*</p><p>
140145
* {@code combineMWT} will add MWT attributes to a sequence of two or more words.
141146
* {@code -node} (repeated) is the nodes to edit.
142147
* {@code -word} is the optional text to use for the new MWT. If not set, the words will be concatenated.
@@ -566,6 +571,11 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
566571
thrownewSsurgeonParseException("Cannot make an EditNode out of " +argsBox.nodes.size() +" nodes. Please use exactly one -node");
567572
}
568573
returnnewEditNode(argsBox.nodes.get(0),argsBox.annotations,argsBox.updateMorphoFeatures);
574+
}elseif (command.equalsIgnoreCase(Lemmatize.LABEL)) {
575+
if (argsBox.nodes.size() !=1) {
576+
thrownewSsurgeonParseException("Cannot make a Lemmatize out of " +argsBox.nodes.size() +" nodes. Please use exactly one -node");
577+
}
578+
returnnewLemmatize(argsBox.nodes.get(0),language);
569579
}elseif (command.equalsIgnoreCase(MergeNodes.LABEL)) {
570580
if (argsBox.nodes.size() <2) {
571581
thrownewSsurgeonParseException("Cannot make a MergeNodes out of fewer than 2 nodes (got " +argsBox.nodes.size() +")");

‎test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java‎

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1362,6 +1362,61 @@ public void checkAnnotationConversionErrors() {
13621362

13631363

13641364
/**
1365+
* Check that the edit which puts a lemma on a node redoes the lemma on the nodes it targets
1366+
*/
1367+
@Test
1368+
publicvoidreadXMLLemmatize() {
1369+
Ssurgeoninst =Ssurgeon.inst();
1370+
1371+
// use "dep" as the dependency so as to be language-agnostic in this test
1372+
Stringlemma =String.join(newline,
1373+
"<ssurgeon-pattern-list>",
1374+
" <ssurgeon-pattern>",
1375+
" <uid>38</uid>",
1376+
" <notes>Edit a node</notes>",
1377+
" <semgrex>" +XMLUtils.escapeXML("!{lemma:/.+/}=nolemma") +"</semgrex>",
1378+
" <edit-list>lemmatize -node nolemma</edit-list>",
1379+
" </ssurgeon-pattern>",
1380+
"</ssurgeon-pattern-list>");
1381+
List<SsurgeonPattern>patterns =inst.readFromString(lemma);
1382+
assertEquals(patterns.size(),1);
1383+
SsurgeonPatternlemmatizeSsurgeon =patterns.get(0);
1384+
1385+
SemanticGraphsg =SemanticGraph.valueOf("[has/VBZ-2 nsubj> Jennifer/NNP-1 obj> [antennae/NNS-4 dep> green/JJ-3]]");
1386+
for (IndexedWordword :sg.vertexSet()) {
1387+
assertNull(word.lemma());
1388+
}
1389+
SemanticGraphnewSG =lemmatizeSsurgeon.iterate(sg).first;
1390+
String[]expectedLemmas = {"Jennifer","have","green","antenna"};
1391+
for (IndexedWordword :newSG.vertexSet()) {
1392+
assertEquals(expectedLemmas[word.index() -1],word.lemma());
1393+
}
1394+
1395+
// this version would bomb if lemmatize were not bomb-proof
1396+
lemma =String.join(newline,
1397+
"<ssurgeon-pattern-list>",
1398+
" <ssurgeon-pattern>",
1399+
" <uid>38</uid>",
1400+
" <notes>Edit a node</notes>",
1401+
" <semgrex>" +XMLUtils.escapeXML("{}=nolemma") +"</semgrex>",
1402+
" <edit-list>lemmatize -node nolemma</edit-list>",
1403+
" </ssurgeon-pattern>",
1404+
"</ssurgeon-pattern-list>");
1405+
patterns =inst.readFromString(lemma);
1406+
assertEquals(patterns.size(),1);
1407+
lemmatizeSsurgeon =patterns.get(0);
1408+
1409+
sg =SemanticGraph.valueOf("[has/VBZ-2 nsubj> Jennifer/NNP-1 obj> [antennae/NNS-4 dep> green/JJ-3]]");
1410+
for (IndexedWordword :sg.vertexSet()) {
1411+
assertNull(word.lemma());
1412+
}
1413+
newSG =lemmatizeSsurgeon.iterate(sg).first;
1414+
for (IndexedWordword :newSG.vertexSet()) {
1415+
assertEquals(expectedLemmas[word.index() -1],word.lemma());
1416+
}
1417+
}
1418+
1419+
/**
13651420
* Check that a basic edit script works as expected
13661421
*/
13671422
@Test

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp