Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c03419c

Browse files
committed
Blocks to work with PARSEME MWE corpora.
1 parent 19fe229, commit c03419c

File tree

3 files changed

+204
-0
lines changed

3 files changed

+204
-0
lines changed

‎udapi/block/mwe/normalize.py

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,68 @@
1+
"""Block that takes PARSEME-like annotation of multiword expressions from MISC
2+
and normalizes it so that the type is always annotated at the first word of
3+
the expression."""
4+
fromudapi.core.blockimportBlock
5+
importlogging
6+
importre
7+
8+
class Normalize(Block):
    """Rewrite the Mwe attribute in MISC so the MWE type sits on the first word.

    Every member of a multiword expression keeps the expression's number;
    only the first collected member additionally carries the type
    (e.g. ``Mwe=1:LVC.cause`` on the first word and ``Mwe=1`` on the others).
    """

    def collect_mwes(self, root):
        """
        Collects annotations of multiword expressions from MISC of the nodes.
        The expected annotation is in the style of Parseme (see
        https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
        the data from http://hdl.handle.net/11372/LRT-5124), except that there
        are only ten columns and the annotation from the eleventh column is
        copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

        Returns a tuple ``(mwes, mwes_by_nodes)``:
        ``mwes`` maps each MWE number (kept as a string) to a dict with the
        keys ``'nodes'`` (list of member ords, in traversal order) and
        ``'type'`` (the type string, or ``''`` if no member carried one);
        ``mwes_by_nodes`` maps every node ord to the list of MWE numbers the
        node belongs to (empty list for nodes outside any MWE).
        Items that cannot be parsed are reported with a warning and skipped.
        """
        mwes = {}  # for each mwe id, its type and list of node ids
        mwes_by_nodes = {}  # for each node id, a list of mwe ids
        for n in root.descendants:
            mwes_by_nodes[n.ord] = []
            # presumably falsy when the node has no Mwe attribute -- verify
            # against the MISC container used by the framework
            miscmwe = n.misc['Mwe']
            if not miscmwe:
                continue
            # A node may belong to multiple multiword expressions.
            for m in miscmwe.split(';'):
                # Either it is NUMBER:TYPE, or just NUMBER.
                # Number identifies this MWE among all MWEs in the sentence.
                # Type is a main uppercase string (VID, LVC etc.), optionally
                # followed by a subtype ('LVC.cause').
                # See https://gitlab.com/parseme/corpora/-/wikis/home
                match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m)
                if not match:
                    logging.warning("Cannot parse Mwe=%s", m)
                    continue
                number, mwe_type = match.groups()
                entry = mwes.setdefault(number, {'nodes': [], 'type': ''})
                if mwe_type:
                    entry['type'] = mwe_type
                entry['nodes'].append(n.ord)
                mwes_by_nodes[n.ord].append(number)
        return (mwes, mwes_by_nodes)

    def process_tree(self, root):
        """
        Collects annotations of multiword expressions from MISC of the nodes.
        Then saves them back but makes sure that the type is annotated at the
        first word of the expression (as opposed to the syntactic head or to
        any other word).

        If no member of an expression carried a type, only the bare number is
        written everywhere, so that the output stays parseable by
        collect_mwes() (a dangling ``NUMBER:`` would be rejected by it).
        """
        (mwes, mwes_by_nodes) = self.collect_mwes(root)
        for n in root.descendants:
            # Erase the previous MWE annotations so we can start from scratch.
            n.misc['Mwe'] = ''
            # There may be multiple MWEs this node is member of.
            annotations = []
            for m in mwes_by_nodes[n.ord]:
                mwe_type = mwes[m]['type']
                # The first collected member counts as the first word; this
                # assumes root.descendants is in word order -- TODO confirm.
                if mwe_type and n.ord == mwes[m]['nodes'][0]:
                    annotations.append("%s:%s" % (m, mwe_type))
                else:
                    annotations.append(m)
            if annotations:
                n.misc['Mwe'] = ';'.join(annotations)

‎udapi/block/mwe/possessives.py

Lines changed: 74 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,74 @@
1+
"""Block that takes PARSEME-like annotation of multiword expressions from MISC,
2+
looks for dependent possessive pronouns and reports how they are treated."""
3+
fromudapi.core.blockimportBlock
4+
importlogging
5+
importre
6+
7+
class Possessives(Block):
    """Report possessive pronouns found below the heads of PARSEME-style MWEs.

    For every multiword expression, one line is printed that says whether the
    possessive pronouns found are members of the expression ('in'), outside
    of it ('out'), or both, followed by the surveyed words.
    """

    def collect_mwes(self, root):
        """
        Collects annotations of multiword expressions from MISC of the nodes.
        The expected annotation is in the style of Parseme (see
        https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
        the data from http://hdl.handle.net/11372/LRT-5124), except that there
        are only ten columns and the annotation from the eleventh column is
        copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

        Returns a tuple ``(mwes, mwes_by_nodes)``:
        ``mwes`` maps each MWE number (kept as a string) to a dict with the
        keys ``'nodes'`` (list of member ords, in traversal order) and
        ``'type'`` (the type string, or ``''`` if no member carried one);
        ``mwes_by_nodes`` maps every node ord to the list of MWE numbers the
        node belongs to (empty list for nodes outside any MWE).
        Items that cannot be parsed are reported with a warning and skipped.
        """
        mwes = {}  # for each mwe id, its type and list of node ids
        mwes_by_nodes = {}  # for each node id, a list of mwe ids
        for n in root.descendants:
            mwes_by_nodes[n.ord] = []
            # presumably falsy when the node has no Mwe attribute -- verify
            # against the MISC container used by the framework
            miscmwe = n.misc['Mwe']
            if not miscmwe:
                continue
            # A node may belong to multiple multiword expressions.
            for m in miscmwe.split(';'):
                # Either it is NUMBER:TYPE, or just NUMBER.
                # Number identifies this MWE among all MWEs in the sentence.
                # Type is a main uppercase string (VID, LVC etc.), optionally
                # followed by a subtype ('LVC.cause').
                # See https://gitlab.com/parseme/corpora/-/wikis/home
                match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m)
                if not match:
                    logging.warning("Cannot parse Mwe=%s", m)
                    continue
                number, mwe_type = match.groups()
                entry = mwes.setdefault(number, {'nodes': [], 'type': ''})
                if mwe_type:
                    entry['type'] = mwe_type
                entry['nodes'].append(n.ord)
                mwes_by_nodes[n.ord].append(number)
        return (mwes, mwes_by_nodes)

    def process_tree(self, root):
        """
        Collects annotations of multiword expressions from MISC of the nodes.
        Then surveys the possessive pronouns.

        For each expression, the nodes below its topmost members are searched
        for words with UPOS 'PRON' and Poss=Yes. If any are found, a line
        ``<observation>: <words>`` is printed to standard output, where words
        that are not members of the expression are wrapped in parentheses.
        """
        (mwes, mwes_by_nodes) = self.collect_mwes(root)
        nodes = root.descendants
        for m in mwes:
            mwenodes = [x for x in nodes if m in mwes_by_nodes[x.ord]]
            # Heads are the members whose parent is not a member itself.
            mweheads = [x for x in mwenodes if x.parent not in mwenodes]
            mwedescendantset = set()
            for x in mweheads:
                # NOTE(review): if `descendants` does not include the node
                # itself, the heads are missing from the printed expression
                # -- confirm that this is intended.
                mwedescendantset.update(x.descendants)
            # Sort explicitly by word order rather than relying on the node
            # objects being comparable with each other.
            mwedescendants = sorted(mwedescendantset, key=lambda x: x.ord)
            # Is there a possessive pronoun?
            possprons = [x for x in mwedescendants
                         if x.upos == 'PRON' and x.feats['Poss'] == 'Yes']
            inpp = [x for x in possprons if m in mwes_by_nodes[x.ord]]
            outpp = [x for x in possprons if m not in mwes_by_nodes[x.ord]]
            observation = ''
            if inpp and outpp:
                observation = 'both'
            elif inpp:
                observation = 'in'
            elif outpp:
                observation = 'out'
            if observation:
                expression = ' '.join(
                    x.form if m in mwes_by_nodes[x.ord] else '(' + x.form + ')'
                    for x in mwedescendants)
                print(observation + ': ' + expression)

‎udapi/block/mwe/tosubdeprels.py

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,62 @@
1+
"""Block that takes PARSEME-like annotation of multiword expressions from MISC
2+
and projects it to subtypes of dependency relation labels. The motivation is
3+
that a parser could learn to predict the multiword expressions."""
4+
fromudapi.core.blockimportBlock
5+
importlogging
6+
importre
7+
8+
class ToSubDeprels(Block):
    """Project PARSEME-style MWE types onto dependency relation subtypes.

    A node whose parent belongs to the same multiword expression gets the
    lowercased MWE type (with dots removed) appended to its deprel after a
    colon.
    """

    def collect_mwes(self, root):
        """
        Collects annotations of multiword expressions from MISC of the nodes.
        The expected annotation is in the style of Parseme (see
        https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
        the data from http://hdl.handle.net/11372/LRT-5124), except that there
        are only ten columns and the annotation from the eleventh column is
        copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

        Returns a tuple ``(mwes, mwes_by_nodes)``:
        ``mwes`` maps each MWE number (kept as a string) to a dict with the
        keys ``'nodes'`` (list of member ords, in traversal order) and
        ``'type'`` (the type string, or ``''`` if no member carried one);
        ``mwes_by_nodes`` maps every node ord to the list of MWE numbers the
        node belongs to (empty list for nodes outside any MWE).
        Items that cannot be parsed are reported with a warning and skipped.
        """
        mwes = {}  # for each mwe id, its type and list of node ids
        mwes_by_nodes = {}  # for each node id, a list of mwe ids
        for n in root.descendants:
            mwes_by_nodes[n.ord] = []
            # presumably falsy when the node has no Mwe attribute -- verify
            # against the MISC container used by the framework
            miscmwe = n.misc['Mwe']
            if not miscmwe:
                continue
            # A node may belong to multiple multiword expressions.
            for m in miscmwe.split(';'):
                # Either it is NUMBER:TYPE, or just NUMBER.
                # Number identifies this MWE among all MWEs in the sentence.
                # Type is a main uppercase string (VID, LVC etc.), optionally
                # followed by a subtype ('LVC.cause').
                # See https://gitlab.com/parseme/corpora/-/wikis/home
                match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m)
                if not match:
                    logging.warning("Cannot parse Mwe=%s", m)
                    continue
                number, mwe_type = match.groups()
                entry = mwes.setdefault(number, {'nodes': [], 'type': ''})
                if mwe_type:
                    entry['type'] = mwe_type
                entry['nodes'].append(n.ord)
                mwes_by_nodes[n.ord].append(number)
        return (mwes, mwes_by_nodes)

    def process_tree(self, root):
        """
        Collects annotations of multiword expressions from MISC of the nodes.
        Then saves the type of the MWE as a subtype of the deprels inside.

        Expressions whose type is unknown (no member carried one) are skipped,
        so that no deprel ends up with a dangling colon. A node that shares
        several expressions with its parent receives one suffix per
        expression.
        """
        nodes = root.descendants
        (mwes, mwes_by_nodes) = self.collect_mwes(root)
        # Now we hopefully know the type of every multiword expression in the sentence.
        for n in nodes:
            parent_ord = n.parent.ord
            for m in mwes_by_nodes[n.ord]:
                # 'LVC.cause' becomes 'lvccause'.
                subtype = mwes[m]['type'].lower().replace('.', '')
                if not subtype:
                    continue
                # Add the MWE type to the DEPREL if the parent is also in the same MWE.
                # Ord 0 has no entry in mwes_by_nodes (presumably the
                # artificial root), hence the guard before the lookup.
                if parent_ord > 0 and m in mwes_by_nodes[parent_ord]:
                    n.deprel += ':' + subtype

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp