Commit c03419c

committed

Blocks to work with PARSEME MWE corpora.

1 parent 19fe229, commit c03419c (copy full SHA for c03419c)

File tree

3 files changed

+204

-0

lines changed

udapi/block/mwe

3 files changed

+204

-0

lines changed

`‎udapi/block/mwe/normalize.py`

Lines changed: 68 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,68 @@`
	`1`	`+"""Block that takes PARSEME-like annotation of multiword expressions from MISC`
	`2`	`+ and normalizes it so that the type is always annotated at the first word of`
	`3`	`+ the expression."""`
	`4`	`+fromudapi.core.blockimportBlock`
	`5`	`+importlogging`
	`6`	`+importre`
	`7`	`+`
	`8`	`+classNormalize(Block):`
	`9`	`+`
	`10`	`+defcollect_mwes(self,root):`
	`11`	`+"""`
	`12`	`+ Collects annotations of multiword expressions from MISC of the nodes.`
	`13`	`+ The expected annotation is in the style of Parseme (see`
	`14`	`+ https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download`
	`15`	`+ the data from http://hdl.handle.net/11372/LRT-5124), except that there`
	`16`	`+ are only ten columns and the annotation from the eleventh column is`
	`17`	`+ copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).`
	`18`	`+ """`
	`19`	`+nodes=root.descendants`
	`20`	`+mwes= {}# for each mwe id, its type and list of node ids`
	`21`	`+mwes_by_nodes= {}# for each node id, a list of mwe ids`
	`22`	`+forninnodes:`
	`23`	`+mwes_by_nodes[n.ord]= []`
	`24`	`+miscmwe=n.misc['Mwe']`
	`25`	`+ifmiscmwe:`
	`26`	`+# A node may belong to multiple multiword expressions.`
	`27`	`+miscmwes=miscmwe.split(';')`
	`28`	`+forminmiscmwes:`
	`29`	`+# Either it is NUMBER:TYPE, or just NUMBER.`
	`30`	`+# Number identifies this MWE among all MWEs in the sentence.`
	`31`	`+# Type is a main uppercase string (VID, LVC etc.), optionally`
	`32`	`+# followed by a subtype ('LVC.cause').`
	`33`	`+# See https://gitlab.com/parseme/corpora/-/wikis/home`
	`34`	`+match=re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$",m)`
	`35`	`+ifmatch:`
	`36`	`+number=match.group(1)`
	`37`	`+type=match.group(2)`
	`38`	`+ifnotnumberinmwes:`
	`39`	`+mwes[number]= {'nodes': [],'type':''}`
	`40`	`+iftype:`
	`41`	`+mwes[number]['type']=type`
	`42`	`+mwes[number]['nodes'].append(n.ord)`
	`43`	`+mwes_by_nodes[n.ord].append(number)`
	`44`	`+else:`
	`45`	`+logging.warning("Cannot parse Mwe=%s"%m)`
	`46`	`+return (mwes,mwes_by_nodes)`
	`47`	`+`
	`48`	`+defprocess_tree(self,root):`
	`49`	`+"""`
	`50`	`+ Collects annotations of multiword expressions from MISC of the nodes.`
	`51`	`+ Then saves them back but makes sure that the type is annotated at the`
	`52`	`+ first word of the expression (as opposed to the syntactic head or to`
	`53`	`+ any other word).`
	`54`	`+ """`
	`55`	`+ (mwes,mwes_by_nodes)=self.collect_mwes(root)`
	`56`	`+nodes=root.descendants`
	`57`	`+forninnodes:`
	`58`	`+# Erase the previous MWE annotations so we can start from scratch.`
	`59`	`+n.misc['Mwe']=''`
	`60`	`+# There may be multiple MWEs this node is member of.`
	`61`	`+annotations= []`
	`62`	`+forminmwes_by_nodes[n.ord]:`
	`63`	`+ifn.ord==mwes[m]['nodes'][0]:`
	`64`	`+annotations.append("%s:%s"% (m,mwes[m]['type']))`
	`65`	`+else:`
	`66`	`+annotations.append(m)`
	`67`	`+ifannotations:`
	`68`	`+n.misc['Mwe']=';'.join(annotations)`

`‎udapi/block/mwe/possessives.py`

Lines changed: 74 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,74 @@`
	`1`	`+"""Block that takes PARSEME-like annotation of multiword expressions from MISC,`
	`2`	`+ looks for dependent possessive pronouns and reports how they are treated."""`
	`3`	`+fromudapi.core.blockimportBlock`
	`4`	`+importlogging`
	`5`	`+importre`
	`6`	`+`
	`7`	`+classPossessives(Block):`
	`8`	`+`
	`9`	`+defcollect_mwes(self,root):`
	`10`	`+"""`
	`11`	`+ Collects annotations of multiword expressions from MISC of the nodes.`
	`12`	`+ The expected annotation is in the style of Parseme (see`
	`13`	`+ https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download`
	`14`	`+ the data from http://hdl.handle.net/11372/LRT-5124), except that there`
	`15`	`+ are only ten columns and the annotation from the eleventh column is`
	`16`	`+ copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).`
	`17`	`+ """`
	`18`	`+nodes=root.descendants`
	`19`	`+mwes= {}# for each mwe id, its type and list of node ids`
	`20`	`+mwes_by_nodes= {}# for each node id, a list of mwe ids`
	`21`	`+forninnodes:`
	`22`	`+mwes_by_nodes[n.ord]= []`
	`23`	`+miscmwe=n.misc['Mwe']`
	`24`	`+ifmiscmwe:`
	`25`	`+# A node may belong to multiple multiword expressions.`
	`26`	`+miscmwes=miscmwe.split(';')`
	`27`	`+forminmiscmwes:`
	`28`	`+# Either it is NUMBER:TYPE, or just NUMBER.`
	`29`	`+# Number identifies this MWE among all MWEs in the sentence.`
	`30`	`+# Type is a main uppercase string (VID, LVC etc.), optionally`
	`31`	`+# followed by a subtype ('LVC.cause').`
	`32`	`+# See https://gitlab.com/parseme/corpora/-/wikis/home`
	`33`	`+match=re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$",m)`
	`34`	`+ifmatch:`
	`35`	`+number=match.group(1)`
	`36`	`+type=match.group(2)`
	`37`	`+ifnotnumberinmwes:`
	`38`	`+mwes[number]= {'nodes': [],'type':''}`
	`39`	`+iftype:`
	`40`	`+mwes[number]['type']=type`
	`41`	`+mwes[number]['nodes'].append(n.ord)`
	`42`	`+mwes_by_nodes[n.ord].append(number)`
	`43`	`+else:`
	`44`	`+logging.warning("Cannot parse Mwe=%s"%m)`
	`45`	`+return (mwes,mwes_by_nodes)`
	`46`	`+`
	`47`	`+defprocess_tree(self,root):`
	`48`	`+"""`
	`49`	`+ Collects annotations of multiword expressions from MISC of the nodes.`
	`50`	`+ Then surveys the possessive pronouns.`
	`51`	`+ """`
	`52`	`+ (mwes,mwes_by_nodes)=self.collect_mwes(root)`
	`53`	`+nodes=root.descendants`
	`54`	`+forminmwes:`
	`55`	`+mwenodes= [xforxinnodesifminmwes_by_nodes[x.ord]]`
	`56`	`+mweheads= [xforxinmwenodesifnotx.parentinmwenodes]`
	`57`	`+mwedescendantset=set()`
	`58`	`+forxinmweheads:`
	`59`	`+mwedescendantset=mwedescendantset.union(set(x.descendants))`
	`60`	`+mwedescendants=list(sorted(mwedescendantset))`
	`61`	`+# Is there a possessive pronoun?`
	`62`	`+possprons= [xforxinmwedescendantsifx.upos=='PRON'andx.feats['Poss']=='Yes']`
	`63`	`+inpp= [xforxinposspronsifminmwes_by_nodes[x.ord]]`
	`64`	`+outpp= [xforxinposspronsifnotminmwes_by_nodes[x.ord]]`
	`65`	`+observation=''`
	`66`	`+ifinppandoutpp:`
	`67`	`+observation='both'`
	`68`	`+elifinpp:`
	`69`	`+observation='in'`
	`70`	`+elifoutpp:`
	`71`	`+observation='out'`
	`72`	`+ifobservation:`
	`73`	`+expression=' '.join([x.formifminmwes_by_nodes[x.ord]else'('+x.form+')'forxinmwedescendants])`
	`74`	`+print(observation+': '+expression)`

`‎udapi/block/mwe/tosubdeprels.py`

Lines changed: 62 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,62 @@`
	`1`	`+"""Block that takes PARSEME-like annotation of multiword expressions from MISC`
	`2`	`+ and projects it to subtypes of dependency relation labels. The motivation is`
	`3`	`+ that a parser could learn to predict the multiword expressions."""`
	`4`	`+fromudapi.core.blockimportBlock`
	`5`	`+importlogging`
	`6`	`+importre`
	`7`	`+`
	`8`	`+classToSubDeprels(Block):`
	`9`	`+`
	`10`	`+defcollect_mwes(self,root):`
	`11`	`+"""`
	`12`	`+ Collects annotations of multiword expressions from MISC of the nodes.`
	`13`	`+ The expected annotation is in the style of Parseme (see`
	`14`	`+ https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download`
	`15`	`+ the data from http://hdl.handle.net/11372/LRT-5124), except that there`
	`16`	`+ are only ten columns and the annotation from the eleventh column is`
	`17`	`+ copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).`
	`18`	`+ """`
	`19`	`+nodes=root.descendants`
	`20`	`+mwes= {}# for each mwe id, its type and list of node ids`
	`21`	`+mwes_by_nodes= {}# for each node id, a list of mwe ids`
	`22`	`+forninnodes:`
	`23`	`+mwes_by_nodes[n.ord]= []`
	`24`	`+miscmwe=n.misc['Mwe']`
	`25`	`+ifmiscmwe:`
	`26`	`+# A node may belong to multiple multiword expressions.`
	`27`	`+miscmwes=miscmwe.split(';')`
	`28`	`+forminmiscmwes:`
	`29`	`+# Either it is NUMBER:TYPE, or just NUMBER.`
	`30`	`+# Number identifies this MWE among all MWEs in the sentence.`
	`31`	`+# Type is a main uppercase string (VID, LVC etc.), optionally`
	`32`	`+# followed by a subtype ('LVC.cause').`
	`33`	`+# See https://gitlab.com/parseme/corpora/-/wikis/home`
	`34`	`+match=re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$",m)`
	`35`	`+ifmatch:`
	`36`	`+number=match.group(1)`
	`37`	`+type=match.group(2)`
	`38`	`+ifnotnumberinmwes:`
	`39`	`+mwes[number]= {'nodes': [],'type':''}`
	`40`	`+iftype:`
	`41`	`+mwes[number]['type']=type`
	`42`	`+mwes[number]['nodes'].append(n.ord)`
	`43`	`+mwes_by_nodes[n.ord].append(number)`
	`44`	`+else:`
	`45`	`+logging.warning("Cannot parse Mwe=%s"%m)`
	`46`	`+return (mwes,mwes_by_nodes)`
	`47`	`+`
	`48`	`+defprocess_tree(self,root):`
	`49`	`+"""`
	`50`	`+ Collects annotations of multiword expressions from MISC of the nodes.`
	`51`	`+ Then saves the type of the MWE as a subtype of the deprels inside.`
	`52`	`+ """`
	`53`	`+nodes=root.descendants`
	`54`	`+ (mwes,mwes_by_nodes)=self.collect_mwes(root)`
	`55`	`+# Now we hopefully know the type of every multiword expression in the sentence.`
	`56`	`+forninnodes:`
	`57`	`+ifmwes_by_nodes[n.ord]:`
	`58`	`+forminmwes_by_nodes[n.ord]:`
	`59`	`+type=re.sub(r"\.",'',mwes[m]['type'].lower())`
	`60`	`+# Add the MWE type to the DEPREL if the parent is also in the same MWE.`
	`61`	`+ifn.parent.ord>0andminmwes_by_nodes[n.parent.ord]:`
	`62`	`+n.deprel+=':'+type`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit c03419c

File tree

3 files changed

3 files changed

`‎udapi/block/mwe/normalize.py`

`‎udapi/block/mwe/possessives.py`

`‎udapi/block/mwe/tosubdeprels.py`

0 commit comments