Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 2bc7cc0

Browse files
committed
read.Conll2012 coreference format reader
1 parent 290edbc, commit 2bc7cc0

File tree

1 file changed

+110
-0
lines changed

1 file changed

+110
-0
lines changed

‎udapi/block/read/conll2012.py

Lines changed: 110 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,110 @@
1+
"""Conll2012 is a reader block for the coreference in CoNLL-2012 format.
2+
3+
This implementation was tested on the LitBank files only, so far.
4+
LitBank does not use most of the columns, so the implementation
5+
should be improved to handle other types of CoNLL-2012 files.
6+
"""
7+
importjson
8+
importlogging
9+
importre
10+
11+
importudapi.block.read.conllu
12+
fromudapi.core.rootimportRoot
13+
fromudapi.core.nodeimportNode
14+
15+
#RE_BEGIN = re.compile(r'^#begin document \(([^)]+)\); part (\d+)')
16+
# Matches the "#begin document (<docname>)" header line; group 1 captures the
# document name found between the parentheses. Anything after the closing
# parenthesis (e.g. a "; part NNN" suffix) is not examined.
RE_BEGIN = re.compile(
    r'^#begin document \(([^)]+)\)'
)
17+
18+
class Conll2012(udapi.block.read.conllu.Conllu):
    """A reader of the Conll2012 files.

    Each token line is split on tabs and mapped to node attributes according
    to the configurable column list; the coreference column is converted to
    an ``Entity`` attribute stored in the node's MISC.
    """

    def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs):
        """Create the Conll2012 reader object.

        Args:
            attributes: comma-separated list of column names in the input files
                (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref')
                For ignoring a column, use "_" as its name.
                Column "ord" marks the column with 0-based (unlike in CoNLL-U,
                which uses 1-based) word-order number/index (usually called ID).
            **kwargs: passed unchanged to the parent reader's constructor.
        """
        super().__init__(**kwargs)
        self.node_attributes = attributes.split(',')
        # Placeholder document name used until a "#begin document" line is seen.
        self._docname = 'd'

    def parse_comment_line(self, line, root):
        """Process a line starting with "#".

        "#end document" lines are ignored. A "#begin document (NAME)" line
        starts a new document: NAME is stored in ``root.newdoc`` and remembered
        for the docname check and entity ids of the following token lines.
        Any other comment line is reported with a warning.
        """
        if line.startswith("#end document"):
            return
        match = RE_BEGIN.match(line)
        if match:
            docname = match.group(1)
            root.newdoc = docname
            self._global_entity = 'eid-etype-head-other'
            # NOTE(review): '$GLOBAL.ENTITY' looks like a placeholder that is
            # expanded elsewhere (by the parent reader or a writer) - confirm.
            root.comment += '$GLOBAL.ENTITY\n'
            self._docname = docname
        else:
            logging.warning(f"Unexpected comment line: {line}")

    def _coref_to_entity(self, value):
        """Convert one coref cell (e.g. "(12|5)") into an Entity string.

        Every "|"-separated item becomes "<docname>_e<number>", keeping the
        opening/closing parenthesis of the original item. Items that open a
        mention additionally get the "--1" suffix, i.e. an empty second and
        "1" as third "-"-separated field (presumably etype and head of the
        'eid-etype-head-other' layout - confirm).
        """
        prefix = self._docname.replace("-", "")
        closers = []
        openers = []
        for entity in value.split("|"):
            entity_num = entity.replace("(", "").replace(")", "")
            mention = f"{prefix}_e{entity_num}--1"
            if entity.startswith("(") and entity.endswith(")"):
                mention = "(" + mention + ")"
            elif entity.startswith("("):
                mention = "(" + mention
            elif entity.endswith(")"):
                mention = f"{prefix}_e{entity_num}" + ")"

            if mention.startswith("("):
                openers.append(mention)
            else:
                closers.append(mention)
        # To avoid parentheses clashes, put the entities with ")" first.
        # They are emitted in reverse input order, the opening ones in
        # input order.
        return ''.join(reversed(closers)) + ''.join(openers)

    def parse_node_line(self, line, root, nodes):
        """Create a node from one tab-separated token line.

        Args:
            line: the token line (without the trailing newline).
            root: the tree root under which the new node is created.
            nodes: list of nodes read so far; the new node is appended to it.

        Raises:
            RuntimeError: in strict mode, if the number of columns differs
                from the number of configured attributes.
        """
        fields = line.split('\t')
        if len(fields) != len(self.node_attributes):
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Best effort: pad missing trailing columns with "_".
            fields.extend(['_'] * (len(self.node_attributes) - len(fields)))

        # This implementation is slower than in read.Conllu,
        # but it allows for arbitrary columns
        node = root.create_child()
        for n_attribute, attribute_name in enumerate(self.node_attributes):
            value = fields[n_attribute]
            if attribute_name == 'docname':
                if value != self._docname:
                    logging.warning(f"Document name mismatch {value} != {self._docname}")

            # convert the zero-based index to one-based
            elif attribute_name == 'ord':
                # A "_" cell (e.g. padded above) carries no index; leave the
                # node's ord untouched instead of crashing in int().
                # NOTE(review): presumably create_child() already assigned
                # a sequential ord - confirm.
                if value != '_':
                    node.ord = int(value) + 1

            elif attribute_name == 'coref':
                # "_" (LitBank) and "-" (standard CoNLL-2012) both mean
                # that no mention starts or ends on this token.
                if value and value not in ('_', '-'):
                    node.misc['Entity'] = self._coref_to_entity(value)

            elif attribute_name == 'form' or (attribute_name != '_' and value != '_'):
                setattr(node, attribute_name, value)
        nodes.append(node)

    def read_tree_from_lines(self, lines):
        """Build one tree from the given lines.

        Empty lines are skipped, lines starting with "#" are handled by
        parse_comment_line and all other lines by parse_node_line.

        Returns:
            The root of the new tree, or None if the lines contained no node.
        """
        root = Root()
        nodes = [root]
        for line in lines:
            if line == '':
                continue
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        return root

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp