|
| 1 | +"""Conll2012 is a reader block for the coreference in CoNLL-2012 format. |
| 2 | +
|
| 3 | +This implementation was tested on the LitBank files only, so far. |
| 4 | +LitBank does not use most of the columns, so the implementation |
| 5 | +should be improved to handle other types of CoNLL-2012 files. |
| 6 | +""" |
| 7 | +importjson |
| 8 | +importlogging |
| 9 | +importre |
| 10 | + |
| 11 | +importudapi.block.read.conllu |
| 12 | +fromudapi.core.rootimportRoot |
| 13 | +fromudapi.core.nodeimportNode |
| 14 | + |
# Matches the "#begin document (<name>)" header comment; group 1 is the text
# between the parentheses, which parse_comment_line() uses as the document name.
# An alternative pattern that additionally captures the "; part <number>" suffix
# is kept here for reference but is not used:
#RE_BEGIN = re.compile(r'^#begin document \(([^)]+)\); part (\d+)')
RE_BEGIN = re.compile(r'^#begin document \(([^)]+)\)')
| 17 | + |
class Conll2012(udapi.block.read.conllu.Conllu):
    """A reader of the Conll2012 files.

    Each input line is split on tabs and its columns are mapped onto node
    attributes according to ``attributes`` (see ``__init__``).  The ``coref``
    column is rewritten into the ``Entity`` attribute stored in ``node.misc``.
    """

    def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs):
        """Create the Conll2012 reader object.

        Args:
        attributes: comma-separated list of column names in the input files
            (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref')
            For ignoring a column, use "_" as its name.
            Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based)
            word-order number/index (usually called ID).
        **kwargs: passed unchanged to the parent reader's constructor.
        """
        super().__init__(**kwargs)
        self.node_attributes = attributes.split(',')
        # Placeholder document name used until a "#begin document" line is seen.
        self._docname = 'd'

    def parse_comment_line(self, line, root):
        """Process one comment line (a line starting with "#").

        "#end document" lines are ignored.  A "#begin document (<name>)" line
        starts a new document: the name is stored on ``root.newdoc`` and
        remembered for later docname-column checks, and a ``$GLOBAL.ENTITY``
        line is appended to ``root.comment``.  Any other comment is reported
        with a warning.

        Args:
            line: the comment line.
            root: the root of the tree currently being read.
        """
        if line.startswith("#end document"):
            return
        match = RE_BEGIN.match(line)
        if match:
            docname = match.group(1)
            root.newdoc = docname
            self._global_entity = 'eid-etype-head-other'
            root.comment += '$GLOBAL.ENTITY\n'
            self._docname = docname
        else:
            logging.warning("Unexpected comment line: %s", line)

    def _convert_coref(self, value):
        """Convert a CoNLL-2012 coref column value into an ``Entity`` string.

        The value is a "|"-separated list of fragments such as "(12", "12)" or
        "(12)".  Each entity number N becomes the id "<docname>_eN", where
        <docname> is the current document name with "-" removed (presumably
        because "-" separates the fields inside ``Entity`` -- TODO confirm).
        Opening brackets get the suffix "--1"; closing brackets carry only
        the id.

        Args:
            value: raw content of the coref column.

        Returns:
            The ``Entity`` string, or "" when the column holds no mention
            boundary ("", "_" or the CoNLL-2012 placeholder "-").
        """
        if not value or value in ('_', '-'):
            return ''
        escaped_docname = self._docname.replace("-", "")
        closing = []
        opening = []
        for entity in value.split("|"):
            opens = entity.startswith("(")
            closes = entity.endswith(")")
            if not (opens or closes):
                # A fragment without any parenthesis cannot be expressed as a
                # mention boundary; emitting it would corrupt the Entity value.
                logging.warning("Ignoring coref fragment without parentheses: %r", entity)
                continue
            entity_num = entity.replace("(", "").replace(")", "")
            eid = f"{escaped_docname}_e{entity_num}"
            if opens and closes:
                opening.append(f"({eid}--1)")
            elif opens:
                opening.append(f"({eid}--1")
            else:
                closing.append(f"{eid})")
        # To avoid parentheses clashes, put the entities with ")" first.
        # Closing fragments are emitted in reverse input order, openings in
        # input order.
        return ''.join([*reversed(closing), *opening])

    def parse_node_line(self, line, root, nodes):
        """Create a node from one tab-separated token line.

        Args:
            line: the token line.
            root: the root under which the new node is created.
            nodes: list of nodes read so far; the new node is appended to it.

        Raises:
            RuntimeError: if the number of columns differs from the number of
                configured attributes and ``self.strict`` is set.
            ValueError: if the "ord" column does not contain an integer.
        """
        fields = line.split('\t')
        if len(fields) != len(self.node_attributes):
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Pad missing columns with "_"; surplus columns are ignored below.
            fields.extend(['_'] * (len(self.node_attributes) - len(fields)))

        # This implementation is slower than in read.Conllu,
        # but it allows for arbitrary columns
        node = root.create_child()
        for attribute_name, value in zip(self.node_attributes, fields):
            if attribute_name == 'docname':
                if value != self._docname:
                    logging.warning("Document name mismatch %s != %s", value, self._docname)

            # convert the zero-based index to one-based
            elif attribute_name == 'ord':
                try:
                    node.ord = int(value) + 1
                except ValueError as err:
                    raise ValueError(
                        f"Invalid 0-based word index {value!r} in {line!r}") from err

            elif attribute_name == 'coref':
                entity = self._convert_coref(value)
                if entity:
                    node.misc['Entity'] = entity

            # "form" is always stored (even if it is "_");
            # other attributes only when named and non-empty.
            elif attribute_name == 'form' or (attribute_name != '_' and value != '_'):
                setattr(node, attribute_name, value)
        nodes.append(node)

    def read_tree_from_lines(self, lines):
        """Build one tree from the given lines.

        Empty lines are skipped, lines starting with "#" are handled by
        ``parse_comment_line`` and all other lines by ``parse_node_line``.

        Args:
            lines: the lines belonging to one tree.

        Returns:
            The root of the new tree, or None if no node was read.
        """
        root = Root()
        nodes = [root]
        for line in lines:
            if line == '':
                continue
            if line.startswith('#'):
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        return root