66import json
77import pprint
88from collections import OrderedDict
9- from itertools import chain
9+ from itertools import chain ,permutations
10+ from collections import Counter
1011from dnlp .utils .constant import UNK
1112
1213RE_SAPCE = re .compile ('[ ]+' )
1314
15+
1416# print(pprint.pformat([1,[[2]],3,4444444444,77777777777777777777777777],indent=2,width=10))
1517class ProcessEMR (object ):
1618def __init__ (self ,base_folder :str ,dict_path :str = '' ,mode = 'train' ,directed = False ):
@@ -21,12 +23,14 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
2123self .directed = directed
2224with open (self .relation_name_file ,'rb' )as f :
2325self .category_name = pickle .load (f )
26+ with open (self .relation_pair_file ,'rb' )as f :
27+ self .relation_pair_names = pickle .load (f )
2428# self.reversed_category_name = dict(zip(self.category_name.values(),self.category_name.keys()))
2529self .mode = mode
2630if self .mode == 'train' :
27- self .window = 5
31+ self .window = 200
2832else :
29- self .window = 100
33+ self .window = 200
3034self .dict_path = dict_path
3135self .files = self .get_files ()
3236self .annotations = self .read_annotations (directed )
@@ -51,17 +55,53 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
5155for relation_category in self .relation_categories :
5256self .relation_category_labels [relation_category ]= relation_category_index
5357relation_category_index += 1
54- print (len (self .relation_category_labels ))
58+ # print(len(self.relation_category_labels))
5559with open (self .base_folder + 'relation_index.pickle' ,'wb' )as f :
5660pickle .dump (self .relation_category_labels ,f )
5761self .two_categories = self .generate_re_two_training_data ()
5862self .multi_categories = self .generate_re_mutli_training_data ()
59- with open (self .data_folder + '/emr_relation.rel' ,'wb' )as f :
60- pickle .dump (self .multi_categories ,f )
63+ if mode == 'train' and directed :
64+ with open (self .data_folder + '/emr_relation.rel' ,'wb' )as f :
65+ pickle .dump (self .multi_categories ,f )
6166self .export_structured_emr ()
6267self .data = self .read_file ()
6368self .export ()
6469self .save_data ()
70+ self .export_type_dict ()
71+ self .export_relations ()
72+
def export_relations(self):
    """Group every true relation by its source file and pickle the result.

    Walks ``self.annotations``; each annotation provides ``'file'``,
    ``'entities'`` (looked up by the relation's ``'first'``/``'second'``
    keys) and ``'true_relations'``.  The pickled object maps a file name to
    a list of ``OrderedDict`` records with the keys ``ent1``, ``ent2``,
    ``ent1_type``, ``ent2_type`` and ``rel_type``, and is written to
    ``self.data_folder + '/emr_test_rel.pickle'``.
    """
    relations_by_file = {}
    for annotation in self.annotations:
        source_file = annotation['file']
        entity_table = annotation['entities']
        for relation in annotation['true_relations'].values():
            head = entity_table[relation['first']]
            tail = entity_table[relation['second']]
            record = OrderedDict([
                ('ent1', head['text']),
                ('ent2', tail['text']),
                ('ent1_type', head['type']),
                ('ent2_type', tail['type']),
                ('rel_type', relation['type']),
            ])
            # A file only gets an entry once it contributes a relation.
            relations_by_file.setdefault(source_file, []).append(record)

    target_path = self.data_folder + '/emr_test_rel.pickle'
    with open(target_path, 'wb') as f:
        pickle.dump(relations_by_file, f)
92+
def export_type_dict(self):
    """Map each entity text to the type it is most frequently annotated with.

    Scans the ``'entities'`` of every item in ``self.annotations`` and
    counts, per entity ``'text'``, how often each ``'type'`` occurs.

    The mapping used to be computed and then dropped; it is now returned so
    callers can actually use it (callers that ignore the result are
    unaffected).

    Returns:
        dict: entity text -> most common type.  When several types are
        equally frequent, the one encountered first wins
        (``Counter.most_common`` ordering).
    """
    type_counts = {}
    for annotation in self.annotations:
        for entity in annotation['entities'].values():
            type_counts.setdefault(entity['text'], Counter())[entity['type']] += 1
    return {text: counts.most_common(1)[0][0] for text, counts in type_counts.items()}
65105
66106def statistics (self ):
67107true_count = 0
@@ -77,7 +117,7 @@ def read_file(self):
77117data = {}
78118for f in self .files :
79119file_data = {'entities' : {},'relations' : {}}
80- with open (self .data_folder + self .mode + '/' + f + '.ann' ,encoding = 'utf-8' )as f :
120+ with open (self .data_folder + self .mode + '/' + f + '.ann' ,encoding = 'utf-8' )as f :
81121entries = [l .split ('\t ' )for l in f .read ().splitlines ()if l ]
82122for entry in entries :
83123idx = entry [0 ]
@@ -117,7 +157,7 @@ def export(self):
117157attribute [self .relation_categories [rt ]]= e2 ['text' ]
118158else :
119159if type (attribute [self .relation_categories [rt ]])== str :
120- attribute [self .relation_categories [rt ]]= [attribute [self .relation_categories [rt ]],e2 ['text' ]]
160+ attribute [self .relation_categories [rt ]]= [attribute [self .relation_categories [rt ]],e2 ['text' ]]
121161else :
122162attribute [self .relation_categories [rt ]].append (e2 ['text' ])
123163if not result .get (e_type ):
@@ -137,7 +177,7 @@ def export(self):
137177# entity['attributes'] = attribute
138178# result.append(entity)
139179with open (self .base_folder + 'structured/' + filename + '.json' ,'w' ,encoding = 'utf-8' )as f :
140- f .write (pprint .pformat (new_result ,width = 100 ).replace ('\' ' ,'"' ))
180+ f .write (pprint .pformat (new_result ,width = 100 ).replace ('\' ' ,'"' ))
141181# json.dump(new_result, f, ensure_ascii=False)
142182
143183def export_structured_emr (self ):
@@ -160,7 +200,7 @@ def export_structured_emr(self):
160200for r in true_relations :
161201if relations .get (r ['first' ]):
162202relations [r ['first' ]].append ((r ['second' ],r ['type' ]))
163- print (relations )
203+ # print(relations)
164204else :
165205relations [r ['first' ]]= [(r ['second' ],r ['type' ])]
166206for e in entities :
@@ -206,7 +246,8 @@ def generate_re_mutli_training_data(self):
206246word_indices = self .map_to_indices (annotation ['words' ])
207247for true_rel_name in annotation ['true_relations' ]:
208248true_rel = annotation ['true_relations' ][true_rel_name ]
209- train_data .append ({'words' :word_indices ,'primary' :true_rel ['primary' ],'secondary' :true_rel ['secondary' ],'type' :self .relation_category_labels [true_rel ['type' ]]})
249+ train_data .append ({'words' :word_indices ,'primary' :true_rel ['primary' ],'secondary' :true_rel ['secondary' ],
250+ 'type' :self .relation_category_labels [true_rel ['type' ]]})
210251return train_data
211252
212253def map_to_indices (self ,words ):
@@ -239,7 +280,7 @@ def read_dictionary(self, reverse=False):
239280
def get_files(self):
    """Return the distinct extension-less names found in the mode folder.

    Lists ``self.data_folder + self.mode + '/'`` and strips the extension
    from every entry, so entries that differ only by extension collapse
    into a single name.
    """
    mode_dir = self.data_folder + self.mode + '/'
    return {os.path.splitext(entry)[0] for entry in os.listdir(mode_dir)}
@@ -278,16 +319,68 @@ def read_annotations(self, directed=False):
278319print ('fuck your world' )
279320sentence ['new_entities' ][entity ['index' ]]= entity
280321
281- data = self .read_relation_in_single_file (filename + '.ann' ,sentence_dict ,directed )
322+ # data = self.read_relation_in_single_file(filename + '.ann', sentence_dict, directed)
323+ data = self .read_relation_in_single_file_permutation (filename + '.ann' ,sentence_dict ,directed )
282324all_sentences .extend (data .values ())
283325return all_sentences
284326
def read_relation_in_single_file_permutation(self, ann_file, data, directed=False):
    """Attach true relations and permutation-based negative pairs to sentences.

    Args:
        ann_file: path of the annotation file.  Lines whose first field
            starts with ``'R'`` are read as
            ``<id> <type> <role>:<entity id> <role>:<entity id>``.
        data: dict of sentence records, modified in place.  Each record
            needs ``'entities'`` (entity id -> dict with ``'index'`` and
            ``'type'``) and ``'words'``.
        directed: accepted for signature parity with
            ``read_relation_in_single_file``; not read by this method.

    Returns:
        The same ``data`` dict.  Sentences that received no relation are
        removed; the others gain ``'true_relations'`` (relation id -> dict)
        and ``'false_relations'`` (``'<p>-<s>'`` -> primary/secondary
        indices) holding every ordered pair of entities inside one
        comma-delimited clause that is not a true relation.
    """
    with open(ann_file, encoding='utf-8') as f:
        lines = f.read().replace('\t', ' ').splitlines()

    for line in lines:
        entry = line.strip().split(' ')
        idx = entry[0]
        if not idx.startswith('R'):
            continue
        # Argument fields look like "<role>:<entity id>"; keep the id part.
        primary = entry[2][entry[2].find(':') + 1:]
        secondary = entry[3][entry[3].find(':') + 1:]
        for sentence in data.values():
            entities = sentence['entities']
            if primary not in entities or secondary not in entities:
                continue
            rel = {'id': idx, 'primary': entities[primary]['index'],
                   'secondary': entities[secondary]['index'],
                   'type': entry[1], 'first': primary, 'second': secondary}
            if sentence.get('true_relations'):
                sentence['true_relations'][idx] = rel
            else:
                sentence['true_relations'] = {idx: rel}

    for sentence in data.values():
        if not sentence.get('true_relations'):
            print('sentence no relations')
            continue

        words = sentence['words']
        # A set gives O(1) membership checks while filtering candidates.
        true_pairs = {(r['primary'], r['secondary'])
                      for r in sentence['true_relations'].values()}
        # Distinct entity positions, in first-seen order.
        positions = list(dict.fromkeys(e['index'] for e in sentence['entities'].values()))

        # NOTE(review): both members of this tuple read as the same comma
        # here; one was presumably a full-width comma - confirm against the
        # original file.
        comma_index = [i for i, w in enumerate(words) if w in (',', ',')]
        # Clause boundaries: a sentinel before the first word, every comma,
        # and the end of the sentence.  A comma position is always smaller
        # than len(words), so the end sentinel is always needed.
        boundaries = [-1] + comma_index + [len(words)]

        false_relations = {}
        for start, end in zip(boundaries[:-1], boundaries[1:]):
            clause_entities = [i for i in positions if start < i < end]
            for p, s in permutations(clause_entities, 2):
                if (p, s) not in true_pairs:
                    false_relations[str(p) + '-' + str(s)] = {'primary': p, 'secondary': s}
        sentence['false_relations'] = false_relations

    # Collect the keys first: the dict cannot shrink while being iterated.
    for key in [k for k, s in data.items() if not s.get('true_relations')]:
        del data[key]
    return data
377+
285378def read_relation_in_single_file (self ,ann_file ,data ,directed = False ):
286379with open (ann_file ,encoding = 'utf-8' )as f :
287380entries = map (lambda l :l .strip ().split (' ' ),f .read ().replace ('\t ' ,' ' ).splitlines ())
288381for entry in entries :
289- id = entry [0 ]
290- if id .startswith ('R' ):
382+ idx = entry [0 ]
383+ if idx .startswith ('R' ):
291384primary = entry [2 ][entry [2 ].find (':' )+ 1 :]
292385# print(primary)
293386secondary = entry [3 ][entry [3 ].find (':' )+ 1 :]
@@ -296,12 +389,12 @@ def read_relation_in_single_file(self, ann_file, data, directed=False):
296389entities = sentence ['entities' ]
297390# sentence['true_relations'] = {}
298391if primary in entities and secondary in entities :
299- rel = {'id' :id ,'primary' :entities [primary ]['index' ],'secondary' :entities [secondary ]['index' ],
392+ rel = {'id' :idx ,'primary' :entities [primary ]['index' ],'secondary' :entities [secondary ]['index' ],
300393'type' :entry [1 ],'first' :primary ,'second' :secondary }
301- if sentence .get ('true_realtions ' ):
302- sentence ['true_relations' ][id ]= rel
394+ if sentence .get ('true_relations ' ):
395+ sentence ['true_relations' ][idx ]= rel
303396else :
304- sentence ['true_relations' ]= {id :rel }
397+ sentence ['true_relations' ]= {idx :rel }
305398
306399for sentence_text in data :
307400sentence = data [sentence_text ]