33import re
44import pickle
55import random
6+ import json
7+ import pprint
8+ from collections import OrderedDict
9+ from itertools import chain
610from dnlp .utils .constant import UNK
711
812RE_SAPCE = re .compile ('[ ]+' )
913
10-
14+ # print(pprint.pformat([1,[[2]],3,4444444444,77777777777777777777777777],indent=2,width=10))
1115class ProcessEMR (object ):
1216def __init__ (self ,base_folder :str ,dict_path :str = '' ,mode = 'train' ,directed = False ):
1317self .base_folder = base_folder
@@ -35,6 +39,13 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
3539'DoseOf' :'用量' ,'FamilyOf' :'家族成员' ,'ModifierOf' :'其他修饰词' ,'UseMedicine' :'用药' ,
3640'LeadTo' :'导致' ,'Find' :'发现' ,'Confirm' :'证实' ,'Adopt' :'采取' ,'Take' :'用药' ,
3741'Limit' :'限定' ,'AlongWith' :'伴随' ,'Complement' :'补足' }
42+ self .entity_categories = {'Sign' :'体征' ,'Symptom' :'症状' ,'Part' :'部位' ,'Property' :'属性' ,'Degree' :'程度' ,
43+ 'Quality' :'定性值' ,'Quantity' :'定量值' ,'Unit' :'单位' ,'Time' :'时间' ,'Date' :'日期' ,
44+ 'Result' :'结果' ,
45+ 'Disease' :'疾病' ,'DiseaseType' :'疾病类型' ,'Examination' :'检查' ,'Location' :'地址' ,
46+ 'Medicine' :'药物' ,'Spec' :'规格' ,'Usage' :'用法' ,'Dose' :'用量' ,'Treatment' :'治疗' ,
47+ 'Family' :'家族史' ,
48+ 'Modifier' :'修饰词' }
3849self .relation_category_labels = {}
3950relation_category_index = 0
4051for relation_category in self .relation_categories :
@@ -45,6 +56,9 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
4556pickle .dump (self .relation_category_labels ,f )
4657self .two_categories = self .generate_re_two_training_data ()
4758self .multi_categories = self .generate_re_mutli_training_data ()
59+ self .export_structured_emr ()
60+ self .data = self .read_file ()
61+ self .export ()
4862self .save_data ()
4963
5064def statistics (self ):
@@ -57,6 +71,114 @@ def statistics(self):
5771print (false_count / all_count )
5872print (all_count )
5973
def read_file(self):
    """Parse the ``.ann`` annotation file of every document in ``self.files``.

    Each non-empty line is split on tabs. Lines whose id starts with ``T``
    are entities (``<id>\\t<type> <span...>\\t<text>``); lines whose id starts
    with ``R`` are relations (``<id>\\t<type> <role>:<id1> <role>:<id2>``).
    Every other line is ignored.

    Returns:
        dict: maps each file identifier from ``self.files`` to
        ``{'entities': {...}, 'relations': {...}}`` where

        * ``entities`` maps an entity id to ``{'text': ..., 'type': ...}``,
          the type being translated through ``self.entity_categories``;
        * ``relations`` maps the id of a relation's first argument to a
          list of ``(second_argument_id, relation_type)`` tuples.

    Raises:
        KeyError: if an entity type is missing from ``self.entity_categories``.
        OSError: if an annotation file cannot be opened.
    """
    data = {}
    for file_id in self.files:
        entities = {}
        relations = {}
        # NOTE(review): the sub-folder is hard-coded to 'train/' here while
        # read_annotations builds its path from self.mode - confirm intended.
        ann_path = self.data_folder + 'train/' + file_id + '.ann'
        with open(ann_path, encoding='utf-8') as ann_file:
            for line in ann_file.read().splitlines():
                if not line:
                    continue
                entry = line.split('\t')
                idx = entry[0]
                if idx.startswith('T'):
                    # Only the category is needed; the span offsets that
                    # follow it (possibly discontinuous) are not used here.
                    category = entry[1].split(' ', 1)[0]
                    entities[idx] = {'text': entry[2],
                                     'type': self.entity_categories[category]}
                elif idx.startswith('R'):
                    r_type, first_arg, second_arg = entry[1].split(' ')
                    # Drop the role prefix before ':' and keep the entity id.
                    first = first_arg[first_arg.index(':') + 1:]
                    second = second_arg[second_arg.index(':') + 1:]
                    relations.setdefault(first, []).append((second, r_type))
        # Key by the file identifier. The with-statement above must not
        # rebind the loop variable, otherwise the key silently becomes a
        # closed file handle.
        data[file_id] = {'entities': entities, 'relations': relations}
    return data


def export(self):
    """Write one JSON file per document in ``self.data``.

    Only entities that are the first argument of at least one relation are
    exported. The output groups them by entity type::

        {entity_type: [{entity_text: {relation_label: text_or_list}}, ...]}

    where ``relation_label`` is looked up in ``self.relation_categories``.
    A label with a single related entity maps to that entity's text; a label
    with several maps to the list of their texts. Files are written to
    ``<base_folder>structured/<name>.json`` (the folder is created when
    missing), as UTF-8 JSON with sorted keys.

    Raises:
        KeyError: if a relation type is missing from
            ``self.relation_categories`` or a relation points at an unknown
            entity id.
    """
    out_dir = self.base_folder + 'structured/'
    if self.data:
        os.makedirs(out_dir, exist_ok=True)

    for key, file_data in self.data.items():
        if isinstance(key, str):
            stem = os.path.basename(key)
        else:
            # Backward compatibility: data keyed by the file handle of
            # '<...>.ann' - strip the 4-character extension from its name.
            stem = os.path.basename(key.name[:-4])

        entities = file_data['entities']
        relations = file_data['relations']
        grouped = {}
        for e_id, entity in entities.items():
            if e_id not in relations:
                continue
            texts_by_label = {}
            for target_id, rel_type in relations[e_id]:
                label = self.relation_categories[rel_type]
                texts_by_label.setdefault(label, []).append(
                    entities[target_id]['text'])
            # A single value is stored bare, several values as a list.
            attribute = {label: texts[0] if len(texts) == 1 else texts
                         for label, texts in texts_by_label.items()}
            grouped.setdefault(entity['type'], []).append(
                {entity['text']: attribute})

        with open(out_dir + stem + '.json', 'w', encoding='utf-8') as out:
            # json.dump guarantees valid JSON even when texts contain quotes.
            json.dump(grouped, out, ensure_ascii=False, indent=2,
                      sort_keys=True)
140+
def export_structured_emr(self):
    """Group sentence annotations by source file and build a structured view.

    For every sentence in ``self.annotations`` a ``'start'`` key is set to
    the smallest entity start offset in that sentence, then the sentences
    are grouped by their ``'file'`` value.

    For each file a flat, start-ordered list of entity records is then
    assembled; an entity that is the ``'first'`` side of a relation gets an
    ``'attributes'`` list describing the related entities. Writing this list
    to disk is currently disabled, so it is built but not persisted.

    Returns:
        dict: maps each file name to the list of its sentence dicts (the
        same objects as in ``self.annotations``, in encounter order).

    Raises:
        ValueError: if a sentence has no entities.
        KeyError: if a relation refers to an unknown entity id or to a type
            missing from ``self.relation_categories``.
    """
    annotations = {}
    for sentence in self.annotations:
        sentence['start'] = min(
            e['start'] for e in sentence['entities'].values())
        annotations.setdefault(sentence['file'], []).append(sentence)

    structured = {}
    for file_name, sentences in annotations.items():
        entities = sorted(
            chain.from_iterable(s['entities'].values() for s in sentences),
            key=lambda e: e['start'])
        entities_by_id = {e['id']: e for e in entities}

        # first-entity id -> [(second-entity id, relation type), ...]
        relations = {}
        for sentence in sentences:
            # A sentence without relations may carry no 'true_relations' key.
            for rel in sentence.get('true_relations', {}).values():
                relations.setdefault(rel['first'], []).append(
                    (rel['second'], rel['type']))

        records = []
        for e in entities:
            record = {'text': e['text'], 'start': e['start'],
                      'length': e['length'], 'type': e['type']}
            if e['id'] in relations:
                attributes = []
                for other_id, rel_type in relations[e['id']]:
                    other = entities_by_id[other_id]
                    attributes.append({
                        'name': self.relation_categories[rel_type],
                        'text': other['text'],
                        'start': other['start'],
                        'length': other['length'],
                        'type': other['type'],
                    })
                record['attributes'] = attributes
            records.append(record)
        # Persistence to '<base_folder>structured/<file>.json' is disabled;
        # the records are kept only for the duration of this call.
        structured[file_name] = records

    return annotations
181+
60182def generate_re_two_training_data (self ):
61183train_data = []
62184for annotation in self .annotations :
@@ -125,7 +247,7 @@ def read_annotations(self, directed=False):
125247all_sentences = []
126248for file in self .files :
127249filename = self .data_folder + self .mode + '/' + file
128- sentence_dict ,periods = self .read_entities_in_single_file (filename + '.txt' ,filename + '.ann' )
250+ sentence_dict ,periods = self .read_entities_in_single_file (filename + '.txt' ,filename + '.ann' , filename )
129251
130252sentence_words = self .read_cws_file (self .data_folder + 'cws/' + file + '.cws' ,periods )
131253sentences = ['' .join (s )for s in sentence_words ]
@@ -174,7 +296,7 @@ def read_relation_in_single_file(self, ann_file, data, directed=False):
174296# sentence['true_relations'] = {}
175297if primary in entities and secondary in entities :
176298rel = {'id' :id ,'primary' :entities [primary ]['index' ],'secondary' :entities [secondary ]['index' ],
177- 'type' :entry [1 ]}
299+ 'type' :entry [1 ], 'first' : primary , 'second' : secondary }
178300if sentence .get ('true_realtions' ):
179301sentence ['true_relations' ][id ]= rel
180302else :
@@ -245,7 +367,8 @@ def read_cws_file(self, cws_file, periods):
245367
246368return sentence_words
247369
248- def read_entities_in_single_file (self ,raw_file ,ann_file ):
370+ def read_entities_in_single_file (self ,raw_file ,ann_file ,common_name ):
371+ common_name = os .path .basename (common_name )
249372data = {}
250373with open (raw_file ,encoding = 'utf-8' )as r :
251374sentence = r .read ()
@@ -323,6 +446,7 @@ def read_entities_in_single_file(self, raw_file, ann_file):
323446entities [id ]= entity
324447else :
325448sentence_dict [new_sentence ]['entities' ]= {id :entity }
449+ sentence_dict [new_sentence ]['file' ]= common_name
326450break
327451else :
328452entity = {'id' :id ,'start' :start ,'length' :end - start ,'text' :text ,'type' :entry [1 ]}