# -*- coding:utf-8 -*-
import numpy as np
import pickle
from itertools import accumulate, permutations
from dnlp.config.sequence_labeling_config import DnnCrfConfig
from dnlp.core.dnn_crf import DnnCrf
from dnlp.core.re_cnn import RECNN
from dnlp.config.re_config import RECNNConfig
from dnlp.utils.constant import UNK, BATCH_PAD


def read_dictionary(dict_path: str, reverse=False):
  dictionary = {}
  with open(dict_path, encoding='utf8') as d:
    items = d.readlines()
    for item in items:
      pair = item.split(' ')
      dictionary[pair[0]] = int(pair[1])
  if reverse:
    return dictionary, dict(zip(dictionary.values(), dictionary.keys()))
  else:
    return dictionary
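
# Note: read_dictionary assumes the dictionary file holds one space-separated
# "token index" pair per line (e.g. "患者 7", illustrative only); this layout is
# inferred from how the lines are parsed above, not documented elsewhere.
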
BASE_FOLDER = '../dnlp/data/emr/'
DICT_PATH = BASE_FOLDER + 'emr_merged_word_dict.utf8'
DICTIONARY = read_dictionary(DICT_PATH)
with open(BASE_FOLDER + 'rel_names', 'rb') as f:
  REL_PAIR_NAMES = pickle.load(f)
REL_PAIR_NAMES = dict(zip(REL_PAIR_NAMES.values(), REL_PAIR_NAMES.keys()))
for rel_name in REL_PAIR_NAMES:
  REL_PAIR_NAMES[rel_name] = REL_PAIR_NAMES[rel_name].split(':')

REL_NAMES = {'PartOf': '部位', 'PropertyOf': '性质', 'DegreeOf': '程度', 'QualityValue': '定性值',
             'QuantityValue': '定量值', 'UnitOf': '单位', 'TimeOf': '持续时间', 'StartTime': '开始时间',
             'EndTime': '结束时间', 'Moment': '时间点', 'DateOf': '日期', 'ResultOf': '结果',
             'LocationOf': '地点', 'DiseaseTypeOf': '疾病分型分期', 'SpecOf': '规格', 'UsageOf': '用法',
             'DoseOf': '用量', 'FamilyOf': '家族成员', 'ModifierOf': '其他修饰词', 'UseMedicine': '用药',
             'LeadTo': '导致', 'Find': '发现', 'Confirm': '证实', 'Adopt': '采取', 'Take': '用药',
             'Limit': '限定', 'AlongWith': '伴随', 'Complement': '补足'}
REL_NAME_LIST = list(REL_NAMES.keys())
ENTITY_NAMES = {'Sign': '体征', 'Symptom': '症状', 'Part': '部位', 'Property': '属性', 'Degree': '程度',
                'Quality': '定性值', 'Quantity': '定量值', 'Unit': '单位', 'Time': '时间', 'Date': '日期',
                'Result': '结果', 'Disease': '疾病', 'DiseaseType': '疾病类型', 'Examination': '检查',
                'Location': '地址', 'Medicine': '药物', 'Spec': '规格', 'Usage': '用法', 'Dose': '用量',
                'Treatment': '治疗', 'Family': '家族史', 'Modifier': '修饰词'}

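
# Overall flow (as implemented below): cws() segments a sentence and ner() tags
# its named entities; prepare_rel() turns every ordered entity pair into a
# candidate with word ids and relative-position features; a binary RECNN model
# filters the candidates that carry a relation, and a 28-class RECNN model
# assigns the relation type, which get_rel_result() maps back to readable names.
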
def ner(sentence):
  data_path = ''
  model_path = '../dnlp/models/emr/ner-lstm-50.ckpt'
  config = DnnCrfConfig(skip_left=1, skip_right=1)
  lstmcrf = DnnCrf(config=config, task='ner', model_path=model_path, mode='predict', data_path=data_path, nn='lstm',
                   remark='lstm')
  return lstmcrf.predict_ll(sentence)

def cws(sentence):
  config = DnnCrfConfig(skip_left=1, skip_right=1)
  model_path = '../dnlp/models/emr/cws-lstm-emr_cws-50.ckpt'
  dnncrf = DnnCrf(config=config, model_path=model_path, mode='predict', nn='lstm', task='cws', remark='emr_cws')
  return dnncrf.predict_ll(sentence)

def prepare_rel(sentence, batch_length=85):
  cws_res = cws(sentence)
  ner_res = ner(sentence)
  lengths = list(accumulate([len(w) for w in cws_res]))  # cumulative character offsets of the words (currently unused)
  ne_candidates = []
  words = list(map(lambda w: DICTIONARY[w] if w in DICTIONARY else DICTIONARY[UNK], cws_res))
  if len(words) < batch_length:
    words += [DICTIONARY[BATCH_PAD]] * (batch_length - len(words))
  else:
    words = words[:batch_length]
  for ne, s in ner_res:
    # list.index raises ValueError instead of returning -1, so test membership first.
    if ne in cws_res:
      ne_candidates.append(cws_res.index(ne))
    else:
      print('named entity {0} not found in segmentation result'.format(ne))
  rel_candidates = list(permutations(ne_candidates, 2))
  primary, secondary = generate_rel(rel_candidates, batch_length)
  rel_count = len(rel_candidates)
  return np.array([words] * rel_count), primary, secondary, [cws_res] * rel_count, rel_candidates

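# generate_rel() builds the relative-position features passed to the RECNN models:
# position j gets the value j - entity_index + batch_length - 1, so the entity's
# own position always maps to batch_length - 1 and every value stays within
# [0, 2 * batch_length - 2]. For example, with batch_length = 5 and an entity at
# index 2, np.arange(5) - 2 + 4 gives [2, 3, 4, 5, 6].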
def generate_rel(rel_candidates, batch_length):
  primary = []
  secondary = []
  for f, s in rel_candidates:
    primary.append(np.arange(batch_length) - f + batch_length - 1)
    secondary.append(np.arange(batch_length) - s + batch_length - 1)
  return np.array(primary), np.array(secondary)


def rel_extract(sentences):
  words = []
  rel_pairs = []
  sentence_words = []
  primary = []
  secondary = []
  for sentence in sentences:
    w, p, s, ww, pp = prepare_rel(sentence)
    sentence_words.extend(w)
    primary.extend(p)
    secondary.extend(s)
    words.extend(ww)
    rel_pairs.extend(pp)
  config_two = RECNNConfig(window_size=(2, 3, 4))
  config_multi = RECNNConfig(window_size=(2, 3, 4))
  model_path_two = '../dnlp/models/re_two/50-2_3_4_directed.ckpt'
  model_path_multi = '../dnlp/models/re_multi/50-2_3_4_directed.ckpt'
  recnn2 = RECNN(config=config_two, dict_path=DICT_PATH, mode='test', model_path=model_path_two, relation_count=2,
                 data_mode='test')
  recnn = RECNN(config=config_multi, dict_path=DICT_PATH, mode='test', model_path=model_path_multi, relation_count=28,
                data_mode='test')
  # The binary model is assumed to return one label per candidate, with a nonzero
  # label meaning "this pair holds a relation"; keep only those candidates.
  two_res = recnn2.predict(sentence_words, primary, secondary)
  true_words = [w for w, r in zip(words, two_res) if r]
  true_rel_pairs = [p for p, r in zip(rel_pairs, two_res) if r]
  true_sentence_words = [sw for sw, r in zip(sentence_words, two_res) if r]
  true_primary = [p for p, r in zip(primary, two_res) if r]
  true_secondary = [s for s, r in zip(secondary, two_res) if r]
  multi_res = recnn.predict(true_sentence_words, true_primary, true_secondary)
  return get_rel_result(true_words, true_rel_pairs, multi_res)

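# get_rel_result() maps each predicted relation back to surface forms: the word
# indices in rel_pairs select the primary/secondary words, REL_NAME_LIST decodes
# the predicted relation id, and REL_PAIR_NAMES / ENTITY_NAMES supply the Chinese
# display names. The returned layout below is an assumption; the original code
# left the result assembly unfinished.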
def get_rel_result(words, rel_pairs, rel_types):
  result = {}
  for sentence_words, (primary_idx, secondary_idx), rel_type in zip(words, rel_pairs, rel_types):
    rel_type_name = REL_NAME_LIST[rel_type]
    primary = sentence_words[primary_idx]
    secondary = sentence_words[secondary_idx]
    primary_type, secondary_type = REL_PAIR_NAMES[rel_type_name]
    primary_type = ENTITY_NAMES[primary_type]
    secondary_type = ENTITY_NAMES[secondary_type]
    # Assumed layout: relation display name -> list of (primary word, primary type, secondary word, secondary type).
    result.setdefault(REL_NAMES[rel_type_name], []).append((primary, primary_type, secondary, secondary_type))
  return result


def export():
  pass


def get_sentences(filename):
  with open('../dnlp/data/emr/emr_paper/train/' + filename, encoding='utf-8') as f:
    sentences = [l + '。' for l in f.read().split('。')]
    if sentences[-1] == '。':
      sentences = sentences[:-1]
    else:
      sentences[-1] = sentences[-1][:-1]
    return sentences
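
# For example, a file containing '腹痛3天。呕吐1小时。' yields
# ['腹痛3天。', '呕吐1小时。']; when the text has no trailing '。', the else branch
# strips the extra delimiter that was appended to the last fragment.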


if __name__ == '__main__':
  sentences = get_sentences('996716_admission.txt')
  rel_extract(sentences)