Commit d380f9

add some codes
1 parent c8b87f6 commit d380f9

8 files changed: +184 -26 lines changed


python/dnlp/core/cbow.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

python/dnlp/core/re_cnn.py

Lines changed: 10 additions & 15 deletions

@@ -6,11 +6,10 @@
 from dnlp.config import RECNNConfig


-
 class RECNN(RECNNBase):
   def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str = '', mode: str = 'train',
                data_path: str = '', relation_count: int = 2, model_path: str = '', embedding_path: str = '',
-               remark: str = ''):
+               remark: str = ''):
     tf.reset_default_graph()
     RECNNBase.__init__(self, config, dict_path)
     self.dtype = dtype
@@ -44,8 +43,8 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str
     self.full_connected_weight = self.__weight_variable([self.filter_size * len(self.window_size), self.relation_count],
                                                         name='full_connected_weight')
     self.full_connected_bias = self.__weight_variable([self.relation_count], name='full_connected_bias')
-    self.input_words_lookup = tf.nn.embedding_lookup(self.input_words, self.input_indices)
-    self.input_primary_lookup = tf.nn.embedding_lookup(self.input_primary, self.input_indices)
+    self.input_words_lookup = tf.nn.embedding_lookup(self.input_words, self.input_indices)
+    self.input_primary_lookup = tf.nn.embedding_lookup(self.input_primary, self.input_indices)
     self.input_secondary_lookup = tf.nn.embedding_lookup(self.input_secondary, self.input_indices)
     self.input_labels_lookup = tf.nn.embedding_lookup(self.input_labels, self.input_indices)
     self.position_lookup = tf.nn.embedding_lookup(self.position_embedding, self.input_position)
@@ -59,8 +58,6 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str
     self.emebd_concat = tf.expand_dims(
       tf.concat([self.character_embed_holder, self.primary_embed_holder, self.secondary_embed_holder], 2), 3)

-
-
     if self.mode == 'train':
       self.start = 0
       self.hidden_layer = tf.layers.dropout(self.get_hidden(), self.dropout_rate)
@@ -126,14 +123,14 @@ def fit(self, epochs=50, interval=5):
       print('epoch:' + str(i))
       for j in range(self.data_count // self.batch_size):
         if start + self.batch_size < self.data_count:
-          indices = list(range(start, start + self.batch_size))
+          indices = list(range(start, start + self.batch_size))
           start += self.batch_size
         else:
-          new_start = self.batch_size - self.data_count + start
-          indices = list(range(start, self.data_count)) + list(range(0, new_start))
+          new_start = self.batch_size - self.data_count + start
+          indices = list(range(start, self.data_count)) + list(range(0, new_start))
           start = new_start
-        words, primary, secondary, labels = sess.run([self.input_words, self.input_primary, self.input_secondary,
-                                                      self.input_labels], feed_dict={self.input_indices: indices})
+        words, primary, secondary, labels = sess.run([self.input_words, self.input_primary, self.input_secondary,
+                                                      self.input_labels], feed_dict={self.input_indices: indices})
         # words, primary, secondary, labels = self.load_batch()
         character_embeds, primary_embeds = sess.run([self.character_lookup, self.position_lookup],
                                                     feed_dict={self.input_characters: words,
@@ -147,10 +144,10 @@ def fit(self, epochs=50, interval=5):
         if i % interval == 0:
           if self.relation_count == 2:
             model_name = '../dnlp/models/re_{2}/{0}-{1}{3}.ckpt'.format(i, '_'.join(map(str, self.window_size)),
-                                                                        'two', self.remark)
+                                                                        'two', self.remark)
           else:
             model_name = '../dnlp/models/re_{2}/{0}-{1}{3}.ckpt'.format(i, '_'.join(map(str, self.window_size)),
-                                                                        'multi', self.remark)
+                                                                        'multi', self.remark)

           self.saver.save(sess, model_name)
@@ -210,8 +207,6 @@ def load_batch(self):
       self.start = new_start
     return words, primary, secondary, labels

-
-
   def __weight_variable(self, shape, name):
     initial = tf.truncated_normal(shape, stddev=0.1, dtype=self.dtype)
     return tf.Variable(initial, name=name)
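
The wrap-around logic in fit() keeps every batch at exactly batch_size examples by cycling past the end of the dataset back to index 0. A minimal standalone sketch of that indexing scheme (the function and variable names here are illustrative, not from the repo):

def next_batch_indices(start, batch_size, data_count):
  """Return one batch of indices plus the next start offset, wrapping at the end."""
  if start + batch_size < data_count:
    return list(range(start, start + batch_size)), start + batch_size
  # Wrap around: take the tail of the data plus a slice from the beginning.
  new_start = batch_size - data_count + start
  return list(range(start, data_count)) + list(range(0, new_start)), new_start

start = 0
for _ in range(3):  # e.g. data_count=10, batch_size=4
  indices, start = next_batch_indices(start, 4, 10)
  print(indices)  # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9, 0, 1]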

python/dnlp/data_process/process_brat.py

Lines changed: 1 addition & 1 deletion

@@ -3,4 +3,4 @@

 class ProcessBrat(object):
   def __init__(self):
-    pass
+    pass

python/dnlp/data_process/process_conll.py

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+# -*- coding:utf-8 -*-
+from dnlp.data_process.processor import Preprocessor
+class ProcessConll(Preprocessor):
+  def __init__(self, *, files: tuple, name: str, base_folder: str = 'dnlp/data/', dict_path: str = ''):
+    if dict_path:
+      Preprocessor.__init__(self, base_folder=base_folder, dict_path=dict_path)
+    else:
+      Preprocessor.__init__(self, base_folder=base_folder, files=files, dict_path=base_folder + name + '.utf8')
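
ProcessConll itself only dispatches between two Preprocessor constructors: reuse an existing dictionary when dict_path is given, otherwise build one from the input files and write it to base_folder + name + '.utf8'. A hedged usage sketch — the module path assumes the new file is process_conll.py, and the file and folder names are hypothetical; only the keyword signature comes from the diff above:

from dnlp.data_process.process_conll import ProcessConll

# Build a dictionary from the CoNLL files themselves (the else-branch above);
# it is written to base_folder + name + '.utf8'.
ProcessConll(files=('conll_train.txt',), name='conll', base_folder='dnlp/data/conll/')

# Reuse a previously built dictionary instead (the if-branch above).
ProcessConll(files=(), name='conll', base_folder='dnlp/data/conll/',
             dict_path='dnlp/data/conll/conll.utf8')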

python/dnlp/data_process/process_emr.py

Lines changed: 128 additions & 4 deletions

@@ -3,11 +3,15 @@
 import re
 import pickle
 import random
+import json
+import pprint
+from collections import OrderedDict
+from itertools import chain
 from dnlp.utils.constant import UNK

 RE_SAPCE = re.compile('[ ]+')

-
+# print(pprint.pformat([1,[[2]],3,4444444444,77777777777777777777777777],indent=2,width=10))
 class ProcessEMR(object):
   def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed=False):
     self.base_folder = base_folder
@@ -35,6 +39,13 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
                                 'DoseOf': '用量', 'FamilyOf': '家族成员', 'ModifierOf': '其他修饰词', 'UseMedicine': '用药',
                                 'LeadTo': '导致', 'Find': '发现', 'Confirm': '证实', 'Adopt': '采取', 'Take': '用药',
                                 'Limit': '限定', 'AlongWith': '伴随', 'Complement': '补足'}
+    self.entity_categories = {'Sign': '体征', 'Symptom': '症状', 'Part': '部位', 'Property': '属性', 'Degree': '程度',
+                              'Quality': '定性值', 'Quantity': '定量值', 'Unit': '单位', 'Time': '时间', 'Date': '日期',
+                              'Result': '结果',
+                              'Disease': '疾病', 'DiseaseType': '疾病类型', 'Examination': '检查', 'Location': '地址',
+                              'Medicine': '药物', 'Spec': '规格', 'Usage': '用法', 'Dose': '用量', 'Treatment': '治疗',
+                              'Family': '家族史',
+                              'Modifier': '修饰词'}
     self.relation_category_labels = {}
     relation_category_index = 0
     for relation_category in self.relation_categories:
@@ -45,6 +56,9 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
       pickle.dump(self.relation_category_labels, f)
     self.two_categories = self.generate_re_two_training_data()
     self.multi_categories = self.generate_re_mutli_training_data()
+    self.export_structured_emr()
+    self.data = self.read_file()
+    self.export()
     self.save_data()

   def statistics(self):
@@ -57,6 +71,114 @@ def statistics(self):
     print(false_count / all_count)
     print(all_count)

+  def read_file(self):
+    data = {}
+    for f in self.files:
+      file_data = {'entities': {}, 'relations': {}}
+      with open(self.data_folder + 'train/' + f + '.ann', encoding='utf-8') as f:
+        entries = [l.split('\t') for l in f.read().splitlines() if l]
+        for entry in entries:
+          idx = entry[0]
+          if idx.startswith('T'):
+            e_type, start, end = entry[1].split(' ')
+            e_type = self.entity_categories[e_type]
+            start = int(start)
+            end = int(end)
+            file_data['entities'][idx] = {'text': entry[2], 'type': e_type}
+          elif idx.startswith('R'):
+            r_type, r1, r2 = entry[1].split(' ')
+            r1 = r1[r1.index(':') + 1:]
+            r2 = r2[r2.index(':') + 1:]
+            if r1 not in file_data['relations']:
+              file_data['relations'][r1] = [(r2, r_type)]
+            else:
+              file_data['relations'][r1].append((r2, r_type))
+      data[f] = file_data
+    return data
+
+  def export(self):
+    for filename, file_data in self.data.items():
+      filename = os.path.basename(filename.name[:-4])
+      result = {}
+      entities = file_data['entities'].copy()
+      relations = file_data['relations']
+      for e_id, entity in entities.items():
+        e_type = entity['type']
+        # e = entity['text']
+        if e_id in relations:
+          attribute = {}
+          for r2, rt in relations[e_id]:
+            e2 = file_data['entities'][r2].copy()
+            # e2['name'] = self.relation_categories[rt]
+            # attribute.append(e2)
+            if not attribute.get(self.relation_categories[rt]):
+              attribute[self.relation_categories[rt]] = e2['text']
+            else:
+              if type(attribute[self.relation_categories[rt]]) == str:
+                attribute[self.relation_categories[rt]] = [attribute[self.relation_categories[rt]], e2['text']]
+              else:
+                attribute[self.relation_categories[rt]].append(e2['text'])
+          if not result.get(e_type):
+            result[e_type] = [{entity['text']: attribute}]
+          else:
+            result[e_type].append({entity['text']: attribute})
+        else:
+          if not result.get(e_type):
+            result[e_type] = [entity['text']]
+          else:
+            result[e_type].append(entity['text'])
+      new_result = {}
+      for k, v in result.items():
+        nv = [val for val in v if type(val) != str]
+        if nv:
+          new_result[k] = nv
+      # entity['attributes'] = attribute
+      # result.append(entity)
+      with open(self.base_folder + 'structured/' + filename + '.json', 'w', encoding='utf-8') as f:
+        f.write(pprint.pformat(new_result, width=100).replace('\'', '"'))
+      # json.dump(new_result, f, ensure_ascii=False)
+
+  def export_structured_emr(self):
+    annotations = {}
+    for sentence in self.annotations:
+      sentence['start'] = min([e['start'] for e in sentence['entities'].values()])
+      if sentence['file'] not in annotations:
+        annotations[sentence['file']] = [sentence]
+      else:
+        annotations[sentence['file']].append(sentence)
+    structured_result = []
+    for annotation in annotations.values():
+      filename = annotation[0]['file'] + '.json'
+      result = []
+      entities = list(chain(*[a['entities'].values() for a in annotation]))
+      entities = sorted(entities, key=lambda e: e['start'])
+      entities_dict = {e['id']: e for e in entities}
+      true_relations = list(chain(*[a['true_relations'].values() for a in annotation]))
+      relations = {}
+      for r in true_relations:
+        if relations.get(r['first']):
+          relations[r['first']].append((r['second'], r['type']))
+          print(relations)
+        else:
+          relations[r['first']] = [(r['second'], r['type'])]
+      for e in entities:
+        e_id = e['id']
+        entity = {'text': e['text'], 'start': e['start'], 'length': e['length'], 'type': e['type']}
+        if e_id in relations:
+          attributes = []
+          for ee_id, ee_type in relations[e_id]:
+            ee = entities_dict[ee_id]
+            attributes.append({'name': self.relation_categories[ee_type], 'text': ee['text'], 'start': ee['start'],
+                               'length': ee['length'], 'type': ee['type']})
+          entity.update({'attributes': attributes})
+        result.append(entity)
+      # with open(self.base_folder+'structured/'+filename,'w',encoding='utf-8') as f:
+      #   json.dump(result,f,ensure_ascii=False)
+      # structured_result.append(result)

+    # annotations = OrderedDict(sorted(annotations.items(),key=lambda i:i[1]['start']))
+    return annotations
+
   def generate_re_two_training_data(self):
     train_data = []
     for annotation in self.annotations:
@@ -125,7 +247,7 @@ def read_annotations(self, directed=False):
     all_sentences = []
     for file in self.files:
       filename = self.data_folder + self.mode + '/' + file
-      sentence_dict, periods = self.read_entities_in_single_file(filename + '.txt', filename + '.ann')
+      sentence_dict, periods = self.read_entities_in_single_file(filename + '.txt', filename + '.ann', filename)

       sentence_words = self.read_cws_file(self.data_folder + 'cws/' + file + '.cws', periods)
       sentences = [''.join(s) for s in sentence_words]
@@ -174,7 +296,7 @@ def read_relation_in_single_file(self, ann_file, data, directed=False):
       # sentence['true_relations'] = {}
       if primary in entities and secondary in entities:
         rel = {'id': id, 'primary': entities[primary]['index'], 'secondary': entities[secondary]['index'],
-               'type': entry[1]}
+               'type': entry[1], 'first': primary, 'second': secondary}
         if sentence.get('true_realtions'):
           sentence['true_relations'][id] = rel
         else:
@@ -245,7 +367,8 @@ def read_cws_file(self, cws_file, periods):

     return sentence_words

-  def read_entities_in_single_file(self, raw_file, ann_file):
+  def read_entities_in_single_file(self, raw_file, ann_file, common_name):
+    common_name = os.path.basename(common_name)
     data = {}
     with open(raw_file, encoding='utf-8') as r:
       sentence = r.read()
@@ -323,6 +446,7 @@ def read_entities_in_single_file(self, raw_file, ann_file):
           entities[id] = entity
         else:
           sentence_dict[new_sentence]['entities'] = {id: entity}
+          sentence_dict[new_sentence]['file'] = common_name
         break
       else:
         entity = {'id': id, 'start': start, 'length': end - start, 'text': text, 'type': entry[1]}
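
The new read_file() parses brat-standoff .ann files: each tab-separated line is either an entity (a T-prefixed id, then 'Type start end', then the surface text) or a relation (an R-prefixed id, then 'Type Arg1:Tn Arg2:Tm'). Note that the loop rebinds f from the file name to the open file handle, which is why export() has to recover the name via filename.name[:-4]. A self-contained sketch of the same format, using inline sample lines instead of the repo's data files (the entity and relation type names are illustrative):

def parse_ann(lines):
  """Parse brat-standoff annotation lines into entity and relation dicts."""
  entities, relations = {}, {}
  for line in lines:
    if not line:
      continue
    idx, body, *rest = line.split('\t')
    if idx.startswith('T'):    # entity line: "Type start end"
      e_type, start, end = body.split(' ')
      entities[idx] = {'type': e_type, 'start': int(start), 'end': int(end), 'text': rest[0]}
    elif idx.startswith('R'):  # relation line: "Type Arg1:Tn Arg2:Tm"
      r_type, arg1, arg2 = body.split(' ')
      relations.setdefault(arg1.split(':')[1], []).append((arg2.split(':')[1], r_type))
  return entities, relations

sample = ['T1\tSymptom 0 2\t头晕', 'T2\tDegree 2 4\t明显', 'R1\tDegreeOf Arg1:T2 Arg2:T1']
print(parse_ann(sample))
# ({'T1': {'type': 'Symptom', ...}, 'T2': {...}}, {'T2': [('T1', 'DegreeOf')]})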

python/scripts/cws_ner.py

Lines changed: 4 additions & 4 deletions

@@ -32,7 +32,7 @@ def test_cws():

 def train_emr_cws():
   data_path = '../dnlp/data/emr/emr_cws.pickle'
-  config = DnnCrfConfig()
+  config = DnnCrfConfig(skip_left=1, skip_right=1)
   dnncrf = DnnCrf(config=config, data_path=data_path, nn='lstm', task='cws', remark='emr_cws')
   dnncrf.fit()

@@ -404,9 +404,9 @@ def export_cws(data, filename):
     train_cws()
   elif args.emr:
     # train_emr_old_method()
-    # train_emr_cws()
-    train_emr_word_skipgram()
-    train_emr_word_cbow()
+    train_emr_cws()
+    # train_emr_word_skipgram()
+    # train_emr_word_cbow()
     # train_emr_with_embeddings()
     # train_emr_ngram('mlp')
     # train_emr_ngram('rnn')

python/scripts/init_datasets.py

Lines changed: 4 additions & 1 deletion

@@ -82,4 +82,7 @@ def build_emr_cws_files(base_folder):
 # copy()
 # build_cws_datasets()
 # build_emr_datasets()
-build_emr_re()
+# build_emr_re()
+base_folder = '../dnlp/data/emr/'
+dict_path = base_folder + 'emr_merged_word_dict.utf8'
+ProcessEMR(base_folder=base_folder, dict_path=dict_path, directed=True)

python/scripts/pipeline.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+# -*- coding:utf-8 -*-
+from dnlp.config.sequence_labeling_config import DnnCrfConfig
+from dnlp.core.dnn_crf import DnnCrf
+def ner(sentence):
+  data_path = ''
+  config_bi_bigram = DnnCrfConfig(skip_left=0, skip_right=0)
+  lstmcrf = DnnCrf(config=config_bi_bigram, task='ner', data_path=data_path, nn='lstm', remark='lstm')
+  return lstmcrf.predict(sentence)
+def cws(sentence):
+  config = DnnCrfConfig()
+  model_path = '../dnlp/models/emr/cws-lstm-emr_cws-20.ckpt'
+  dnncrf = DnnCrf(config=config, model_path=model_path, mode='predict', nn='lstm', task='cws', remark='emr_cws')
+  return dnncrf.predict(sentence)
+
+def prepare_rel(sentence):
+  cws_res = cws(sentence)
+
+
+def rel():
+  pass
+
+def get_sentences(filename):
+  with open('../dnlp/data/emr/emr_paper/train/' + filename, encoding='utf-8') as f:
+    return f.read().split('。')
+
+if __name__ == '__main__':
+  sentences = get_sentences('996716_admission.txt')
+  for sentence in sentences:
+    prepare_rel(sentence)
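
As written, cws() builds a fresh DnnCrf (and with it a new TensorFlow graph) for every sentence that prepare_rel() processes. When looping over a whole document it is cheaper to construct the predictor once and reuse it; a hedged variant of the main block, using only constructor arguments already visible in this diff:

def make_cws_predictor():
  # Same arguments as cws() above, but the model is loaded a single time.
  config = DnnCrfConfig()
  model_path = '../dnlp/models/emr/cws-lstm-emr_cws-20.ckpt'
  return DnnCrf(config=config, model_path=model_path, mode='predict',
                nn='lstm', task='cws', remark='emr_cws')

if __name__ == '__main__':
  predictor = make_cws_predictor()
  for sentence in get_sentences('996716_admission.txt'):
    if sentence:
      print(predictor.predict(sentence))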
