Commit bab0698

Browse files

Merge branch 'emr' into develop

# Conflicts:
#	python/dnlp/config/re_config.py
#	python/dnlp/config/sequence_labeling_config.py
#	python/dnlp/core/dnn_crf.py
#	python/dnlp/core/dnn_crf_base.py
#	python/dnlp/core/re_cnn.py
#	python/scripts/cws_ner.py
#	python/scripts/init_datasets.py
#	python/scripts/pipeline.py
#	python/scripts/rel.py

2 parents 88f4555 + 9a13f6d commit bab0698

File tree

15 files changed: +870 −40 lines changed


‎python/dnlp/config/__init__.py‎

Lines changed: 3 additions & 1 deletion
@@ -1 +1,3 @@
-# -*- coding: UTF-8 -*-
+# -*- coding: UTF-8 -*-
+from dnlp.config.sequence_labeling_config import DnnCrfConfig
+from dnlp.config.re_config import RECNNConfig
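
With these re-exports in place, downstream scripts can pull both config classes straight from the package root. A minimal sketch of what the change enables:

from dnlp.config import DnnCrfConfig, RECNNConfig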
File renamed without changes.

‎python/dnlp/core/cbow.py‎

Lines changed: 0 additions & 1 deletion
This file was deleted.

‎python/dnlp/core/dnn_crf_emr.py‎

Lines changed: 235 additions & 0 deletions
@@ -0,0 +1,235 @@

# -*- coding: UTF-8 -*-
import tensorflow as tf
import numpy as np
import math
from dnlp.core.dnn_crf_base import DnnCrfBase
from dnlp.config import DnnCrfConfig


class DnnCrfEmr(DnnCrfBase):
  def __init__(self, *, config: DnnCrfConfig = None, data_path: str = '', dtype: type = tf.float32,
               task: str = 'ner', mode: str = 'train', nn: str, model_path: str = ''):
    if mode not in ['train', 'predict']:
      raise Exception('mode error')
    if nn not in ['mlp', 'rnn', 'lstm', 'bilstm', 'gru']:
      raise Exception('name of neural network entered is not supported')

    DnnCrfBase.__init__(self, config, data_path, mode, model_path)
    self.dtype = dtype
    self.mode = mode
    self.nn = nn
    self.task = task

    # Build the graph.
    tf.reset_default_graph()
    self.transition = self.__get_variable([self.tags_count, self.tags_count], 'transition')
    self.transition_init = self.__get_variable([self.tags_count], 'transition_init')
    self.params = [self.transition, self.transition_init]
    # Input layer
    if mode == 'train':
      self.input = tf.placeholder(tf.int32, [self.batch_size, self.batch_length, self.windows_size])
      self.real_indices = tf.placeholder(tf.int32, [self.batch_size, self.batch_length])
      self.seq_length = tf.placeholder(tf.int32, [None])
    else:
      self.input = tf.placeholder(tf.int32, [None, self.windows_size])

    # Embedding lookup layer
    self.embedding_layer = self.get_embedding_layer()
    # Hidden layer
    if nn == 'mlp':
      self.hidden_layer = self.get_mlp_layer(tf.transpose(self.embedding_layer))
    elif nn == 'lstm':
      self.hidden_layer = self.get_lstm_layer(tf.transpose(self.embedding_layer))
    elif nn == 'gru':
      self.hidden_layer = self.get_gru_layer(tf.transpose(self.embedding_layer))
    else:
      self.hidden_layer = self.get_rnn_layer(tf.transpose(self.embedding_layer))
    # Output layer
    self.output = self.get_output_layer(self.hidden_layer)

    if mode == 'predict':
      self.output = tf.squeeze(self.output, axis=1)
      self.sess = tf.Session()
      self.sess.run(tf.global_variables_initializer())
      tf.train.Saver().restore(save_path=self.model_path, sess=self.sess)
    else:
      # Build the training ops.
      # Placeholders used only during training
      self.ll_corr = tf.placeholder(tf.int32, shape=[None, 3])
      self.ll_curr = tf.placeholder(tf.int32, shape=[None, 3])
      self.trans_corr = tf.placeholder(tf.int32, [None, 2])
      self.trans_curr = tf.placeholder(tf.int32, [None, 2])
      self.trans_init_corr = tf.placeholder(tf.int32, [None, 1])
      self.trans_init_curr = tf.placeholder(tf.int32, [None, 1])
      # Loss
      self.loss, self.loss_with_init = self.get_loss()
      self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
      self.train = self.optimizer.minimize(self.loss)
      self.train_with_init = self.optimizer.minimize(self.loss_with_init)

  def fit(self, epochs: int = 100, interval: int = 20):
    with tf.Session() as sess:
      tf.global_variables_initializer().run()
      saver = tf.train.Saver(max_to_keep=100)
      for epoch in range(1, epochs + 1):
        print('epoch:', epoch)
        for _ in range(self.batch_count):
          characters, labels, lengths = self.get_batch()
          self.fit_batch(characters, labels, lengths, sess)
        if epoch % interval == 0:
          model_path = '../dnlp/models/emr_old/{0}-{1}.ckpt'.format(self.nn, epoch)
          saver.save(sess, model_path)
          self.save_config(model_path)

  def fit_batch(self, characters, labels, lengths, sess):
    scores = sess.run(self.output, feed_dict={self.input: characters})
    transition = self.transition.eval(session=sess)
    transition_init = self.transition_init.eval(session=sess)
    update_labels_pos = None
    update_labels_neg = None
    current_labels = []
    trans_pos_indices = []
    trans_neg_indices = []
    trans_init_pos_indices = []
    trans_init_neg_indices = []
    for i in range(self.batch_size):
      current_label = self.viterbi(scores[:, :lengths[i], i], transition, transition_init)
      current_labels.append(current_label)
      diff_tag = np.subtract(labels[i, :lengths[i]], current_label)
      update_index = np.where(diff_tag != 0)[0]
      update_length = len(update_index)
      if update_length == 0:
        continue
      update_label_pos = np.stack([labels[i, update_index], update_index, i * np.ones([update_length])], axis=-1)
      update_label_neg = np.stack([current_label[update_index], update_index, i * np.ones([update_length])], axis=-1)
      if update_labels_pos is not None:
        # np.concatenate returns a new array, so the result must be assigned back.
        update_labels_pos = np.concatenate((update_labels_pos, update_label_pos))
        update_labels_neg = np.concatenate((update_labels_neg, update_label_neg))
      else:
        update_labels_pos = update_label_pos
        update_labels_neg = update_label_neg

      trans_pos_index, trans_neg_index, trans_init_pos, trans_init_neg, update_init = self.generate_transition_update_index(
        labels[i, :lengths[i]], current_labels[i])

      trans_pos_indices.extend(trans_pos_index)
      trans_neg_indices.extend(trans_neg_index)

      if update_init:
        trans_init_pos_indices.append(trans_init_pos)
        trans_init_neg_indices.append(trans_init_neg)

    if update_labels_pos is not None and update_labels_neg is not None:
      feed_dict = {self.input: characters, self.ll_curr: update_labels_neg, self.ll_corr: update_labels_pos,
                   self.trans_curr: trans_neg_indices, self.trans_corr: trans_pos_indices}

      if not trans_init_pos_indices:
        sess.run(self.train, feed_dict)
      else:
        feed_dict[self.trans_init_corr] = trans_init_pos_indices
        feed_dict[self.trans_init_curr] = trans_init_neg_indices
        sess.run(self.train_with_init, feed_dict)

  def generate_transition_update_index(self, correct_labels, current_labels):
    if correct_labels.shape != current_labels.shape:
      print('sequence length is not equal')
      return None

    before_corr = correct_labels[0]
    before_curr = current_labels[0]
    update_init = False

    trans_init_pos = None
    trans_init_neg = None
    trans_pos = []
    trans_neg = []

    if before_corr != before_curr:
      trans_init_pos = [before_corr]
      trans_init_neg = [before_curr]
      update_init = True

    for corr_label, curr_label in zip(correct_labels[1:], current_labels[1:]):
      if corr_label != curr_label or before_corr != before_curr:
        trans_pos.append([before_corr, corr_label])
        trans_neg.append([before_curr, curr_label])
      before_corr = corr_label
      before_curr = curr_label

    return trans_pos, trans_neg, trans_init_pos, trans_init_neg, update_init

  def predict_ll(self, sentence: str, return_labels=False):
    if self.mode != 'predict':
      raise Exception('mode is not allowed to predict')

    input = self.indices2input(self.sentence2indices(sentence))
    runner = [self.output, self.transition, self.transition_init]
    output, trans, trans_init = self.sess.run(runner, feed_dict={self.input: input})
    labels = self.viterbi(output, trans, trans_init)
    if self.task == 'cws':
      result = self.tags2words(sentence, labels)
    else:
      result = self.tags2entities(sentence, labels)
    if not return_labels:
      return result
    else:
      return result, self.tag2sequences(labels)

  def get_embedding_layer(self) -> tf.Tensor:
    embeddings = self.__get_variable([self.dict_size, self.embed_size], 'embeddings')
    self.params.append(embeddings)
    if self.mode == 'train':
      input_size = [self.batch_size, self.batch_length, self.concat_embed_size]
      layer = tf.reshape(tf.nn.embedding_lookup(embeddings, self.input), input_size)
    else:
      layer = tf.reshape(tf.nn.embedding_lookup(embeddings, self.input), [1, -1, self.concat_embed_size])
    return layer

  def get_mlp_layer(self, layer: tf.Tensor) -> tf.Tensor:
    hidden_weight = self.__get_variable([self.hidden_units, self.concat_embed_size], 'hidden_weight')
    hidden_bias = self.__get_variable([self.hidden_units, 1, 1], 'hidden_bias')
    self.params += [hidden_weight, hidden_bias]
    layer = tf.sigmoid(tf.tensordot(hidden_weight, layer, [[1], [0]]) + hidden_bias)
    return layer

  def get_rnn_layer(self, layer: tf.Tensor) -> tf.Tensor:
    rnn = tf.nn.rnn_cell.BasicRNNCell(self.hidden_units)
    rnn_output, rnn_out_state = tf.nn.dynamic_rnn(rnn, layer, dtype=self.dtype)
    self.params += [v for v in tf.global_variables() if v.name.startswith('rnn')]
    return tf.transpose(rnn_output)

  def get_lstm_layer(self, layer: tf.Tensor) -> tf.Tensor:
    lstm = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_units)
    lstm_output, lstm_out_state = tf.nn.dynamic_rnn(lstm, layer, dtype=self.dtype)
    self.params += [v for v in tf.global_variables() if v.name.startswith('rnn')]
    return tf.transpose(lstm_output)

  def get_gru_layer(self, layer: tf.Tensor) -> tf.Tensor:
    gru = tf.nn.rnn_cell.GRUCell(self.hidden_units)
    gru_output, gru_out_state = tf.nn.dynamic_rnn(gru, layer, dtype=self.dtype)
    self.params += [v for v in tf.global_variables() if v.name.startswith('rnn')]
    return tf.transpose(gru_output)

  def get_dropout_layer(self, layer: tf.Tensor) -> tf.Tensor:
    return tf.layers.dropout(layer, self.dropout_rate)

  def get_output_layer(self, layer: tf.Tensor) -> tf.Tensor:
    output_weight = self.__get_variable([self.tags_count, self.hidden_units], 'output_weight')
    output_bias = self.__get_variable([self.tags_count, 1, 1], 'output_bias')
    self.params += [output_weight, output_bias]
    return tf.tensordot(output_weight, layer, [[1], [0]]) + output_bias

  def get_loss(self) -> (tf.Tensor, tf.Tensor):
    output_loss = tf.reduce_sum(tf.gather_nd(self.output, self.ll_curr) - tf.gather_nd(self.output, self.ll_corr))
    # Reduced to a scalar, consistent with output_loss and trans_init_loss.
    trans_loss = tf.reduce_sum(tf.gather_nd(self.transition, self.trans_curr) -
                               tf.gather_nd(self.transition, self.trans_corr))
    trans_i_curr = tf.gather_nd(self.transition_init, self.trans_init_curr)
    trans_i_corr = tf.gather_nd(self.transition_init, self.trans_init_corr)
    trans_init_loss = tf.reduce_sum(trans_i_curr - trans_i_corr)
    loss = output_loss + trans_loss
    regu = tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(self.lam), self.params)
    l1 = loss + regu
    l2 = l1 + trans_init_loss
    return l1, l2

  def __get_variable(self, size, name) -> tf.Variable:
    return tf.Variable(tf.truncated_normal(size, stddev=1.0 / math.sqrt(size[-1]), dtype=self.dtype), name=name)
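
Taken together, the class trains a perceptron-style CRF: scores and transitions are updated only where the Viterbi path disagrees with the gold labels. A minimal usage sketch, assuming the default DnnCrfConfig and a training pickle produced by the repo's data scripts; both paths and the sample sentence below are hypothetical:

from dnlp.config import DnnCrfConfig
from dnlp.core.dnn_crf_emr import DnnCrfEmr

config = DnnCrfConfig()

# Train an LSTM-CRF tagger; fit() checkpoints into ../dnlp/models/emr_old/.
trainer = DnnCrfEmr(config=config, data_path='dnlp/data/emr/emr_training.pickle',
                    nn='lstm', task='ner', mode='train')
trainer.fit(epochs=40, interval=20)

# Restore a saved checkpoint and tag one sentence.
tagger = DnnCrfEmr(config=config, nn='lstm', task='ner', mode='predict',
                   model_path='../dnlp/models/emr_old/lstm-40.ckpt')
entities = tagger.predict_ll('患者三天前无明显诱因出现头痛')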

‎python/dnlp/core/re_cnn_base.py‎

Lines changed: 54 additions & 1 deletion
@@ -1 +1,54 @@

# -*- coding:utf-8 -*-
import numpy as np
import pickle
from dnlp.config import RECNNConfig
from dnlp.utils.constant import BATCH_PAD, BATCH_PAD_VAL


class RECNNBase(object):
  def __init__(self, config: RECNNConfig, dict_path: str, data_path: str = '', mode='train'):
    self.window_size = config.window_size
    self.filter_size = config.filter_size
    self.learning_rate = config.learning_rate
    self.dropout_rate = config.dropout_rate
    self.lam = config.lam
    self.word_embed_size = config.word_embed_size
    self.position_embed_size = config.position_embed_size
    self.batch_length = config.batch_length
    if mode == 'train':
      self.batch_size = config.batch_size
    else:
      self.batch_size = 1
    self.dictionary = self.read_dictionary(dict_path)
    self.words_size = len(self.dictionary)

  def read_dictionary(self, dict_path):
    # Dictionary file format: one "word index" pair per line.
    with open(dict_path, encoding='utf-8') as f:
      content = f.read().splitlines()
    dictionary = {}
    dict_arr = map(lambda item: item.split(' '), content)
    for dict_item in dict_arr:
      dictionary[dict_item[0]] = int(dict_item[1])

    return dictionary

  def load_data(self):
    # Note: relies on self.data_path and self.relation_count, which are not set
    # here and are presumably provided by the subclass (e.g. the RECNN model).
    primary = []
    secondary = []
    words = []
    labels = []
    with open(self.data_path, 'rb') as f:
      data = pickle.load(f)
      for sentence in data:
        sentence_words = sentence['words']
        if len(sentence_words) < self.batch_length:
          sentence_words += [self.dictionary[BATCH_PAD]] * (self.batch_length - len(sentence_words))
        else:
          sentence_words = sentence_words[:self.batch_length]
        words.append(sentence_words)
        primary.append(np.arange(self.batch_length) - sentence['primary'] + self.batch_length - 1)
        secondary.append(np.arange(self.batch_length) - sentence['secondary'] + self.batch_length - 1)
        sentence_labels = np.zeros([self.relation_count])
        sentence_labels[sentence['type']] = 1
        labels.append(sentence_labels)
    return np.array(words, np.int32), np.array(primary, np.int32), np.array(secondary, np.int32), np.array(labels, np.float32)
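
The two position features encode each token's signed offset to the primary and secondary entity, shifted by batch_length - 1 so every index is non-negative for a later embedding lookup. A quick check of that arithmetic, with a hypothetical batch_length of 5 and the primary entity at token index 2:

import numpy as np

batch_length = 5
primary = 2
# Offsets fall in [0, 2 * batch_length - 2]; the entity token itself maps to batch_length - 1.
print(np.arange(batch_length) - primary + batch_length - 1)  # -> [2 3 4 5 6]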

python/dnlp/core/skip_gram.py renamed to python/dnlp/core/word2vec.py

Lines changed: 14 additions & 6 deletions
@@ -5,15 +5,17 @@
 import tensorflow as tf


-class SkipGram(object):
-  def __init__(self, src_filename: str, dest_filename: str, batch_size: int = 128, embed_size: int = 100,
-               num_sampled: int = 64, steps: int = 50000):
+class Word2Vec(object):
+  def __init__(self, src_filename: str, dest_filename: str, window_size: int = 4, mode='skip_gram', batch_size: int = 128,
+               embed_size: int = 100, num_sampled: int = 64, steps: int = 50000):
     with open(src_filename, 'rb') as f:
       data = pickle.load(f)
     self.input = data['input']
     self.output = data['output']
     self.dictionary = data['dictionary']
     self.vocab_size = len(self.dictionary)
+    self.mode = mode
+    self.window_size = window_size
     self.start = 0
     self.dest_filename = dest_filename
     self.batch_size = batch_size
@@ -24,10 +26,16 @@ def __init__(self, src_filename: str, dest_filename: str, batch_size: int = 128,
     self.embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0))

   def train(self):
-    train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
+    if self.mode == 'skip_gram':
+      train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
+    else:
+      train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size, self.window_size])
     train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])

     embed = tf.nn.embedding_lookup(self.embeddings, train_inputs)
+    if self.mode == 'cbow':
+      # embed = tf.reduce_sum(embed, 1)
+      embed = tf.reduce_mean(embed, 1)

     nce_weights = tf.Variable(
       tf.truncated_normal([self.vocab_size, self.embed_size],
@@ -37,13 +45,13 @@ def train(self):
     loss = tf.reduce_mean(
       tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=train_labels, inputs=embed,
                      num_sampled=self.num_sampled, num_classes=self.vocab_size))
-    optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
+    optimizer = tf.train.AdagradOptimizer(0.2).minimize(loss)

     with tf.Session() as sess:
       tf.global_variables_initializer().run()

       aver_loss = 0
-      for step in range(1, self.steps+1):
+      for step in range(1, self.steps + 1):
         batch_inputs, batch_labels = self.generate_batch()
         feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
         _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
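
After the rename, one class covers both embedding flavors; the mode flag only changes the input placeholder's shape and whether the context embeddings are averaged before the NCE loss. A usage sketch under those assumptions (the pickle paths are hypothetical; generate_batch and the embedding-saving logic live elsewhere in the class and are untouched by this commit):

from dnlp.core.word2vec import Word2Vec

# Skip-gram: each example is one center word predicting one context word.
sg = Word2Vec(src_filename='dnlp/data/emr_skip_gram.pickle',
              dest_filename='dnlp/data/emr_sg_embeddings.npy')
sg.train()

# CBOW: window_size context words are embedded and averaged to predict the center word.
cbow = Word2Vec(src_filename='dnlp/data/emr_cbow.pickle',
                dest_filename='dnlp/data/emr_cbow_embeddings.npy',
                mode='cbow', window_size=4)
cbow.train()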

‎python/dnlp/data_process/process_brat.py‎

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@

 class ProcessBrat(object):
   def __init__(self):
-    pass
+    pass
python/dnlp/data_process/process_conll.py

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@

# -*- coding:utf-8 -*-
from dnlp.data_process.processor import Preprocessor
class ProcessConll(Preprocessor):
  def __init__(self, *, files: tuple, name: str, base_folder: str = 'dnlp/data/', dict_path: str = ''):
    if dict_path:
      Preprocessor.__init__(self, base_folder=base_folder, dict_path=dict_path)
    else:
      Preprocessor.__init__(self, base_folder=base_folder, files=files, dict_path=base_folder + name + '.utf8')
