Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d97aaab

Browse files
add re_cnn evalution code, add cbow
1 parent 746d2e4 commit d97aaab

File tree

12 files changed

+346
-100
lines changed

12 files changed

+346
-100
lines changed

‎python/dnlp/config/re_config.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- coding:utf-8 -*-
22

33
classRECNNConfig(object):
4-
def__init__(self,window_size:tuple=(3,),filter_size:int=150,learning_rate:float=0.4,dropout_rate:float=0.5,
4+
def__init__(self,window_size:tuple=(3,4,),filter_size:int=150,learning_rate:float=0.1,dropout_rate:float=0.5,
55
lam:float=1e-4,word_embed_size:int=300,position_embed_size:int=50,batch_length:int=85,
66
batch_size:int=50):
77
self.__window_size=window_size

‎python/dnlp/core/dnn_crf.py‎

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def __init__(self, *, config: DnnCrfConfig = None, task='cws', data_path: str =
8888
tf.summary.scalar('loss',self.mean_loss)
8989
self.merged=tf.summary.merge_all()
9090

91-
deffit(self,epochs:int=50,interval:int=10):
91+
deffit(self,epochs:int=40,interval:int=10):
9292
withtf.Session(graph=self.graph)assess:
9393
tf.global_variables_initializer().run()
9494
saver=tf.train.Saver(max_to_keep=epochs)
@@ -104,11 +104,11 @@ def fit(self, epochs: int = 50, interval: int = 10):
104104
ifepoch%interval==0:
105105
ifnotself.embedding_path:
106106
ifself.remark:
107-
model_path='../dnlp/models/{0}-{1}-{2}-{3}.ckpt'.format(self.task,self.nn,self.remark,epoch)
107+
model_path='../dnlp/models/emr/{0}-{1}-{2}-{3}.ckpt'.format(self.task,self.nn,self.remark,epoch)
108108
else:
109-
model_path='../dnlp/models/{0}-{1}-{2}.ckpt'.format(self.task,self.nn,epoch)
109+
model_path='../dnlp/models/emr/{0}-{1}-{2}.ckpt'.format(self.task,self.nn,epoch)
110110
else:
111-
model_path='../dnlp/models/{0}-{1}-embedding-{2}.ckpt'.format(self.task,self.nn,epoch)
111+
model_path='../dnlp/models/emr/{0}-{1}-embedding-{2}.ckpt'.format(self.task,self.nn,epoch)
112112
saver.save(sess,model_path)
113113
self.save_config(model_path)
114114
self.train_writer.close()

‎python/dnlp/core/dnn_crf_emr.py‎

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
classDnnCrfEmr(DnnCrfBase):
1010
def__init__(self,*,config:DnnCrfConfig=None,data_path:str='',dtype:type=tf.float32,task:str='ner',mode:str='train',
11-
train:str='',nn:str,model_path:str=''):
11+
nn:str,model_path:str=''):
1212
ifmodenotin ['train','predict']:
1313
raiseException('mode error')
1414
ifnnnotin ['mlp','rnn','lstm','bilstm','gru']:
@@ -52,11 +52,6 @@ def __init__(self, *, config: DnnCrfConfig = None, data_path: str = '', dtype: t
5252
self.sess=tf.Session()
5353
self.sess.run(tf.global_variables_initializer())
5454
tf.train.Saver().restore(save_path=self.model_path,sess=self.sess)
55-
eliftrain=='ll':
56-
self.ll_loss,_=tf.contrib.crf.crf_log_likelihood(self.output,self.real_indices,self.seq_length,
57-
self.transition)
58-
self.optimizer=tf.train.AdagradOptimizer(self.learning_rate)
59-
self.train_ll=self.optimizer.minimize(-self.ll_loss)
6055
else:
6156
# 构建训练函数
6257
# 训练用placeholder
@@ -135,26 +130,6 @@ def fit_batch(self, characters, labels, lengths, sess):
135130
feed_dict[self.trans_init_curr]=trans_init_neg_indices
136131
sess.run(self.train_with_init,feed_dict)
137132

138-
deffit_ll(self,epochs:int=100,interval:int=20):
139-
withtf.Session()assess:
140-
tf.global_variables_initializer().run()
141-
saver=tf.train.Saver(max_to_keep=epochs)
142-
forepochinrange(1,epochs+1):
143-
print('epoch:',epoch)
144-
for_inrange(self.batch_count):
145-
characters,labels,lengths=self.get_batch()
146-
# scores = sess.run(self.output, feed_dict={self.input: characters})
147-
feed_dict= {self.input:characters,self.real_indices:labels,self.seq_length:lengths}
148-
sess.run(self.train_ll,feed_dict=feed_dict)
149-
# self.fit_batch(characters, labels, lengths, sess)
150-
ifepoch%interval==0:
151-
model_path='../dnlp/models/emr_old/{0}-{1}.ckpt'.format(self.nn,epoch)
152-
saver.save(sess,model_path)
153-
self.save_config(model_path)
154-
155-
deffit_batch_ll(self):
156-
pass
157-
158133
defgenerate_transition_update_index(self,correct_labels,current_labels):
159134
ifcorrect_labels.shape!=current_labels.shape:
160135
print('sequence length is not equal')
@@ -218,7 +193,7 @@ def get_mlp_layer(self, layer: tf.Tensor) -> tf.Tensor:
218193
returnlayer
219194

220195
defget_rnn_layer(self,layer:tf.Tensor)->tf.Tensor:
221-
rnn=tf.nn.rnn_cell.RNNCell(self.hidden_units)
196+
rnn=tf.nn.rnn_cell.BasicRNNCell(self.hidden_units)
222197
rnn_output,rnn_out_state=tf.nn.dynamic_rnn(rnn,layer,dtype=self.dtype)
223198
self.params+= [vforvintf.global_variables()ifv.name.startswith('rnn')]
224199
returntf.transpose(rnn_output)

‎python/dnlp/core/re_cnn.py‎

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,38 @@
22
importtensorflowastf
33
importnumpyasnp
44
importpickle
5+
fromcollectionsimportCounter
56
fromdnlp.core.re_cnn_baseimportRECNNBase
67
fromdnlp.configimportRECNNConfig
78
fromdnlp.utils.constantimportBATCH_PAD,BATCH_PAD_VAL
89

910

1011
classRECNN(RECNNBase):
1112
def__init__(self,config:RECNNConfig,dtype:type=tf.float32,dict_path:str='',mode:str='train',
12-
data_path:str='',relation_count:int=2,model_path:str=''):
13+
data_path:str='',relation_count:int=2,model_path:str='',embedding_path:str='',
14+
remark:str=''):
1315
tf.reset_default_graph()
1416
RECNNBase.__init__(self,config,dict_path)
1517
self.dtype=dtype
1618
self.mode=mode
1719
self.data_path=data_path
1820
self.model_path=model_path
1921
self.relation_count=relation_count
22+
self.embedding_path=embedding_path
23+
self.remark=remark
2024

2125
self.concat_embed_size=self.word_embed_size+2*self.position_embed_size
2226
self.input_characters=tf.placeholder(tf.int32, [None,self.batch_length])
2327
self.input_position=tf.placeholder(tf.int32, [None,self.batch_length])
2428
self.input=tf.placeholder(self.dtype, [None,self.batch_length,self.concat_embed_size,1])
2529
self.input_relation=tf.placeholder(self.dtype, [None,self.relation_count])
26-
self.position_embedding=self.__weight_variable([2*self.batch_length-1,self.position_embed_size],
30+
self.position_embedding=self.__weight_variable([2*self.batch_length-1,self.position_embed_size],
2731
name='position_embedding')
28-
self.word_embedding=self.__weight_variable([self.words_size,self.word_embed_size],name='word_embedding')
32+
ifself.embedding_path:
33+
self.word_embedding=tf.Variable(np.load(self.embedding_path),dtype=self.dtype,name='word_embedding',
34+
trainable=True)
35+
else:
36+
self.word_embedding=self.__weight_variable([self.words_size,self.word_embed_size],name='word_embedding')
2937
self.conv_kernel=self.get_conv_kernel()
3038
self.bias= [self.__weight_variable([self.filter_size],name='conv_bias')]*len(self.window_size)
3139
self.full_connected_weight=self.__weight_variable([self.filter_size*len(self.window_size),self.relation_count],
@@ -42,13 +50,15 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str
4250
self.emebd_concat=tf.expand_dims(
4351
tf.concat([self.character_embed_holder,self.primary_embed_holder,self.secondary_embed_holder],2),3)
4452
self.words,self.primary,self.secondary,self.labels=self.load_data()
53+
4554
ifself.mode=='train':
46-
self.hidden_layer=tf.layers.dropout(self.get_hidden(),self.dropout_rate)
4755
self.start=0
56+
self.hidden_layer=tf.layers.dropout(self.get_hidden(),self.dropout_rate)
4857
self.data_count=len(self.words)
4958
self.saver=tf.train.Saver(max_to_keep=100)
5059
else:
51-
self.hidden_layer=tf.expand_dims(tf.layers.dropout(self.get_hidden(),self.dropout_rate),0)
60+
self.hidden_layer=self.get_hidden()
61+
# self.hidden_layer = tf.expand_dims(tf.layers.dropout(self.get_hidden(), self.dropout_rate), 0)
5262
self.sess=tf.Session()
5363
self.saver=tf.train.Saver().restore(self.sess,self.model_path)
5464
self.output_no_softmax=tf.matmul(self.hidden_layer,self.full_connected_weight)+self.full_connected_bias
@@ -60,12 +70,11 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str
6070
self.loss=tf.reduce_sum(tf.square(self.output-self.input_relation))/self.batch_size+self.regularization
6171
self.cross_entropy=tf.nn.softmax_cross_entropy_with_logits(labels=self.input_relation,
6272
logits=self.output_no_softmax)+self.regularization
63-
self.optimizer=tf.train.GradientDescentOptimizer(self.learning_rate)
64-
#self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
73+
#self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
74+
self.optimizer=tf.train.AdagradOptimizer(self.learning_rate)
6575
self.train_model=self.optimizer.minimize(self.loss)
6676
self.train_cross_entropy_model=self.optimizer.minimize(self.cross_entropy)
6777

68-
6978
defget_conv_kernel(self):
7079
conv_kernel= []
7180
forwinself.window_size:
@@ -85,10 +94,10 @@ def get_hidden(self):
8594
h=tf.squeeze(self.max_pooling(tf.nn.relu(self.conv(conv)+bias),w))
8695
else:
8796
hh=tf.squeeze(self.max_pooling(tf.nn.relu(self.conv(conv)+bias),w))
88-
ifself.mode=='train':
89-
h=tf.concat([h,hh],1)
90-
else:
91-
h=tf.concat([h,hh],0)
97+
#if self.mode == 'train':
98+
h=tf.concat([h,hh],1)
99+
#else:
100+
# h = tf.concat([h, hh], 0)
92101
returnh
93102

94103
defconv(self,conv_kernel):
@@ -98,7 +107,7 @@ def max_pooling(self, x, window_size):
98107
returntf.nn.max_pool(x,ksize=[1,self.batch_length-window_size+1,1,1],
99108
strides=[1,1,1,1],padding='VALID')
100109

101-
deffit(self,epochs=100,interval=20):
110+
deffit(self,epochs=40,interval=5):
102111
withtf.Session()assess:
103112
tf.global_variables_initializer().run()
104113
sess.graph.finalize()
@@ -116,21 +125,53 @@ def fit(self, epochs=100, interval=20):
116125
# sess.run(self.train_model, feed_dict={self.input: input, self.input_relation: batch['label']})
117126
sess.run(self.train_cross_entropy_model,feed_dict={self.input:input,self.input_relation:labels})
118127
ifi%interval==0:
119-
model_name='../dnlp/models/re/{0}-{1}.ckpt'.format(i,'_'.join(map(str,self.window_size)))
128+
ifself.relation_count==2:
129+
model_name='../dnlp/models/re_{2}/{0}-{1}{3}.ckpt'.format(i,'_'.join(map(str,self.window_size)),
130+
'two',self.remark)
131+
else:
132+
model_name='../dnlp/models/re_{2}/{0}-{1}{3}.ckpt'.format(i,'_'.join(map(str,self.window_size)),
133+
'multi',self.remark)
134+
120135
self.saver.save(sess,model_name)
121-
defpredict(self,words,primary,secondary):
136+
137+
defpredict(self,words,primary,secondary):
122138
character_embeds,primary_embeds=self.sess.run([self.character_lookup,self.position_lookup],
123-
feed_dict={self.input_characters:words,
124-
self.input_position:primary})
139+
feed_dict={self.input_characters:words,
140+
self.input_position:primary})
125141
secondary_embeds=self.sess.run(self.position_lookup,feed_dict={self.input_position:secondary})
126142
input=self.sess.run(self.emebd_concat,feed_dict={self.character_embed_holder:character_embeds,
127-
self.primary_embed_holder:primary_embeds,
128-
self.secondary_embed_holder:secondary_embeds})
143+
self.primary_embed_holder:primary_embeds,
144+
self.secondary_embed_holder:secondary_embeds})
129145
output=self.sess.run(self.output,feed_dict={self.input:input})
130146
returnnp.argmax(output,1)
131147

132148
defevaluate(self):
133-
res=self.predict(self.words,self.primary,self.secondary)
149+
res=self.predict(self.words,self.primary,self.secondary)
150+
res_count=Counter(res)[1]
151+
target=np.argmax(self.labels,1)
152+
target_count=Counter(target)[1]
153+
correct_number=Counter(np.array(res)-target)
154+
print(correct_number)
155+
returnself.get_score(np.array(res),target)
156+
157+
defget_score(self,predict,true):
158+
types=Counter(predict).keys()
159+
corr_count= []
160+
true_count= []
161+
pred_count= []
162+
163+
fortintypes:
164+
corr_count.append(len([vforv,cinzip(predict-t,predict-true)ifv==0andc==0]))
165+
true_count.append(len([teforteintrueifte==t]))
166+
pred_count.append(len([pdforpdinpredictifpd==t]))
167+
168+
precs= [c/pforc,pinzip(corr_count,pred_count)ifp!=0andc!=0]
169+
recalls= [c/rforc,rinzip(corr_count,true_count)ifr!=0andc!=0]
170+
prec=sum(precs)/len(precs)
171+
recall=sum(recalls)/len(recalls)
172+
f1=2*prec*recall/ (prec+recall)
173+
print(prec,recall,f1)
174+
returnprec,recall,f1
134175

135176
defload_batch(self):
136177
ifself.start+self.batch_size>self.data_count:
@@ -163,8 +204,8 @@ def load_data(self):
163204
else:
164205
sentence_words=sentence_words[:self.batch_length]
165206
words.append(sentence_words)
166-
primary.append(np.arange(self.batch_length)-sentence['primary']+self.batch_length-1)
167-
secondary.append(np.arange(self.batch_length)-sentence['secondary']+self.batch_length-1)
207+
primary.append(np.arange(self.batch_length)-sentence['primary']+self.batch_length-1)
208+
secondary.append(np.arange(self.batch_length)-sentence['secondary']+self.batch_length-1)
168209
sentence_labels=np.zeros([self.relation_count])
169210
sentence_labels[sentence['type']]=1
170211
labels.append(sentence_labels)

‎python/dnlp/core/skip_gram.py‎renamed to ‎python/dnlp/core/word2vec.py‎

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@
66

77

88
classSkipGram(object):
9-
def__init__(self,src_filename:str,dest_filename:str,batch_size:int=128,embed_size:int=100,
10-
num_sampled:int=64,steps:int=50000):
9+
def__init__(self,src_filename:str,dest_filename:str,window_size:int=4,mode='skip_gram',batch_size:int=128,
10+
embed_size:int=100,num_sampled:int=64,steps:int=50000):
1111
withopen(src_filename,'rb')asf:
1212
data=pickle.load(f)
1313
self.input=data['input']
1414
self.output=data['output']
1515
self.dictionary=data['dictionary']
1616
self.vocab_size=len(self.dictionary)
17+
self.mode=mode
18+
self.window_size=window_size
1719
self.start=0
1820
self.dest_filename=dest_filename
1921
self.batch_size=batch_size
@@ -24,10 +26,15 @@ def __init__(self, src_filename: str, dest_filename: str, batch_size: int = 128,
2426
self.embeddings=tf.Variable(tf.random_uniform([self.vocab_size,self.embed_size],-1.0,1.0))
2527

2628
deftrain(self):
27-
train_inputs=tf.placeholder(tf.int32,shape=[self.batch_size])
29+
ifself.mode=='skip_gram':
30+
train_inputs=tf.placeholder(tf.int32,shape=[self.batch_size])
31+
else:
32+
train_inputs=tf.placeholder(tf.int32,shape=[self.batch_size,self.window_size])
2833
train_labels=tf.placeholder(tf.int32,shape=[self.batch_size,1])
2934

3035
embed=tf.nn.embedding_lookup(self.embeddings,train_inputs)
36+
ifself.mode=='cbow':
37+
embed=tf.reduce_sum(embed,1)
3138

3239
nce_weights=tf.Variable(
3340
tf.truncated_normal([self.vocab_size,self.embed_size],
@@ -37,13 +44,13 @@ def train(self):
3744
loss=tf.reduce_mean(
3845
tf.nn.nce_loss(weights=nce_weights,biases=nce_biases,labels=train_labels,inputs=embed,
3946
num_sampled=self.num_sampled,num_classes=self.vocab_size))
40-
optimizer=tf.train.GradientDescentOptimizer(0.2).minimize(loss)
47+
optimizer=tf.train.AdagradOptimizer(0.2).minimize(loss)
4148

4249
withtf.Session()assess:
4350
tf.global_variables_initializer().run()
4451

4552
aver_loss=0
46-
forstepinrange(1,self.steps+1):
53+
forstepinrange(1,self.steps+1):
4754
batch_inputs,batch_labels=self.generate_batch()
4855
feed_dict= {train_inputs:batch_inputs,train_labels:batch_labels}
4956
_,loss_val=sess.run([optimizer,loss],feed_dict=feed_dict)

‎python/dnlp/data_process/process_character_embedding_pretrain.py‎renamed to ‎python/dnlp/data_process/process_embedding_pretrain.py‎

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,30 @@
55
fromdnlp.utils.constantimportUNK
66

77

8-
classCharacterEmbeddingPertrainProcess(Preprocessor):
9-
def__init__(self,base_folder:str,files:tuple,dict_path:str,skip_window:int):
8+
classEmbeddingPertrainProcess(Preprocessor):
9+
def__init__(self,base_folder:str,files:tuple,dict_path:str,skip_window:int,
10+
output_name:str,mode:str='character',algorithm='skip_gram'):
1011
Preprocessor.__init__(self,base_folder=base_folder,files=files,dict_path=dict_path)
1112
self.skip_window=skip_window
1213
self.files=files
14+
self.output_name=output_name
15+
self.mode=mode
1316
self.sentences=self.preprocess()
1417
self.indices=self.map_to_indices()
15-
self.input,self.output=self.process()
18+
ifalgorithm=='skip_gram':
19+
self.input,self.output=self.process_skip_gram()
20+
else:
21+
self.input,self.output=self.process_cbow()
1622
self.save_data()
1723

1824
defpreprocess(self):
1925
sentences= []
2026
forfileinself.files:
2127
withopen(self.base_folder+file,encoding='utf-8')asf:
22-
sentences.extend(f.read().splitlines())
28+
ifself.mode=='character':
29+
sentences.extend(f.read().splitlines())
30+
else:
31+
sentences.extend([l.split(' ')forlinf.read().splitlines()])
2332
returnsentences
2433

2534
defmap_to_indices(self):
@@ -35,7 +44,17 @@ def map_to_indices(self):
3544
indices.append(idx)
3645
returnindices
3746

38-
defprocess(self):
47+
defprocess_cbow(self):
48+
input= []
49+
output= []
50+
forindexinself.indices:
51+
iflen(index)<2*self.skip_window+1:
52+
continue
53+
foriinrange(self.skip_window,len(index)-self.skip_window):
54+
input.append(index[i-self.skip_window:i]+index[i+1:i+1+self.skip_window])
55+
output.append(index[i])
56+
returninput,output
57+
defprocess_skip_gram(self):
3958
input= []
4059
output= []
4160
forindexinself.indices:
@@ -54,6 +73,6 @@ def shuffle(i):
5473
returninput,output
5574

5675
defsave_data(self):
57-
withopen(self.base_folder+'emr_skip_gram.pickle','wb', )asf:
76+
withopen(self.base_folder+self.output_name,'wb', )asf:
5877
data= {'input':self.input,'output':self.output,'dictionary':self.dictionary}
5978
pickle.dump(data,f)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp