Commit 6d389a8

committed

Merge branch 'emr' of https://github.com/supercoderhawk/DeepLearning_NLP into emr

2 parents d2a926c + 0095bca commit 6d389a8 Copy full SHA for 6d389a8

File tree

4 files changed

+41

-68

lines changed

python
- dnlp
  - core
    - re_cnn.py
    - re_cnn_base.py
  - data_process
    - process_emr.py
- scripts
  - rel.py

4 files changed

+41

-68

lines changed

`‎python/dnlp/core/re_cnn.py‎`

Lines changed: 23 additions & 51 deletions

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`	`classRECNN(RECNNBase):`
`10`	`10`	`def__init__(self,config:RECNNConfig,dtype:type=tf.float32,dict_path:str='',mode:str='train',`
`11`	`11`	`data_path:str='',relation_count:int=2,model_path:str='',embedding_path:str='',`
`12`		`-remark:str=''):`
	`12`	`+remark:str='',data_mode='prefetch'):`
`13`	`13`	`tf.reset_default_graph()`
`14`	`14`	`RECNNBase.__init__(self,config,dict_path)`
`15`	`15`	`self.dtype=dtype`
`@@ -21,12 +21,6 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str`
`21`	`21`	`self.remark=remark`
`22`	`22`
`23`	`23`	`self.concat_embed_size=self.word_embed_size+2*self.position_embed_size`
`24`		`-self.words,self.primary,self.secondary,self.labels=self.load_data()`
`25`		`-self.input_words=tf.constant(self.words)`
`26`		`-self.input_primary=tf.constant(self.primary)`
`27`		`-self.input_secondary=tf.constant(self.secondary)`
`28`		`-self.input_labels=tf.constant(self.labels)`
`29`		`-self.input_indices=tf.placeholder(tf.int32, [self.batch_size])`
`30`	`24`	`self.input_characters=tf.placeholder(tf.int32, [None,self.batch_length])`
`31`	`25`	`self.input_position=tf.placeholder(tf.int32, [None,self.batch_length])`
`32`	`26`	`self.input=tf.placeholder(self.dtype, [None,self.batch_length,self.concat_embed_size,1])`
`@@ -43,10 +37,6 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str`
`43`	`37`	`self.full_connected_weight=self.__weight_variable([self.filter_size*len(self.window_size),self.relation_count],`
`44`	`38`	`name='full_connected_weight')`
`45`	`39`	`self.full_connected_bias=self.__weight_variable([self.relation_count],name='full_connected_bias')`
`46`		`-self.input_words_lookup=tf.nn.embedding_lookup(self.input_words,self.input_indices)`
`47`		`-self.input_primary_lookup=tf.nn.embedding_lookup(self.input_primary,self.input_indices)`
`48`		`-self.input_secondary_lookup=tf.nn.embedding_lookup(self.input_secondary,self.input_indices)`
`49`		`-self.input_labels_lookup=tf.nn.embedding_lookup(self.input_labels,self.input_indices)`
`50`	`40`	`self.position_lookup=tf.nn.embedding_lookup(self.position_embedding,self.input_position)`
`51`	`41`	`self.character_lookup=tf.nn.embedding_lookup(self.word_embedding,self.input_characters)`
`52`	`42`	`self.character_embed_holder=tf.placeholder(self.dtype,`
`@@ -55,17 +45,24 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str`
`55`	`45`	`[None,self.batch_length,self.position_embed_size])`
`56`	`46`	`self.secondary_embed_holder=tf.placeholder(self.dtype,`
`57`	`47`	`[None,self.batch_length,self.position_embed_size])`
`58`		`-self.emebd_concat=tf.expand_dims(`
	`48`	`+self.embedded_concat=tf.expand_dims(`
`59`	`49`	`tf.concat([self.character_embed_holder,self.primary_embed_holder,self.secondary_embed_holder],2),3)`
`60`		`-`
	`50`	`+ifdata_mode=='prefetch':`
	`51`	`+self.words,self.primary,self.secondary,self.labels=self.load_data()`
	`52`	`+self.data_count=len(self.words)`
	`53`	`+self.words=tf.data.Dataset.from_tensor_slices(self.words)`
	`54`	`+self.primary=tf.data.Dataset.from_tensor_slices(self.primary)`
	`55`	`+self.secondary=tf.data.Dataset.from_tensor_slices(self.secondary)`
	`56`	`+self.labels=tf.data.Dataset.from_tensor_slices(self.labels)`
	`57`	`+self.input_data=tf.data.Dataset.zip((self.words,self.primary,self.secondary,self.labels))`
	`58`	`+self.input_data=self.input_data.repeat(-1).batch(self.batch_size)`
	`59`	`+self.input_data_iterator=self.input_data.make_initializable_iterator()`
	`60`	`+self.iterator=self.input_data_iterator.get_next()`
`61`	`61`	`ifself.mode=='train':`
`62`		`-self.start=0`
`63`	`62`	`self.hidden_layer=tf.layers.dropout(self.get_hidden(),self.dropout_rate)`
`64`		`-self.data_count=len(self.words)`
`65`	`63`	`self.saver=tf.train.Saver(max_to_keep=100)`
`66`	`64`	`else:`
`67`	`65`	`self.hidden_layer=self.get_hidden()`
`68`		`-# self.hidden_layer = tf.expand_dims(tf.layers.dropout(self.get_hidden(), self.dropout_rate), 0)`
`69`	`66`	`self.sess=tf.Session()`
`70`	`67`	`self.saver=tf.train.Saver().restore(self.sess,self.model_path)`
`71`	`68`	`self.output_no_softmax=tf.matmul(self.hidden_layer,self.full_connected_weight)+self.full_connected_bias`
`@@ -118,27 +115,19 @@ def fit(self, epochs=50, interval=5):`
`118`	`115`	`withtf.Session()assess:`
`119`	`116`	`tf.global_variables_initializer().run()`
`120`	`117`	`sess.graph.finalize()`
`121`		`-start=0`
	`118`	`+sess.run(self.input_data_iterator.initializer)`
	`119`	`+`
`122`	`120`	`foriinrange(1,epochs+1):`
`123`	`121`	`print('epoch:'+str(i))`
`124`	`122`	`forjinrange(self.data_count//self.batch_size):`
`125`		`-ifstart+self.batch_size<self.data_count:`
`126`		`-indices=list(range(start,start+self.batch_size))`
`127`		`-start+=self.batch_size`
`128`		`-else:`
`129`		`-new_start=self.batch_size-self.data_count+start`
`130`		`-indices=list(range(start,self.data_count))+list(range(0,new_start))`
`131`		`-start=new_start`
`132`		`-words,primary,secondary,labels=sess.run([self.input_words,self.input_primary,self.input_secondary,`
`133`		`-self.input_labels],feed_dict={self.input_indices:indices})`
`134`		`-# words, primary, secondary, labels = self.load_batch()`
	`123`	`+words,primary,secondary,labels=sess.run(self.iterator)`
`135`	`124`	`character_embeds,primary_embeds=sess.run([self.character_lookup,self.position_lookup],`
`136`	`125`	`feed_dict={self.input_characters:words,`
`137`	`126`	`self.input_position:primary})`
`138`	`127`	`secondary_embeds=sess.run(self.position_lookup,feed_dict={self.input_position:secondary})`
`139`		`-input=sess.run(self.emebd_concat,feed_dict={self.character_embed_holder:character_embeds,`
`140`		`-self.primary_embed_holder:primary_embeds,`
`141`		`-self.secondary_embed_holder:secondary_embeds})`
	`128`	`+input=sess.run(self.embedded_concat,feed_dict={self.character_embed_holder:character_embeds,`
	`129`	`+self.primary_embed_holder:primary_embeds,`
	`130`	`+self.secondary_embed_holder:secondary_embeds})`
`142`	`131`	`# sess.run(self.train_model, feed_dict={self.input: input, self.input_relation: batch['label']})`
`143`	`132`	`sess.run(self.train_cross_entropy_model,feed_dict={self.input:input,self.input_relation:labels})`
`144`	`133`	`ifi%interval==0:`
`@@ -156,10 +145,10 @@ def predict(self, words, primary, secondary):`
`156`	`145`	`feed_dict={self.input_characters:words,`
`157`	`146`	`self.input_position:primary})`
`158`	`147`	`secondary_embeds=self.sess.run(self.position_lookup,feed_dict={self.input_position:secondary})`
`159`		`-input=self.sess.run(self.emebd_concat,feed_dict={self.character_embed_holder:character_embeds,`
`160`		`-self.primary_embed_holder:primary_embeds,`
`161`		`-self.secondary_embed_holder:secondary_embeds})`
`162`		`-output=self.sess.run(self.output,feed_dict={self.input:input})`
	`148`	`+input_matrix=self.sess.run(self.embedded_concat,feed_dict={self.character_embed_holder:character_embeds,`
	`149`	`+self.primary_embed_holder:primary_embeds,`
	`150`	`+self.secondary_embed_holder:secondary_embeds})`
	`151`	`+output=self.sess.run(self.output,feed_dict={self.input:input_matrix})`
`163`	`152`	`returnnp.argmax(output,1)`
`164`	`153`
`165`	`154`	`defevaluate(self):`
`@@ -190,23 +179,6 @@ def get_score(self, predict, true):`
`190`	`179`	`print(prec,recall,f1)`
`191`	`180`	`returnprec,recall,f1`
`192`	`181`
`193`		`-defload_batch(self):`
`194`		`-ifself.start+self.batch_size>self.data_count:`
`195`		`-new_start=self.start+self.batch_size-self.data_count`
`196`		`-words=np.concatenate([self.words[self.start:],self.words[:new_start]])`
`197`		`-primary=np.concatenate([self.primary[self.start:],self.primary[:new_start]])`
`198`		`-secondary=np.concatenate([self.secondary[self.start:],self.secondary[:new_start]])`
`199`		`-labels=np.concatenate([self.labels[self.start:],self.labels[:new_start]])`
`200`		`-self.start=new_start`
`201`		`-else:`
`202`		`-new_start=self.start+self.batch_size`
`203`		`-words=self.words[self.start:new_start]`
`204`		`-primary=self.primary[self.start:new_start]`
`205`		`-secondary=self.secondary[self.start:new_start]`
`206`		`-labels=self.labels[self.start:new_start]`
`207`		`-self.start=new_start`
`208`		`-returnwords,primary,secondary,labels`
`209`		`-`
`210`	`182`	`def__weight_variable(self,shape,name):`
`211`	`183`	`initial=tf.truncated_normal(shape,stddev=0.1,dtype=self.dtype)`
`212`	`184`	`returntf.Variable(initial,name=name)`

`‎python/dnlp/core/re_cnn_base.py‎`

Lines changed: 5 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -4,8 +4,9 @@`
`4`	`4`	`fromdnlp.configimportRECNNConfig`
`5`	`5`	`fromdnlp.utils.constantimportBATCH_PAD,BATCH_PAD_VAL`
`6`	`6`
	`7`	`+`
`7`	`8`	`classRECNNBase(object):`
`8`		`-def__init__(self,config:RECNNConfig,dict_path:str,data_path:str=''):`
	`9`	`+def__init__(self,config:RECNNConfig,dict_path:str,data_path:str=''):`
`9`	`10`	`self.window_size=config.window_size`
`10`	`11`	`self.filter_size=config.filter_size`
`11`	`12`	`self.learning_rate=config.learning_rate`
`@@ -18,8 +19,8 @@ def __init__(self, config:RECNNConfig,dict_path:str,data_path:str=''):`
`18`	`19`	`self.dictionary=self.read_dictionary(dict_path)`
`19`	`20`	`self.words_size=len(self.dictionary)`
`20`	`21`
`21`		`-defread_dictionary(self,dict_path):`
`22`		`-withopen(dict_path,encoding='utf-8')asf:`
	`22`	`+defread_dictionary(self,dict_path):`
	`23`	`+withopen(dict_path,encoding='utf-8')asf:`
`23`	`24`	`content=f.read().splitlines()`
`24`	`25`	`dictionary= {}`
`25`	`26`	`dict_arr=map(lambdaitem:item.split(' '),content)`
`@@ -47,5 +48,4 @@ def load_data(self):`
`47`	`48`	`sentence_labels=np.zeros([self.relation_count])`
`48`	`49`	`sentence_labels[sentence['type']]=1`
`49`	`50`	`labels.append(sentence_labels)`
`50`		`-returnnp.array(words,np.int32),np.array(primary,np.int32),np.array(secondary,np.int32),np.array(labels,`
`51`		`-np.float32)`
	`51`	`+returnnp.array(words,np.int32),np.array(primary,np.int32),np.array(secondary,np.int32),np.array(labels,np.float32)`

`‎python/dnlp/data_process/process_emr.py‎`

Lines changed: 1 addition & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -204,8 +204,7 @@ def generate_re_mutli_training_data(self):`
`204`	`204`	`word_indices=self.map_to_indices(annotation['words'])`
`205`	`205`	`fortrue_rel_nameinannotation['true_relations']:`
`206`	`206`	`true_rel=annotation['true_relations'][true_rel_name]`
`207`		`-train_data.append({'words':word_indices,'primary':true_rel['primary'],'secondary':true_rel['secondary'],`
`208`		`-'type':self.relation_category_labels[true_rel['type']]})`
	`207`	`+train_data.append({'words':word_indices,'primary':true_rel['primary'],'secondary':true_rel['secondary'],'type':self.relation_category_labels[true_rel['type']]})`
`209`	`208`	`returntrain_data`
`210`	`209`
`211`	`210`	`defmap_to_indices(self,words):`

`‎python/scripts/rel.py‎`

Lines changed: 12 additions & 10 deletions

Original file line number	Diff line number	Diff line change
`@@ -23,15 +23,16 @@ def train_re_cnn():`
`23`	`23`	`cbow_path=BASE_FOLDER+'emr_word_light_cbow.npy'`
`24`	`24`	`forwinWINDOW_LIST:`
`25`	`25`	`start=time.time()`
`26`		`-train_re_cnn_by_window(w,data_path_two_directed,embedding_path=cbow_path,remark='_cbow_directed')`
`27`		`-train_re_cnn_by_window(w,data_path_two_directed,embedding_path=embedding_path,remark='_skip_gram_directed')`
`28`		`-train_re_cnn_by_window(w,data_path_multi_directed,relation_count=28,embedding_path=cbow_path,`
`29`		`-remark='_cbow_directed')`
`30`		`-train_re_cnn_by_window(w,data_path_multi_directed,relation_count=28,embedding_path=embedding_path,`
`31`		`-remark='_skip_gram_directed')`
	`26`	`+#train_re_cnn_by_window(w, data_path_two_directed, embedding_path=cbow_path, remark='_cbow_directed')`
	`27`	`+#train_re_cnn_by_window(w, data_path_two_directed, embedding_path=embedding_path, remark='_skip_gram_directed')`
	`28`	`+#train_re_cnn_by_window(w, data_path_multi_directed, relation_count=28, embedding_path=cbow_path,`
	`29`	`+# remark='_cbow_directed')`
	`30`	`+#train_re_cnn_by_window(w, data_path_multi_directed, relation_count=28, embedding_path=embedding_path,`
	`31`	`+# remark='_skip_gram_directed')`
`32`	`32`	`# train_re_cnn_by_window(w,data_path_two)`
`33`		`-train_re_cnn_by_window(w,data_path_multi_directed,28,remark='_directed')`
`34`	`33`	`train_re_cnn_by_window(w,data_path_two_directed,remark='_directed')`
	`34`	`+train_re_cnn_by_window(w,data_path_multi_directed,28,remark='_directed')`
	`35`	`+`
`35`	`36`	`print(time.time()-start)`
`36`	`37`
`37`	`38`
`@@ -161,9 +162,10 @@ def test_re_cnn_with_embedding():`
`161`	`162`	`# test_re_cnn()`
`162`	`163`	`# test_re_cnn_by_window((2,),epoch=1,embedding_path=SKIP_GRAM_PATH,remark='_skip_gram')`
`163`	`164`	`# test_re_cnn_by_window((2,), epoch=5, embedding_path=CBOW_PATH, remark='_cbow_directed')`
`164`		`-get_re_cnn_result()`
`165`		`-get_re_cnn_result('multi')`
`166`		`-# test_re_cnn(remark='_directed')`
	`165`	`+# get_re_cnn_result()`
	`166`	`+# get_re_cnn_result('multi')`
	`167`	`+test_re_cnn_by_window((2,3,4),50,mode='two',relation_count=2,remark='_directed')`
	`168`	`+test_re_cnn(remark='_directed')`
`167`	`169`	`# test_re_cnn('multi')`
`168`	`170`	`# test_re_cnn_with_embedding()`
`169`	`171`	`# test_single_model((2, 3, 4), 1)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 6d389a8

File tree

4 files changed

4 files changed

`‎python/dnlp/core/re_cnn.py‎`

`‎python/dnlp/core/re_cnn_base.py‎`

`‎python/dnlp/data_process/process_emr.py‎`

`‎python/scripts/rel.py‎`

0 commit comments