supercoderhawk/DeepLearning_NLP (Public)

Notifications: You must be signed in to change notification settings
Fork 40
Star 159

Commit ed380f9

committed

add some codes

1 parent c8b87f6, commit ed380f9 (Copy full SHA for ed380f9)

File tree

8 files changed

+184

-26

lines changed

python
- dnlp
  - core
    - cbow.py
    - re_cnn.py
  - data_process
- scripts

8 files changed

+184

-26

lines changed

`‎python/dnlp/core/cbow.py‎`

Lines changed: 0 additions & 1 deletion

This file was deleted.

`‎python/dnlp/core/re_cnn.py‎`

Lines changed: 10 additions & 15 deletions

Original file line number	Diff line number	Diff line change
`@@ -6,11 +6,10 @@`
`6`	`6`	`fromdnlp.configimportRECNNConfig`
`7`	`7`
`8`	`8`
`9`		`-`
`10`	`9`	`classRECNN(RECNNBase):`
`11`	`10`	`def__init__(self,config:RECNNConfig,dtype:type=tf.float32,dict_path:str='',mode:str='train',`
`12`	`11`	`data_path:str='',relation_count:int=2,model_path:str='',embedding_path:str='',`
`13`		`-remark:str=''):`
	`12`	`+remark:str=''):`
`14`	`13`	`tf.reset_default_graph()`
`15`	`14`	`RECNNBase.__init__(self,config,dict_path)`
`16`	`15`	`self.dtype=dtype`
`@@ -44,8 +43,8 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str`
`44`	`43`	`self.full_connected_weight=self.__weight_variable([self.filter_size*len(self.window_size),self.relation_count],`
`45`	`44`	`name='full_connected_weight')`
`46`	`45`	`self.full_connected_bias=self.__weight_variable([self.relation_count],name='full_connected_bias')`
`47`		`-self.input_words_lookup=tf.nn.embedding_lookup(self.input_words,self.input_indices)`
`48`		`-self.input_primary_lookup=tf.nn.embedding_lookup(self.input_primary,self.input_indices)`
	`46`	`+self.input_words_lookup=tf.nn.embedding_lookup(self.input_words,self.input_indices)`
	`47`	`+self.input_primary_lookup=tf.nn.embedding_lookup(self.input_primary,self.input_indices)`
`49`	`48`	`self.input_secondary_lookup=tf.nn.embedding_lookup(self.input_secondary,self.input_indices)`
`50`	`49`	`self.input_labels_lookup=tf.nn.embedding_lookup(self.input_labels,self.input_indices)`
`51`	`50`	`self.position_lookup=tf.nn.embedding_lookup(self.position_embedding,self.input_position)`
`@@ -59,8 +58,6 @@ def __init__(self, config: RECNNConfig, dtype: type = tf.float32, dict_path: str`
`59`	`58`	`self.emebd_concat=tf.expand_dims(`
`60`	`59`	`tf.concat([self.character_embed_holder,self.primary_embed_holder,self.secondary_embed_holder],2),3)`
`61`	`60`
`62`		`-`
`63`		`-`
`64`	`61`	`ifself.mode=='train':`
`65`	`62`	`self.start=0`
`66`	`63`	`self.hidden_layer=tf.layers.dropout(self.get_hidden(),self.dropout_rate)`
`@@ -126,14 +123,14 @@ def fit(self, epochs=50, interval=5):`
`126`	`123`	`print('epoch:'+str(i))`
`127`	`124`	`forjinrange(self.data_count//self.batch_size):`
`128`	`125`	`ifstart+self.batch_size<self.data_count:`
`129`		`-indices=list(range(start,start+self.batch_size))`
	`126`	`+indices=list(range(start,start+self.batch_size))`
`130`	`127`	`start+=self.batch_size`
`131`	`128`	`else:`
`132`		`-new_start=self.batch_size-self.data_count+start`
`133`		`-indices=list(range(start,self.data_count))+list(range(0,new_start))`
	`129`	`+new_start=self.batch_size-self.data_count+start`
	`130`	`+indices=list(range(start,self.data_count))+list(range(0,new_start))`
`134`	`131`	`start=new_start`
`135`		`-words,primary,secondary,labels=sess.run([self.input_words,self.input_primary,self.input_secondary,`
`136`		`-self.input_labels],feed_dict={self.input_indices:indices})`
	`132`	`+words,primary,secondary,labels=sess.run([self.input_words,self.input_primary,self.input_secondary,`
	`133`	`+self.input_labels],feed_dict={self.input_indices:indices})`
`137`	`134`	`# words, primary, secondary, labels = self.load_batch()`
`138`	`135`	`character_embeds,primary_embeds=sess.run([self.character_lookup,self.position_lookup],`
`139`	`136`	`feed_dict={self.input_characters:words,`
`@@ -147,10 +144,10 @@ def fit(self, epochs=50, interval=5):`
`147`	`144`	`ifi%interval==0:`
`148`	`145`	`ifself.relation_count==2:`
`149`	`146`	`model_name='../dnlp/models/re_{2}/{0}-{1}{3}.ckpt'.format(i,'_'.join(map(str,self.window_size)),`
`150`		`-'two',self.remark)`
	`147`	`+'two',self.remark)`
`151`	`148`	`else:`
`152`	`149`	`model_name='../dnlp/models/re_{2}/{0}-{1}{3}.ckpt'.format(i,'_'.join(map(str,self.window_size)),`
`153`		`-'multi',self.remark)`
	`150`	`+'multi',self.remark)`
`154`	`151`
`155`	`152`	`self.saver.save(sess,model_name)`
`156`	`153`
`@@ -210,8 +207,6 @@ def load_batch(self):`
`210`	`207`	`self.start=new_start`
`211`	`208`	`returnwords,primary,secondary,labels`
`212`	`209`
`213`		`-`
`214`		`-`
`215`	`210`	`def__weight_variable(self,shape,name):`
`216`	`211`	`initial=tf.truncated_normal(shape,stddev=0.1,dtype=self.dtype)`
`217`	`212`	`returntf.Variable(initial,name=name)`

`‎python/dnlp/data_process/process_brat.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -3,4 +3,4 @@`
`3`	`3`
`4`	`4`	`classProcessBrat(object):`
`5`	`5`	`def__init__(self):`
`6`		`-pass`
	`6`	`+pass`

`‎python/dnlp/data_process/process_conll.py‎`

Lines changed: 8 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,8 @@`
	`1`	`+# -- coding:utf-8 --`
	`2`	`+fromdnlp.data_process.processorimportPreprocessor`
	`3`	`+classProcessConll(Preprocessor):`
	`4`	`+def__init__(self,*,files:tuple,name:str,base_folder:str='dnlp/data/',dict_path:str=''):`
	`5`	`+ifdict_path:`
	`6`	`+Preprocessor.__init__(self,base_folder=base_folder,dict_path=dict_path)`
	`7`	`+else:`
	`8`	`+Preprocessor.__init__(self,base_folder=base_folder,files=files,dict_path=base_folder+name+'.utf8')`

`‎python/dnlp/data_process/process_emr.py‎`

Lines changed: 128 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -3,11 +3,15 @@`
`3`	`3`	`importre`
`4`	`4`	`importpickle`
`5`	`5`	`importrandom`
	`6`	`+importjson`
	`7`	`+importpprint`
	`8`	`+fromcollectionsimportOrderedDict`
	`9`	`+fromitertoolsimportchain`
`6`	`10`	`fromdnlp.utils.constantimportUNK`
`7`	`11`
`8`	`12`	`RE_SAPCE=re.compile('[ ]+')`
`9`	`13`
`10`		`-`
	`14`	`+# print(pprint.pformat([1,[[2]],3,4444444444,77777777777777777777777777],indent=2,width=10))`
`11`	`15`	`classProcessEMR(object):`
`12`	`16`	`def__init__(self,base_folder:str,dict_path:str='',mode='train',directed=False):`
`13`	`17`	`self.base_folder=base_folder`
`@@ -35,6 +39,13 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed`
`35`	`39`	`'DoseOf':'用量','FamilyOf':'家族成员','ModifierOf':'其他修饰词','UseMedicine':'用药',`
`36`	`40`	`'LeadTo':'导致','Find':'发现','Confirm':'证实','Adopt':'采取','Take':'用药',`
`37`	`41`	`'Limit':'限定','AlongWith':'伴随','Complement':'补足'}`
	`42`	`+self.entity_categories= {'Sign':'体征','Symptom':'症状','Part':'部位','Property':'属性','Degree':'程度',`
	`43`	`+'Quality':'定性值','Quantity':'定量值','Unit':'单位','Time':'时间','Date':'日期',`
	`44`	`+'Result':'结果',`
	`45`	`+'Disease':'疾病','DiseaseType':'疾病类型','Examination':'检查','Location':'地址',`
	`46`	`+'Medicine':'药物','Spec':'规格','Usage':'用法','Dose':'用量','Treatment':'治疗',`
	`47`	`+'Family':'家族史',`
	`48`	`+'Modifier':'修饰词'}`
`38`	`49`	`self.relation_category_labels= {}`
`39`	`50`	`relation_category_index=0`
`40`	`51`	`forrelation_categoryinself.relation_categories:`
`@@ -45,6 +56,9 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed`
`45`	`56`	`pickle.dump(self.relation_category_labels,f)`
`46`	`57`	`self.two_categories=self.generate_re_two_training_data()`
`47`	`58`	`self.multi_categories=self.generate_re_mutli_training_data()`
	`59`	`+self.export_structured_emr()`
	`60`	`+self.data=self.read_file()`
	`61`	`+self.export()`
`48`	`62`	`self.save_data()`
`49`	`63`
`50`	`64`	`defstatistics(self):`
`@@ -57,6 +71,114 @@ def statistics(self):`
`57`	`71`	`print(false_count/all_count)`
`58`	`72`	`print(all_count)`
`59`	`73`
	`74`	`+defread_file(self):`
	`75`	`+data= {}`
	`76`	`+forfinself.files:`
	`77`	`+file_data= {'entities': {},'relations': {}}`
	`78`	`+withopen(self.data_folder+'train/'+f+'.ann',encoding='utf-8')asf:`
	`79`	`+entries= [l.split('\t')forlinf.read().splitlines()ifl]`
	`80`	`+forentryinentries:`
	`81`	`+idx=entry[0]`
	`82`	`+ifidx.startswith('T'):`
	`83`	`+e_type,start,end=entry[1].split(' ')`
	`84`	`+e_type=self.entity_categories[e_type]`
	`85`	`+start=int(start)`
	`86`	`+end=int(end)`
	`87`	`+file_data['entities'][idx]= {'text':entry[2],'type':e_type}`
	`88`	`+elifidx.startswith('R'):`
	`89`	`+r_type,r1,r2=entry[1].split(' ')`
	`90`	`+r1=r1[r1.index(':')+1:]`
	`91`	`+r2=r2[r2.index(':')+1:]`
	`92`	`+ifr1notinfile_data['relations']:`
	`93`	`+file_data['relations'][r1]= [(r2,r_type)]`
	`94`	`+else:`
	`95`	`+file_data['relations'][r1].append((r2,r_type))`
	`96`	`+data[f]=file_data`
	`97`	`+returndata`
	`98`	`+`
	`99`	`+defexport(self):`
	`100`	`+forfilename,file_datainself.data.items():`
	`101`	`+filename=os.path.basename(filename.name[:-4])`
	`102`	`+result= {}`
	`103`	`+entities=file_data['entities'].copy()`
	`104`	`+relations=file_data['relations']`
	`105`	`+fore_id,entityinentities.items():`
	`106`	`+e_type=entity['type']`
	`107`	`+# e = entity['text']`
	`108`	`+ife_idinrelations:`
	`109`	`+attribute= {}`
	`110`	`+forr2,rtinrelations[e_id]:`
	`111`	`+e2=file_data['entities'][r2].copy()`
	`112`	`+# e2['name'] = self.relation_categories[rt]`
	`113`	`+# attribute.append(e2)`
	`114`	`+ifnotattribute.get(self.relation_categories[rt]):`
	`115`	`+attribute[self.relation_categories[rt]]=e2['text']`
	`116`	`+else:`
	`117`	`+iftype(attribute[self.relation_categories[rt]])==str:`
	`118`	`+attribute[self.relation_categories[rt]]= [attribute[self.relation_categories[rt]],e2['text']]`
	`119`	`+else:`
	`120`	`+attribute[self.relation_categories[rt]].append(e2['text'])`
	`121`	`+ifnotresult.get(e_type):`
	`122`	`+result[e_type]= [{entity['text']:attribute}]`
	`123`	`+else:`
	`124`	`+result[e_type].append({entity['text']:attribute})`
	`125`	`+else:`
	`126`	`+ifnotresult.get(e_type):`
	`127`	`+result[e_type]= [entity['text']]`
	`128`	`+else:`
	`129`	`+result[e_type].append(entity['text'])`
	`130`	`+new_result= {}`
	`131`	`+fork,vinresult.items():`
	`132`	`+nv= [valforvalinviftype(val)!=str]`
	`133`	`+ifnv:`
	`134`	`+new_result[k]=nv`
	`135`	`+# entity['attributes'] = attribute`
	`136`	`+# result.append(entity)`
	`137`	`+withopen(self.base_folder+'structured/'+filename+'.json','w',encoding='utf-8')asf:`
	`138`	`+f.write(pprint.pformat(new_result,width=100).replace('\'','"'))`
	`139`	`+# json.dump(new_result, f, ensure_ascii=False)`
	`140`	`+`
	`141`	`+defexport_structured_emr(self):`
	`142`	`+annotations= {}`
	`143`	`+forsentenceinself.annotations:`
	`144`	`+sentence['start']=min([e['start']foreinsentence['entities'].values()])`
	`145`	`+ifsentence['file']notinannotations:`
	`146`	`+annotations[sentence['file']]= [sentence]`
	`147`	`+else:`
	`148`	`+annotations[sentence['file']].append(sentence)`
	`149`	`+structured_result= []`
	`150`	`+forannotationinannotations.values():`
	`151`	`+filename=annotation[0]['file']+'.json'`
	`152`	`+result= []`
	`153`	`+entities=list(chain(*[a['entities'].values()forainannotation]))`
	`154`	`+entities=sorted(entities,key=lambdae:e['start'])`
	`155`	`+entities_dict= {e['id']:eforeinentities}`
	`156`	`+true_relations=list(chain(*[a['true_relations'].values()forainannotation]))`
	`157`	`+relations= {}`
	`158`	`+forrintrue_relations:`
	`159`	`+ifrelations.get(r['first']):`
	`160`	`+relations[r['first']].append((r['second'],r['type']))`
	`161`	`+print(relations)`
	`162`	`+else:`
	`163`	`+relations[r['first']]= [(r['second'],r['type'])]`
	`164`	`+foreinentities:`
	`165`	`+e_id=e['id']`
	`166`	`+entity= {'text':e['text'],'start':e['start'],'length':e['length'],'type':e['type']}`
	`167`	`+ife_idinrelations:`
	`168`	`+attributes= []`
	`169`	`+foree_id,ee_typeinrelations[e_id]:`
	`170`	`+ee=entities_dict[ee_id]`
	`171`	`+attributes.append({'name':self.relation_categories[ee_type],'text':ee['text'],'start':ee['start'],`
	`172`	`+'length':ee['length'],'type':ee['type']})`
	`173`	`+entity.update({'attributes':attributes})`
	`174`	`+result.append(entity)`
	`175`	`+# with open(self.base_folder+'structured/'+filename,'w',encoding='utf-8') as f:`
	`176`	`+# json.dump(result,f,ensure_ascii=False)`
	`177`	`+# structured_result.append(result)`
	`178`	`+`
	`179`	`+# annotations = OrderedDict(sorted(annotations.items(),key=lambda i:i[1]['start']))`
	`180`	`+returnannotations`
	`181`	`+`
`60`	`182`	`defgenerate_re_two_training_data(self):`
`61`	`183`	`train_data= []`
`62`	`184`	`forannotationinself.annotations:`
`@@ -125,7 +247,7 @@ def read_annotations(self, directed=False):`
`125`	`247`	`all_sentences= []`
`126`	`248`	`forfileinself.files:`
`127`	`249`	`filename=self.data_folder+self.mode+'/'+file`
`128`		`-sentence_dict,periods=self.read_entities_in_single_file(filename+'.txt',filename+'.ann')`
	`250`	`+sentence_dict,periods=self.read_entities_in_single_file(filename+'.txt',filename+'.ann',filename)`
`129`	`251`
`130`	`252`	`sentence_words=self.read_cws_file(self.data_folder+'cws/'+file+'.cws',periods)`
`131`	`253`	`sentences= [''.join(s)forsinsentence_words]`
`@@ -174,7 +296,7 @@ def read_relation_in_single_file(self, ann_file, data, directed=False):`
`174`	`296`	`# sentence['true_relations'] = {}`
`175`	`297`	`ifprimaryinentitiesandsecondaryinentities:`
`176`	`298`	`rel= {'id':id,'primary':entities[primary]['index'],'secondary':entities[secondary]['index'],`
`177`		`-'type':entry[1]}`
	`299`	`+'type':entry[1],'first':primary,'second':secondary}`
`178`	`300`	`ifsentence.get('true_realtions'):`
`179`	`301`	`sentence['true_relations'][id]=rel`
`180`	`302`	`else:`
`@@ -245,7 +367,8 @@ def read_cws_file(self, cws_file, periods):`
`245`	`367`
`246`	`368`	`returnsentence_words`
`247`	`369`
`248`		`-defread_entities_in_single_file(self,raw_file,ann_file):`
	`370`	`+defread_entities_in_single_file(self,raw_file,ann_file,common_name):`
	`371`	`+common_name=os.path.basename(common_name)`
`249`	`372`	`data= {}`
`250`	`373`	`withopen(raw_file,encoding='utf-8')asr:`
`251`	`374`	`sentence=r.read()`
`@@ -323,6 +446,7 @@ def read_entities_in_single_file(self, raw_file, ann_file):`
`323`	`446`	`entities[id]=entity`
`324`	`447`	`else:`
`325`	`448`	`sentence_dict[new_sentence]['entities']= {id:entity}`
	`449`	`+sentence_dict[new_sentence]['file']=common_name`
`326`	`450`	`break`
`327`	`451`	`else:`
`328`	`452`	`entity= {'id':id,'start':start,'length':end-start,'text':text,'type':entry[1]}`

`‎python/scripts/cws_ner.py‎`

Lines changed: 4 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ def test_cws():`
`32`	`32`
`33`	`33`	`deftrain_emr_cws():`
`34`	`34`	`data_path='../dnlp/data/emr/emr_cws.pickle'`
`35`		`-config=DnnCrfConfig()`
	`35`	`+config=DnnCrfConfig(skip_left=1,skip_right=1)`
`36`	`36`	`dnncrf=DnnCrf(config=config,data_path=data_path,nn='lstm',task='cws',remark='emr_cws')`
`37`	`37`	`dnncrf.fit()`
`38`	`38`
`@@ -404,9 +404,9 @@ def export_cws(data, filename):`
`404`	`404`	`train_cws()`
`405`	`405`	`elifargs.emr:`
`406`	`406`	`# train_emr_old_method()`
`407`		`-#train_emr_cws()`
`408`		`-train_emr_word_skipgram()`
`409`		`-train_emr_word_cbow()`
	`407`	`+train_emr_cws()`
	`408`	`+#train_emr_word_skipgram()`
	`409`	`+#train_emr_word_cbow()`
`410`	`410`	`# train_emr_with_embeddings()`
`411`	`411`	`# train_emr_ngram('mlp')`
`412`	`412`	`# train_emr_ngram('rnn')`

`‎python/scripts/init_datasets.py‎`

Lines changed: 4 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -82,4 +82,7 @@ def build_emr_cws_files(base_folder):`
`82`	`82`	`# copy()`
`83`	`83`	`# build_cws_datasets()`
`84`	`84`	`# build_emr_datasets()`
`85`		`-build_emr_re()`
	`85`	`+# build_emr_re()`
	`86`	`+base_folder='../dnlp/data/emr/'`
	`87`	`+dict_path=base_folder+'emr_merged_word_dict.utf8'`
	`88`	`+ProcessEMR(base_folder=base_folder,dict_path=dict_path,directed=True)`

`‎python/scripts/pipeline.py‎`

Lines changed: 29 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,29 @@`
	`1`	`+# -- coding:utf-8 --`
	`2`	`+fromdnlp.config.sequence_labeling_configimportDnnCrfConfig`
	`3`	`+fromdnlp.core.dnn_crfimportDnnCrf`
	`4`	`+defner(sentence):`
	`5`	`+data_path=''`
	`6`	`+config_bi_bigram=DnnCrfConfig(skip_left=0,skip_right=0)`
	`7`	`+lstmcrf=DnnCrf(config=config_bi_bigram,task='ner',data_path=data_path,nn='lstm',remark='lstm')`
	`8`	`+returnlstmcrf.predict(sentence)`
	`9`	`+defcws(sentence):`
	`10`	`+config=DnnCrfConfig()`
	`11`	`+model_path='../dnlp/models/emr/cws-lstm-emr_cws-20.ckpt'`
	`12`	`+dnncrf=DnnCrf(config=config,model_path=model_path,mode='predict',nn='lstm',task='cws',remark='emr_cws')`
	`13`	`+returndnncrf.predict(sentence)`
	`14`	`+`
	`15`	`+defprepare_rel(sentence):`
	`16`	`+cws_res=cws(sentence)`
	`17`	`+`
	`18`	`+`
	`19`	`+defrel():`
	`20`	`+pass`
	`21`	`+`
	`22`	`+defget_sentences(filename):`
	`23`	`+withopen('../dnlp/data/emr/emr_paper/train/'+filename,encoding='utf-8')asf:`
	`24`	`+returnf.read().split('。')`
	`25`	`+`
	`26`	`+if__name__=='__main__':`
	`27`	`+sentences=get_sentences('996716_admission.txt')`
	`28`	`+forsentenceinsentences:`
	`29`	`+prepare_rel(sentence)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit ed380f9

File tree

8 files changed

8 files changed

`‎python/dnlp/core/cbow.py‎`

`‎python/dnlp/core/re_cnn.py‎`

`‎python/dnlp/data_process/process_brat.py‎`

`‎python/dnlp/data_process/process_conll.py‎`

`‎python/dnlp/data_process/process_emr.py‎`

`‎python/scripts/cws_ner.py‎`

`‎python/scripts/init_datasets.py‎`

`‎python/scripts/pipeline.py‎`

0 commit comments