Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit eba58c0

Browse files
update codes
1 parent 79332be, commit eba58c0

File tree

10 files changed

+473
-93
lines changed

10 files changed

+473
-93
lines changed

‎python/__init__.py‎

Whitespace-only changes.

‎python/dnlp/core/distant_supervision.py‎

Lines changed: 53 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -25,13 +25,61 @@ def construct_kb(relations):
2525
print(t)
2626
returnknowledge_base
2727

28+
2829
KB=construct_kb(read_relation('../dnlp/data/emr/emr_paper/emr_relation.rel'))
29-
defextract_relaction(entity1,entity2):
30-
key=':'.join((str(entity1),str(entity2)))
31-
ifkeyinKB:
32-
returnKB[key][2]
30+
31+
32+
defread_dictionary(dict_path:str,reverse=False):
33+
dictionary= {}
34+
withopen(dict_path,encoding='utf8')asd:
35+
items=d.readlines()
36+
foriteminitems:
37+
pair=item.split(' ')
38+
dictionary[pair[0]]=int(pair[1])
39+
ifreverse:
40+
returndictionary,dict(zip(dictionary.values(),dictionary.keys()))
41+
else:
42+
returndictionary
43+
44+
45+
BASE_FOLDER='../dnlp/data/emr/'
46+
DICT_PATH=BASE_FOLDER+'emr_merged_word_dict.utf8'
47+
withopen('../dnlp/data/emr/rel_names','rb')asf:
48+
REL_PAIR_NAMES_INIT=pickle.load(f)
49+
REL_PAIR_LIST=list([r.split(':')forrinREL_PAIR_NAMES_INIT])
50+
REL_NAME_IDX= {}
51+
relation_category_index=0
52+
REL_NAMES= {'PartOf':'部位','PropertyOf':'性质','DegreeOf':'程度','QualityValue':'定性值',
53+
'QuantityValue':'定量值','UnitOf':'单位','TimeOf':'持续时间','StartTime':'开始时间',
54+
'EndTime':'结束时间','Moment':'时间点','DateOf':'日期','ResultOf':'结果',
55+
'LocationOf':'地点','DiseaseTypeOf':'疾病分型分期','SpecOf':'规格','UsageOf':'用法',
56+
'DoseOf':'用量','FamilyOf':'家族成员','ModifierOf':'其他修饰词','UseMedicine':'用药',
57+
'LeadTo':'导致','Find':'发现','Confirm':'证实','Adopt':'采取','Take':'用药',
58+
'Limit':'限定','AlongWith':'伴随','Complement':'补足'}
59+
forrelation_categoryinREL_NAMES:
60+
REL_NAME_IDX[relation_category]=relation_category_index
61+
relation_category_index+=1
62+
63+
DICTIONARY=read_dictionary(DICT_PATH)
64+
65+
66+
defisdigit(s):
67+
try:
68+
float(s)
69+
returnTrue
70+
exceptExceptionase:
71+
returnFalse
72+
73+
74+
defextract_relaction(entity1,entity2,type1,type2):
75+
key=':'.join((str(entity1),str(entity2)))
76+
iftype2=='Quantity'andtype1+':'+type2inREL_PAIR_NAMES_INIT:
77+
returnREL_NAME_IDX[REL_PAIR_NAMES_INIT[type1+':'+type2]]
3378
else:
34-
return-1
79+
ifkeyinKB:
80+
returnKB[key][2]
81+
else:
82+
return-1
3583

3684

3785
if__name__=='__main__':

‎python/dnlp/core/dnn_crf_base.py‎

Lines changed: 17 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ def get_batch(self) -> (np.ndarray, np.ndarray, np.ndarray):
103103
self.batch_start=new_start
104104
returnself.indices2input(chs_batch),np.array(lls_batch,dtype=np.int32),np.array(len_batch,dtype=np.int32)
105105

106-
defviterbi(self,emission:np.ndarray,transition:np.ndarray,transition_init:np.ndarray,padding_length=-1):
106+
defviterbi(self,emission:np.ndarray,transition:np.ndarray,transition_init:np.ndarray,labels:np.ndarray=None,padding_length=-1):
107107
length=emission.shape[1]
108108
ifpadding_length==-1:
109109
padding_length=length
@@ -116,10 +116,14 @@ def viterbi(self, emission: np.ndarray, transition: np.ndarray, transition_init:
116116
fortinrange(self.tags_count):
117117
forprevinrange(self.tags_count):
118118
temp=path_score[prev][pos-1]+transition[prev][t]+emission[t][pos]
119+
iflabels[pos-1]!=prev:
120+
temp+=self.hinge_rate
119121
iftemp>=path_score[t][pos]:
120122
path[t][pos]=prev
121123
path_score[t][pos]=temp
122-
124+
foriinrange(self.tags_count):
125+
ifi!=labels[length-1]:
126+
path_score[i][length-1]+=self.tags_count
123127
max_index=np.argmax(path_score[:,-1])
124128
corr_path[length-1]=max_index
125129
foriinrange(length-1,0,-1):
@@ -170,27 +174,21 @@ def tags2words(self, sentence: str, tags_seq: np.ndarray) -> list:
170174

171175
returnwords
172176

173-
deftags2entities(self,sentence:str,tags_seq:np.ndarray,return_start:bool=False):
174-
entities= []
175-
entity_starts= []
176-
entity=''
177+
deftags2entities(self,sentence:str,tags_seq:np.ndarray,return_start:bool=True):
178+
entity_spans= {}
179+
entity_start=-1
177180

178181
fortag_index,taginenumerate(tags_seq):
179-
iftag==self.tags_map[TAG_OTHER]:
180-
continue
181-
eliftag==self.tags_map[TAG_BEGIN]:
182-
ifentity:
183-
entities.append((entity,tag_index))
184-
entity=sentence[tag_index]
185-
entity_starts.append(tag_index)
186-
else:
187-
entity+=sentence[tag_index]
188-
ifentity!='':
189-
entities.append((entity,len(sentence)-len(entity)))
182+
iftag==self.tags_map[TAG_BEGIN]:
183+
entity_spans[tag_index]=tag_index
184+
entity_start=tag_index
185+
eliftag==self.tags_map[TAG_INSIDE]:
186+
entity_spans[entity_start]=tag_index
187+
190188
ifreturn_start:
191-
returnentities,entity_starts
189+
return[(sentence[s:e+1],s)fors,einentity_spans.items()]
192190
else:
193-
returnentities
191+
return[sentence[s:e+1]fors,einentity_spans.items()]
194192

195193
deftag2sequences(self,tags_seq:np.ndarray):
196194
seq= []

‎python/dnlp/core/re_cnn.py‎

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,8 @@ def get_score(self, predict, true):
184184
print(prec,recall,f1)
185185
returnprec,recall,f1
186186

187+
def__conv_kernel_variable(self):
188+
pass
187189
def__weight_variable(self,shape,name):
188190
initial=tf.truncated_normal(shape,stddev=0.1,dtype=self.dtype)
189191
returntf.Variable(initial,name=name)

‎python/dnlp/data_process/process_emr.py‎

Lines changed: 112 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -6,11 +6,13 @@
66
importjson
77
importpprint
88
fromcollectionsimportOrderedDict
9-
fromitertoolsimportchain
9+
fromitertoolsimportchain,permutations
10+
fromcollectionsimportCounter
1011
fromdnlp.utils.constantimportUNK
1112

1213
RE_SAPCE=re.compile('[ ]+')
1314

15+
1416
# print(pprint.pformat([1,[[2]],3,4444444444,77777777777777777777777777],indent=2,width=10))
1517
classProcessEMR(object):
1618
def__init__(self,base_folder:str,dict_path:str='',mode='train',directed=False):
@@ -21,12 +23,14 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
2123
self.directed=directed
2224
withopen(self.relation_name_file,'rb')asf:
2325
self.category_name=pickle.load(f)
26+
withopen(self.relation_pair_file,'rb')asf:
27+
self.relation_pair_names=pickle.load(f)
2428
# self.reversed_category_name = dict(zip(self.category_name.values(),self.category_name.keys()))
2529
self.mode=mode
2630
ifself.mode=='train':
27-
self.window=5
31+
self.window=200
2832
else:
29-
self.window=100
33+
self.window=200
3034
self.dict_path=dict_path
3135
self.files=self.get_files()
3236
self.annotations=self.read_annotations(directed)
@@ -51,17 +55,53 @@ def __init__(self, base_folder: str, dict_path: str = '', mode='train', directed
5155
forrelation_categoryinself.relation_categories:
5256
self.relation_category_labels[relation_category]=relation_category_index
5357
relation_category_index+=1
54-
print(len(self.relation_category_labels))
58+
#print(len(self.relation_category_labels))
5559
withopen(self.base_folder+'relation_index.pickle','wb')asf:
5660
pickle.dump(self.relation_category_labels,f)
5761
self.two_categories=self.generate_re_two_training_data()
5862
self.multi_categories=self.generate_re_mutli_training_data()
59-
withopen(self.data_folder+'/emr_relation.rel','wb')asf:
60-
pickle.dump(self.multi_categories,f)
63+
ifmode=='train'anddirected:
64+
withopen(self.data_folder+'/emr_relation.rel','wb')asf:
65+
pickle.dump(self.multi_categories,f)
6166
self.export_structured_emr()
6267
self.data=self.read_file()
6368
self.export()
6469
self.save_data()
70+
self.export_type_dict()
71+
self.export_relations()
72+
73+
defexport_relations(self):
74+
data= {}
75+
forannotationinself.annotations:
76+
filename=annotation['file']
77+
entities=annotation['entities']
78+
79+
forrelationinannotation['true_relations'].values():
80+
ent1=entities[relation['first']]
81+
ent2=entities[relation['second']]
82+
rel=OrderedDict(
83+
{'ent1':ent1['text'],'ent2':ent2['text'],'ent1_type':ent1['type'],'ent2_type':ent2['type'],
84+
'rel_type':relation['type']})
85+
iffilenamenotindata:
86+
data[filename]= [rel]
87+
else:
88+
data[filename].append(rel)
89+
90+
withopen(self.data_folder+'/emr_test_rel.pickle','wb')asf:
91+
pickle.dump(data,f)
92+
93+
defexport_type_dict(self):
94+
entity_dict= {}
95+
forannotationinself.annotations:
96+
forentityinannotation['entities'].values():
97+
entity_text=entity['text']
98+
entity_type=entity['type']
99+
ifentity_textnotinentity_dict:
100+
entity_dict[entity_text]= [entity_type]
101+
else:
102+
entity_dict[entity_text].append(entity_type)
103+
entity_dict= {k:Counter(v).most_common(1)[0][0]fork,vinentity_dict.items()}
104+
# print(len(entity_dict))
65105

66106
defstatistics(self):
67107
true_count=0
@@ -77,7 +117,7 @@ def read_file(self):
77117
data= {}
78118
forfinself.files:
79119
file_data= {'entities': {},'relations': {}}
80-
withopen(self.data_folder+self.mode+'/'+f+'.ann',encoding='utf-8')asf:
120+
withopen(self.data_folder+self.mode+'/'+f+'.ann',encoding='utf-8')asf:
81121
entries= [l.split('\t')forlinf.read().splitlines()ifl]
82122
forentryinentries:
83123
idx=entry[0]
@@ -117,7 +157,7 @@ def export(self):
117157
attribute[self.relation_categories[rt]]=e2['text']
118158
else:
119159
iftype(attribute[self.relation_categories[rt]])==str:
120-
attribute[self.relation_categories[rt]]= [attribute[self.relation_categories[rt]],e2['text']]
160+
attribute[self.relation_categories[rt]]= [attribute[self.relation_categories[rt]],e2['text']]
121161
else:
122162
attribute[self.relation_categories[rt]].append(e2['text'])
123163
ifnotresult.get(e_type):
@@ -137,7 +177,7 @@ def export(self):
137177
# entity['attributes'] = attribute
138178
# result.append(entity)
139179
withopen(self.base_folder+'structured/'+filename+'.json','w',encoding='utf-8')asf:
140-
f.write(pprint.pformat(new_result,width=100).replace('\'','"'))
180+
f.write(pprint.pformat(new_result,width=100).replace('\'','"'))
141181
# json.dump(new_result, f, ensure_ascii=False)
142182

143183
defexport_structured_emr(self):
@@ -160,7 +200,7 @@ def export_structured_emr(self):
160200
forrintrue_relations:
161201
ifrelations.get(r['first']):
162202
relations[r['first']].append((r['second'],r['type']))
163-
print(relations)
203+
#print(relations)
164204
else:
165205
relations[r['first']]= [(r['second'],r['type'])]
166206
foreinentities:
@@ -206,7 +246,8 @@ def generate_re_mutli_training_data(self):
206246
word_indices=self.map_to_indices(annotation['words'])
207247
fortrue_rel_nameinannotation['true_relations']:
208248
true_rel=annotation['true_relations'][true_rel_name]
209-
train_data.append({'words':word_indices,'primary':true_rel['primary'],'secondary':true_rel['secondary'],'type':self.relation_category_labels[true_rel['type']]})
249+
train_data.append({'words':word_indices,'primary':true_rel['primary'],'secondary':true_rel['secondary'],
250+
'type':self.relation_category_labels[true_rel['type']]})
210251
returntrain_data
211252

212253
defmap_to_indices(self,words):
@@ -239,7 +280,7 @@ def read_dictionary(self, reverse=False):
239280

240281
defget_files(self):
241282
files=set()
242-
print(os.path.abspath(self.data_folder))
283+
#print(os.path.abspath(self.data_folder))
243284
forlinos.listdir(self.data_folder+self.mode+'/'):
244285
files.add(os.path.splitext(l)[0])
245286
returnfiles
@@ -278,16 +319,68 @@ def read_annotations(self, directed=False):
278319
print('fuck your world')
279320
sentence['new_entities'][entity['index']]=entity
280321

281-
data=self.read_relation_in_single_file(filename+'.ann',sentence_dict,directed)
322+
# data = self.read_relation_in_single_file(filename + '.ann', sentence_dict, directed)
323+
data=self.read_relation_in_single_file_permutation(filename+'.ann',sentence_dict,directed)
282324
all_sentences.extend(data.values())
283325
returnall_sentences
284326

327+
defread_relation_in_single_file_permutation(self,ann_file,data,directed=False):
328+
withopen(ann_file,encoding='utf-8')asf:
329+
entries=map(lambdal:l.strip().split(' '),f.read().replace('\t',' ').splitlines())
330+
forentryinentries:
331+
idx=entry[0]
332+
ifidx.startswith('R'):
333+
primary=entry[2][entry[2].find(':')+1:]
334+
# print(primary)
335+
secondary=entry[3][entry[3].find(':')+1:]
336+
forsentence_textindata:
337+
sentence=data[sentence_text]
338+
entities=sentence['entities']
339+
# sentence['true_relations'] = {}
340+
ifprimaryinentitiesandsecondaryinentities:
341+
rel= {'id':idx,'primary':entities[primary]['index'],'secondary':entities[secondary]['index'],
342+
'type':entry[1],'first':primary,'second':secondary}
343+
ifsentence.get('true_relations'):
344+
sentence['true_relations'][idx]=rel
345+
else:
346+
sentence['true_relations']= {idx:rel}
347+
forsentence_textindata:
348+
sentence=data[sentence_text]
349+
ifnotsentence.get('true_relations'):
350+
print('sentence no relations')
351+
continue
352+
353+
true_pairs= [(l['primary'],l['secondary'])forlinsentence['true_relations'].values()]
354+
comma_index= [ifori,winenumerate(sentence['words'])ifwin (',',',')]
355+
all_info= {l['index']:l['type']forlinsentence['entities'].values()}
356+
# all_indices = sorted([l[0] for l in all_info])
357+
358+
ifnotcomma_indexorcomma_index[-1]!=len(sentence['words']):
359+
comma_index.append(len(sentence['words']))
360+
comma_index= [-1]+comma_index
361+
rel_candidates= []
362+
fors,einzip(comma_index[:-1],comma_index[1:]):
363+
entity_candidates= [iforiinall_infoifs<i<e]
364+
rel_candidates_raw=permutations(entity_candidates,2)
365+
366+
forp,sinrel_candidates_raw:
367+
p_type=all_info[p]
368+
s_type=all_info[s]
369+
if (p,s)notintrue_pairs:
370+
# if p_type in self.relation_pair_names and s_type in self.relation_pair_names[p_type]:
371+
rel_candidates.append((p,s))
372+
373+
sentence['false_relations']= {str(p)+'-'+str(s): {'primary':p,'secondary':s}forp,sinrel_candidates}
374+
remove_list= [sforsindataifnotdata[s].get('true_relations')]
375+
[data.pop(s)forsinremove_list]
376+
returndata
377+
285378
defread_relation_in_single_file(self,ann_file,data,directed=False):
286379
withopen(ann_file,encoding='utf-8')asf:
287380
entries=map(lambdal:l.strip().split(' '),f.read().replace('\t',' ').splitlines())
288381
forentryinentries:
289-
id=entry[0]
290-
ifid.startswith('R'):
382+
idx=entry[0]
383+
ifidx.startswith('R'):
291384
primary=entry[2][entry[2].find(':')+1:]
292385
# print(primary)
293386
secondary=entry[3][entry[3].find(':')+1:]
@@ -296,12 +389,12 @@ def read_relation_in_single_file(self, ann_file, data, directed=False):
296389
entities=sentence['entities']
297390
# sentence['true_relations'] = {}
298391
ifprimaryinentitiesandsecondaryinentities:
299-
rel= {'id':id,'primary':entities[primary]['index'],'secondary':entities[secondary]['index'],
392+
rel= {'id':idx,'primary':entities[primary]['index'],'secondary':entities[secondary]['index'],
300393
'type':entry[1],'first':primary,'second':secondary}
301-
ifsentence.get('true_realtions'):
302-
sentence['true_relations'][id]=rel
394+
ifsentence.get('true_relations'):
395+
sentence['true_relations'][idx]=rel
303396
else:
304-
sentence['true_relations']= {id:rel}
397+
sentence['true_relations']= {idx:rel}
305398

306399
forsentence_textindata:
307400
sentence=data[sentence_text]

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp