Commit d16edbe

Update README.md

1 parent 3288f37 · commit d16edbe

File tree

11 files changed: +655 additions, −36 deletions


README.md

Lines changed: 253 additions & 20 deletions
(Large diff; not rendered by default.)

poetry.lock

Lines changed: 180 additions & 1 deletion
(Generated file; diff not rendered by default.)

pyproject.toml

Lines changed: 3 additions & 1 deletion

```diff
@@ -24,7 +24,9 @@ mwparserfromhell = "^0.6.4"
 python-snappy = "^0.6.1"
 mojimoji = "^0.0.12"
 peft = "^0.3.0"
-pysen = {extras = ["format", "lint"], version = "^0.10.5"}
+pysen = {extras = ["lint"], version = "^0.10.5"}
+sentence-transformers = {git = "https://github.com/UKPLab/sentence-transformers.git"}
+openai = "^0.28.1"
 
 [tool.pysen]
 version = "0.10"
```

src/evaluate.py

Lines changed: 92 additions & 0 deletions (new file)

```python
import os
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import openai
import torch.nn as nn
from more_itertools import chunked
from openai.openai_object import OpenAIObject
from sentence_transformers import SentenceTransformer, models
from src.sts import STSEvaluation
from transformers import AutoModel, BertModel

openai.api_key = os.environ["OPENAI_API_KEY"]

# MODEL_PATH = "cl-nagoya/sup-simcse-ja-large"
# MODEL_PATH = "cl-nagoya/sup-simcse-ja-base"
# MODEL_PATH = "MU-Kindai/Japanese-SimCSE-BERT-large-sup"
# MODEL_PATH = "colorfulscoop/sbert-base-ja"
MODEL_PATH = "pkshatech/GLuCoSE-base-ja"


def load_jcse(model_name: str):
    backbone = models.Transformer(model_name)
    pretrained_model: BertModel = AutoModel.from_pretrained(model_name)
    hidden_size = pretrained_model.config.hidden_size

    # load weights of the Transformer layers
    backbone.auto_model.load_state_dict(pretrained_model.state_dict())
    pooling = models.Pooling(
        word_embedding_dimension=hidden_size,
        pooling_mode="cls",
    )

    if "unsup" in model_name:
        model = SentenceTransformer(modules=[backbone, pooling]).eval().cuda()

    else:
        # load weights of the extra MLP layer;
        # unsupervised models do not use this, so we create one only for supervised models
        mlp = models.Dense(
            in_features=hidden_size,
            out_features=hidden_size,
            activation_function=nn.Tanh(),
        )
        mlp_state_dict = {
            key.replace("dense.", "linear."): param
            for key, param in pretrained_model.pooler.state_dict().items()
        }
        mlp.load_state_dict(mlp_state_dict)
        model = SentenceTransformer(modules=[backbone, pooling, mlp]).eval().cuda()

    return model


def load_vanilla(model_name: str):
    backbone = models.Transformer(model_name)
    pooling = models.Pooling(
        word_embedding_dimension=backbone.auto_model.config.hidden_size,
        pooling_mode="cls",
    )
    return SentenceTransformer(modules=[backbone, pooling]).eval().cuda()


sts = STSEvaluation(sts_dir="./datasets/sts")

# model = load_jcse(MODEL_PATH)
# model = load_vanilla("cl-tohoku/bert-base-japanese-v3")
model = SentenceTransformer(MODEL_PATH).eval().cuda()
print(sts.dev(encode=model.encode))
print(sts(encode=model.encode))


# def encode_openai(batch: list[str]):
#     res: OpenAIObject = openai.Embedding.create(
#         model="text-embedding-ada-002",
#         input=batch,
#     )
#     return [d.embedding for d in res.data]


# def encode(sentences: list[str], batch_size: int = 128):
#     embs = []
#     with ThreadPoolExecutor(max_workers=32) as executor:
#         batches = chunked(list(sentences), batch_size)
#         for emb in executor.map(encode_openai, batches):
#             embs += emb
#     embs = np.array(embs)
#     return embs


# print(sts.dev(encode=encode))
# print(sts(encode=encode))
```
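A note on the design: `STSEvaluation` only needs an `encode` callable mapping `list[str]` to an embedding array, which is why the commented-out OpenAI encoder is a drop-in replacement for `model.encode`. A minimal sketch of plugging in a custom encoder (the wrapper below is hypothetical; `batch_size` is a standard `SentenceTransformer.encode` kwarg):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

from src.sts import STSEvaluation

sts = STSEvaluation(sts_dir="./datasets/sts")
model = SentenceTransformer("pkshatech/GLuCoSE-base-ja").eval().cuda()


def encode(sentences: list[str]) -> np.ndarray:
    # any callable with this signature works as the `encode` argument
    return model.encode(list(sentences), batch_size=128)


print(sts.dev(encode=encode))  # single dev-set score
print(sts(encode=encode))      # dict[str, float]: one Spearman score per STS set
```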

src/experiment.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
@@ -6,6 +7,7 @@
 import peft
 import src.utils as utils
 import torch
+from sentence_transformers import SentenceTransformer, models
 from src.dataset import SupSimCSEDataset, UnsupSimCSEDataset
 from src.models import SimCSEModel
 from src.sts import STSEvaluation
@@ -15,6 +17,13 @@
 from transformers import AutoTokenizer, PreTrainedModel
 from transformers.tokenization_utils import BatchEncoding, PreTrainedTokenizer
 
+TRAIN_DATASET_MAP = {
+    "jsnli": "shunk031/jsnli",
+    "janli": "hpprc/janli",
+    "wiki40b": "wiki40b",
+    "wikipedia": "wikipedia",
+}
+
 
 class CommonArgs(Tap):
     METHOD: str = None
@@ -37,6 +46,9 @@ class CommonArgs(Tap):
     use_lora: bool = False
     use_jumanpp: bool = False
 
+    save_model: bool = False
+    save_model_name: str = None
+
     num_training_examples: int = 2**20
     num_eval_logging: int = 2**6
     num_warmup_ratio: float = 0.1
@@ -220,6 +232,22 @@ def log(self, metrics: dict) -> None:
             f"sts-dev: {metrics['sts-dev']:.4f}"
         )
 
+    def save_model(self):
+        backbone = models.Transformer(self.args.model_name)
+        backbone.auto_model.load_state_dict(self.model.backbone.state_dict())
+        pooling = models.Pooling(
+            word_embedding_dimension=self.model.backbone.config.hidden_size,
+            pooling_mode=self.args.pooling,
+        )
+
+        model = SentenceTransformer(modules=[backbone, pooling])
+        train_dataset_name = TRAIN_DATASET_MAP.get(
+            self.args.dataset_name, self.args.dataset_name
+        )
+        model.save(path=str(self.args.output_dir), train_datasets=[train_dataset_name])
+
+        print(f"saved at {self.args.output_dir}")
+
 
 class UnsupSimCSEExperiment(Experiment):
     def __init__(self, *args, **kwargs):
```
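`save_model` re-wraps the trained backbone into a plain two-module SentenceTransformer (Transformer + Pooling), so the export reloads without any of this training code; the `train_datasets` argument to `SentenceTransformer.save` feeds the auto-generated model card, which appears to be why `TRAIN_DATASET_MAP` normalizes local dataset keys to Hub-style names. A minimal reload sketch (the exact output path below is a hypothetical example):

```python
from sentence_transformers import SentenceTransformer

# args.output_dir from a finished run; this particular path is illustrative
model = SentenceTransformer("./outputs/sup-simcse/jsnli/example-run")
print(model.encode(["保存したモデルの動作確認。"]).shape)
```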

src/sts.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -32,6 +32,7 @@ def __init__(
     def __call__(self, encode: Callable[[list[str]], FloatTensor]) -> float:
         embeddings1 = encode(self.sentences1)
         embeddings2 = encode(self.sentences2)
+
         # you can use any similarity function you want ↓
         cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
         spearman, _ = spearmanr(self.scores, cosine_scores)
@@ -106,9 +107,7 @@ def __call__(
         progress_bar: bool = True,
     ) -> dict[str, float]:
         if progress_bar:
-            iterator = tqdm(
-                list(self.sts_evaluators.items()), dynamic_ncols=True, leave=False
-            )
+            iterator = tqdm(list(self.sts_evaluators.items()), dynamic_ncols=True, leave=False)
         else:
             iterator = list(self.sts_evaluators.items())
```
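The evaluator's score is just the Spearman correlation between gold similarity ratings and pairwise cosine similarities, as the two lines kept above show. A self-contained toy sketch of that computation (the embeddings and gold scores are made up):

```python
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import paired_cosine_distances

# toy data: three sentence pairs with gold similarity ratings
embeddings1 = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]])
embeddings2 = np.array([[0.9, 0.1], [1.0, 0.0], [0.6, 0.8]])
gold_scores = [5.0, 1.0, 4.0]

cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
spearman, _ = spearmanr(gold_scores, cosine_scores)
print(spearman)  # 1.0 here: ranking by cosine matches the gold ranking exactly
```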

src/summarize_results.py

Lines changed: 34 additions & 0 deletions

```diff
@@ -139,9 +139,43 @@ def dataset_rank(args: Args):
     print(data)
 
 
+def batch_rank(args: Args):
+    print("batch_rank")
+
+    for method in ["sup-simcse", "unsup-simcse"]:
+        path = args.input_dir / method / "all.csv"
+        df = pd.read_csv(path)
+        data_df = {}
+
+        data = defaultdict(lambda: defaultdict(dict))
+        for key, group_df in df.groupby(["model_name", "dataset_name"]):
+            for lr, lr_group in group_df.groupby("lr"):
+                lr_group = lr_group.sort_values("avg", ascending=False)
+                for rank, batch_size in enumerate(lr_group["batch_size"].tolist()):
+                    data[lr][key][batch_size] = rank + 1
+        for lr in data.keys():
+            data_df[lr] = (
+                pd.DataFrame(data[lr]).mean(axis=1).sort_values(ascending=True)
+            )
+        data_df["avg"] = pd.DataFrame(data_df).mean(axis=1).sort_values(ascending=True)
+        df = pd.DataFrame(data_df)
+        print(df)
+
+
+def counts(args: Args):
+    print("counts")
+
+    for method in ["sup-simcse", "unsup-simcse"]:
+        path = args.input_dir / method / "all.csv"
+        df = pd.read_csv(path)
+        print(df["count"].sum())
+
+
 if __name__ == "__main__":
     args = Args().parse_args()
     by_models(args)
     by_datasets(args)
     model_rank(args)
     dataset_rank(args)
+    batch_rank(args)
+    counts(args)
```
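`batch_rank` aggregates by ranking rather than raw scores: within each (model, dataset, lr) cell it ranks batch sizes by average STS score (1 = best), then averages those ranks across cells, so one strong model cannot dominate the comparison. A toy illustration of the inner ranking step (numbers invented):

```python
import pandas as pd

# one (model_name, dataset_name, lr) cell with three batch-size runs
cell = pd.DataFrame({"batch_size": [64, 128, 256], "avg": [80.1, 81.3, 80.7]})

cell = cell.sort_values("avg", ascending=False)
ranks = {bs: rank + 1 for rank, bs in enumerate(cell["batch_size"])}
print(ranks)  # {128: 1, 256: 2, 64: 3}: a lower mean rank marks a more robust batch size
```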

src/train_sup.py

Lines changed: 8 additions & 11 deletions

```diff
@@ -70,22 +70,15 @@ def main(args: Args):
         sent1 = exp.model.forward(**batch.sent1)
         hard_neg = exp.model.forward(**batch.hard_neg)
 
-        sim_mat_1st = F.cosine_similarity(
-            sent0.unsqueeze(1), sent1.unsqueeze(0), dim=-1
-        )
-        sim_mat_2nd = F.cosine_similarity(
-            sent0.unsqueeze(1), hard_neg.unsqueeze(0), dim=-1
-        )
+        sim_mat_1st = F.cosine_similarity(sent0.unsqueeze(1), sent1.unsqueeze(0), dim=-1)
+        sim_mat_2nd = F.cosine_similarity(sent0.unsqueeze(1), hard_neg.unsqueeze(0), dim=-1)
 
         sim_mat = torch.cat([sim_mat_1st, sim_mat_2nd], dim=1)
         sim_mat = sim_mat / args.temperature
 
-        labels = (
-            torch.arange(sim_mat.size(0))
-            .long()
-            .to(args.device, non_blocking=True)
-        )
+        labels = torch.arange(sim_mat.size(0)).long().to(args.device, non_blocking=True)
         loss = F.cross_entropy(sim_mat, labels)
+
         train_losses.append(float(loss.item()))
 
         optimizer.zero_grad()
@@ -118,6 +111,7 @@ def main(args: Args):
                 "sts-dev": dev_score,
             }
             exp.log(val_metrics)
+            print(loss1.item(), loss2.item())
             train_losses = []
 
             exp.model.train()
@@ -144,6 +138,9 @@ def main(args: Args):
     utils.save_json(sts_metrics, args.output_dir / "sts-metrics.json")
     utils.save_config(args, args.output_dir / "config.json")
 
+    if args.save_model:
+        exp.save_model()
+
 
 if __name__ == "__main__":
     args = Args().parse_args()
```
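The reflowed lines implement the supervised SimCSE objective: for a batch of size B, row i of the B×2B similarity matrix is scored against its positive at column i, with the other in-batch sentences and all hard negatives as distractors. A standalone sketch with random tensors (shapes and temperature chosen for illustration; real inputs would be NLI-style premise/entailment/contradiction embeddings):

```python
import torch
import torch.nn.functional as F

batch_size, hidden, temperature = 4, 8, 0.05
sent0 = torch.randn(batch_size, hidden)     # anchor embeddings
sent1 = torch.randn(batch_size, hidden)     # positive embeddings
hard_neg = torch.randn(batch_size, hidden)  # hard-negative embeddings

# (B, 1, H) vs (1, B, H) broadcasts to a (B, B) cosine similarity matrix
sim_1st = F.cosine_similarity(sent0.unsqueeze(1), sent1.unsqueeze(0), dim=-1)
sim_2nd = F.cosine_similarity(sent0.unsqueeze(1), hard_neg.unsqueeze(0), dim=-1)
sim_mat = torch.cat([sim_1st, sim_2nd], dim=1) / temperature  # (B, 2B)

labels = torch.arange(batch_size).long()  # the positive for row i sits at column i
loss = F.cross_entropy(sim_mat, labels)
print(sim_mat.shape, loss.item())
```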

src/train_unsup.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -134,6 +134,9 @@ def main(args: Args):
     utils.save_json(sts_metrics, args.output_dir / "sts-metrics.json")
     utils.save_config(args, args.output_dir / "config.json")
 
+    if args.save_model:
+        exp.save_model()
+
 
 if __name__ == "__main__":
     args = Args().parse_args()
```

src/upload_to_huggingface.py

Lines changed: 11 additions & 0 deletions (new file)

```python
from huggingface_hub import HfApi

# MODEL_PATH = "./outputs/sup-simcse/jsnli/cl-tohoku__bert-base-japanese-v3/2023-10-02/16-22-36"
# MODEL_PATH = "./outputs/sup-simcse/jsnli/cl-tohoku__bert-large-japanese-v2/2023-10-02/16-22-31"
# MODEL_PATH = "./outputs/unsup-simcse/wiki40b/cl-tohoku__bert-base-japanese-v3/2023-10-02/17-15-39"
MODEL_PATH = "./outputs/unsup-simcse/wiki40b/cl-tohoku__bert-large-japanese-v2/2023-10-02/17-26-28"

REPO_ID = "cl-nagoya/unsup-simcse-ja-large"

HfApi().create_repo(REPO_ID)
HfApi().upload_folder(folder_path=MODEL_PATH, repo_id=REPO_ID)
```
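Once uploaded, the folder is consumable directly from the Hub. A one-line sanity check (the input sentence is arbitrary):

```python
from sentence_transformers import SentenceTransformer

# pulls the model uploaded above straight from the Hugging Face Hub
model = SentenceTransformer("cl-nagoya/unsup-simcse-ja-large")
print(model.encode(["これはアップロード確認用の文です。"]).shape)
```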
