@@ -1,27 +1,29 @@
-import os
-from concurrent.futures import ThreadPoolExecutor
-
-import numpy as np
-import openai
 import torch.nn as nn
-from more_itertools import chunked
-from openai.openai_object import OpenAIObject
 from sentence_transformers import SentenceTransformer, models
 from src.sts import STSEvaluation
 from transformers import AutoModel, BertModel
 
-openai.api_key = os.environ["OPENAI_API_KEY"]
-
 # MODEL_PATH = "cl-nagoya/sup-simcse-ja-large"
 # MODEL_PATH = "cl-nagoya/sup-simcse-ja-base"
 # MODEL_PATH = "MU-Kindai/Japanese-SimCSE-BERT-large-sup"
 # MODEL_PATH = "colorfulscoop/sbert-base-ja"
-MODEL_PATH = "pkshatech/GLuCoSE-base-ja"
+# MODEL_PATH = "pkshatech/GLuCoSE-base-ja"
+# MODEL_PATH = "oshizo/sbert-jsnli-luke-japanese-base-lite"
+MODEL_PATH = "intfloat/multilingual-e5-large"
+
+
+sts = STSEvaluation(sts_dir="./datasets/sts")
+
 
+def evaluate():
+    model = SentenceTransformer(MODEL_PATH).eval().cuda()
+    print(sts.dev(encode=model.encode))
+    print(sts(encode=model.encode))
 
-def load_jcse(model_name: str):
-    backbone = models.Transformer(model_name)
-    pretrained_model: BertModel = AutoModel.from_pretrained(model_name)
+
+def evaluate_jcse():
+    backbone = models.Transformer(MODEL_PATH)
+    pretrained_model: BertModel = AutoModel.from_pretrained(MODEL_PATH)
     hidden_size = pretrained_model.config.hidden_size
 
     # load weights of Transformer layers
@@ -31,7 +33,7 @@ def load_jcse(model_name: str):
         pooling_mode="cls",
     )
 
-    if "unsup" in model_name:
+    if "unsup" in MODEL_PATH:
         model = SentenceTransformer(modules=[backbone, pooling]).eval().cuda()
 
     else:
@@ -49,44 +51,64 @@ def load_jcse(model_name: str):
         mlp.load_state_dict(mlp_state_dict)
         model = SentenceTransformer(modules=[backbone, pooling, mlp]).eval().cuda()
 
-    return model
+    print(sts.dev(encode=model.encode))
+    print(sts(encode=model.encode))
 
 
-def load_vanilla(model_name: str):
-    backbone = models.Transformer(model_name)
+def evaluate_vanilla():
+    backbone = models.Transformer(MODEL_PATH)
     pooling = models.Pooling(
         word_embedding_dimension=backbone.auto_model.config.hidden_size,
         pooling_mode="cls",
     )
-    return SentenceTransformer(modules=[backbone, pooling]).eval().cuda()
+    model = SentenceTransformer(modules=[backbone, pooling]).eval().cuda()
+    print(sts.dev(encode=model.encode))
+    print(sts(encode=model.encode))
 
 
-sts = STSEvaluation(sts_dir="./datasets/sts")
+def evaluate_openai():
+    import os
+    import openai
+    import numpy as np
+    from concurrent.futures import ThreadPoolExecutor
+    from more_itertools import chunked
+    from openai.openai_object import OpenAIObject
+
+    openai.api_key = os.environ["OPENAI_API_KEY"]
+
+    def encode_openai(batch: list[str]):
+        res: OpenAIObject = openai.Embedding.create(
+            model="text-embedding-ada-002",
+            input=batch,
+        )
+        return [d.embedding for d in res.data]
+
+    def encode(sentences: list[str], batch_size: int = 128):
+        embs = []
+        with ThreadPoolExecutor(max_workers=32) as executor:
+            batches = chunked(list(sentences), batch_size)
+            for emb in executor.map(encode_openai, batches):
+                embs += emb
+        embs = np.array(embs)
+        return embs
 
-# model = load_jcse(MODEL_PATH)
-# model = load_vanilla("cl-tohoku/bert-base-japanese-v3")
-model = SentenceTransformer(MODEL_PATH).eval().cuda()
-print(sts.dev(encode=model.encode))
-print(sts(encode=model.encode))
+    print(sts.dev(encode=encode))
+    print(sts(encode=encode))
 
 
-# def encode_openai(batch: list[str]):
-#     res: OpenAIObject = openai.Embedding.create(
-#         model="text-embedding-ada-002",
-#         input=batch,
-#     )
-#     return [d.embedding for d in res.data]
+def evaluate_e5():
+    model = SentenceTransformer(MODEL_PATH).eval().cuda()
 
+    def encode(sentences: list[str]):
+        sentences = [f"query: {s}" for s in sentences]
+        return model.encode(sentences)
 
-# def encode(sentences: list[str], batch_size: int = 128):
-#     embs = []
-#     with ThreadPoolExecutor(max_workers=32) as executor:
-#         batches = chunked(list(sentences), batch_size)
-#         for emb in executor.map(encode_openai, batches):
-#             embs += emb
-#     embs = np.array(embs)
-#     return embs
+    print(sts.dev(encode=encode))
+    print(sts(encode=encode))
 
 
-# print(sts.dev(encode=encode))
-# print(sts(encode=encode))
+if __name__ == "__main__":
+    # evaluate()
+    # evaluate_vanilla()
+    # evaluate_openai()
+    evaluate_e5()
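
Note on the `evaluate_e5` wrapper added above: the e5 family is trained with instruction prefixes, so every input to `intfloat/multilingual-e5-large` should start with `query: ` or `passage: `; for symmetric tasks such as STS the model card recommends `query: ` on both sides, which is exactly what `encode` prepends before calling `model.encode`. A minimal standalone sketch of the same convention (the sentence pair is only illustrative):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-large")

def encode(sentences: list[str]):
    # Symmetric tasks (STS) use "query: " on both sides; retrieval corpora
    # would use "passage: " for the document side instead.
    return model.encode([f"query: {s}" for s in sentences])

# Illustrative pair: two paraphrased Japanese sentences should score high.
embs = encode(["今日は天気が良い", "本日は快晴です"])
print(embs.shape)  # (2, 1024): e5-large produces 1024-dim embeddings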