
Commit e6e62cc

fix up active nav
1 parent 6510734 commit e6e62cc

File tree

28 files changed: +776 -43 lines changed

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
import random
embeddings = [[random.random() for _ in range(128)] for _ in range(10_000)]
print(embeddings)
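
If reproducible benchmark data is wanted, the generator above could be seeded before building the list. A minimal sketch (the seed value is an arbitrary choice, not part of the original script):

import random

random.seed(42)  # arbitrary seed so repeated runs produce the same vectors
embeddings = [[random.random() for _ in range(128)] for _ in range(10_000)]
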
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

SELECT 1 FROM (
    SELECT ARRAY_AGG(random()) AS vector
    FROM generate_series(1, 1280000) i
    GROUP BY i % 10000
) f LIMIT 0;

-- CREATE TABLE embeddings AS
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

-- COPY embeddings TO '/tmp/embeddings.csv' DELIMITER ',' CSV HEADER;
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

embeddings = numpy.random.rand(10_000, 128)
print(embeddings)
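
The SQL script above copies its vectors to /tmp/embeddings.csv; the same dump could be added to this numpy variant with numpy.savetxt. A rough sketch (the output path simply mirrors that SQL script and is an assumption):

import numpy

embeddings = numpy.random.rand(10_000, 128)
# write one 128-dimensional vector per row, comma separated
numpy.savetxt("/tmp/embeddings.csv", embeddings, delimiter=",")
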
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pgml.sql
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import os
import requests
from time import time
from rich import print
from datasets import load_dataset
from tqdm.auto import tqdm
from datasets import Dataset
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 1
passages = passages[:total_documents]

start = time()
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for batch via endpoints
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import os
import requests
from time import time
from rich import print
from datasets import load_dataset
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 64
passages = passages[:total_documents]

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # NOTE: dim was not defined in the original script; 1024 assumes the intfloat/e5-large embeddings used elsewhere in this benchmark

# check if the movie-emb index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to movie-emb index we created
index = pinecone.Index(index_name)

start = time()
# we will use batches of 64
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for batch via endpoints
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )
    emb = res.json()['embeddings']
    # get metadata (just the original text)
    meta = [{'text': text} for text in batch]
    # create IDs
    ids = [str(x) for x in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset
from dotenv import load_dotenv
from statistics import mean

load_dotenv(".env")
api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # NOTE: dim was not defined in the original script; 1024 assumes the intfloat/e5-large embeddings used elsewhere in this benchmark

# check if the movie-emb index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to movie-emb index we created
index = pinecone.Index(index_name)

run_times = []
for query in data["context"][0:100]:
    start = time()
    # encode with HF endpoints
    res = requests.post(endpoint, headers=headers, json={"inputs": query})
    xq = res.json()['embeddings']
    # query and return top 5
    xc = index.query(xq, top_k=5, include_metadata=True)
    _end = time()
    run_times.append(_end - start)
print("HF + Pinecone Average query time: %0.3f" % (mean(run_times)))
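
Mean latency hides tail behaviour; with the run_times list already collected above, percentiles are a one-liner from the standard library. A small sketch (the literal values are placeholders, to be replaced by the run_times list from the script):

from statistics import quantiles

run_times = [0.12, 0.10, 0.31, 0.11, 0.09]  # placeholder; use the run_times collected above
cuts = quantiles(run_times, n=100)
print("p50: %0.3f  p95: %0.3f" % (cuts[49], cuts[94]))
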
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)
    model_id = await collection.register_model(model_name="intfloat/e5-large")
    await collection.generate_embeddings(model_id=model_id)

if __name__ == "__main__":
    asyncio.run(main())
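
The script above imports time but never measures the embedding step. If a wall-clock figure is wanted to compare against the HF + Pinecone runs, the call could be wrapped as in this sketch, which reuses the same pgml API as the script and only adds the timing lines:

import asyncio
import os
from time import time

from dotenv import load_dotenv
from pgml import Database

async def main():
    load_dotenv()
    db = Database(os.environ.get("DATABASE_URL"))
    collection = await db.create_or_get_collection("squad_collection_benchmark")
    model_id = await collection.register_model(model_name="intfloat/e5-large")

    start = time()
    await collection.generate_embeddings(model_id=model_id)
    # report in the same format as the HF benchmark scripts
    print("Time taken for PGML embeddings = %0.3f" % (time() - start))

if __name__ == "__main__":
    asyncio.run(main())
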
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
DO $$
DECLARE
    curr_id integer := 0;
    batch_size integer := 2;
    total_records integer := 10000;
    curr_val text[]; -- Use "text[]" instead of "varchar[]"
    embed_result json; -- Store the result of the pgml.embed function
BEGIN
    LOOP
        -- BEGIN RAISE NOTICE 'updating % to %', curr_id, curr_id + batch_size; END;
        SELECT ARRAY(SELECT chunk::text
                     FROM squad_collection_benchmark.chunks
                     WHERE id BETWEEN curr_id + 1 AND curr_id + batch_size)
        INTO curr_val;

        -- Use the correct syntax to call pgml.embed and store the result
        PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

        curr_id := curr_id + batch_size;
        EXIT WHEN curr_id >= total_records;
    END LOOP;

    SELECT ARRAY(SELECT chunk::text
                 FROM squad_collection_benchmark.chunks
                 WHERE id BETWEEN curr_id - batch_size AND total_records)
    INTO curr_val;

    -- Use the correct syntax to call pgml.embed and store the result
    PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

END;
$$;
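
This DO block is plain SQL and can be run from psql against the same database the pgml scripts connect to; enabling timing first with the psql meta-command \timing on and then executing the file with \i gives a rough wall-clock figure to compare with the other benchmarks. The file name to pass to \i is whatever this repository actually uses, which is not shown here.
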
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])

    documents = [
        {"id": r["id"], "text": r["context"], "title": r["title"]}
        for r in data.to_dict(orient="records")
    ]

    print("Ingesting and chunking documents ..")
    total_documents = 10000
    batch_size = 64
    embedding_times = []
    total_time = 0
    documents = documents[:total_documents]
    for i in tqdm(range(0, len(documents), batch_size)):
        i_end = min(i + batch_size, len(documents))
        batch = documents[i:i_end]
        await collection.upsert_documents(batch)
    await collection.generate_chunks()
    print("Ingesting and chunking completed")

if __name__ == "__main__":
    asyncio.run(main())
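
The embedding_times and total_time variables above are declared but never filled. A sketch of how per-batch upsert timings could be recorded; this fragment is meant to replace the loop inside main() above (it reuses documents, batch_size, collection, and tqdm from that script rather than standing alone):

from statistics import mean
from time import time

embedding_times = []
for i in tqdm(range(0, len(documents), batch_size)):
    i_end = min(i + batch_size, len(documents))
    batch = documents[i:i_end]
    batch_start = time()
    await collection.upsert_documents(batch)
    embedding_times.append(time() - batch_start)

print("Average upsert time per batch of %d docs = %0.3f" % (batch_size, mean(embedding_times)))
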
