
Commit e6e62cc

fix up active nav
1 parent 6510734 commit e6e62cc

File tree

28 files changed: +776 -43 lines changed

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
import random
embeddings = [[random.random() for _ in range(128)] for _ in range(10_000)]
print(embeddings)
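
If reproducible benchmark data is wanted, the generator above could be seeded before building the list. A minimal sketch (the seed value is an arbitrary choice, not part of the original script):

import random

random.seed(42)  # arbitrary seed so repeated runs produce the same vectors
embeddings = [[random.random() for _ in range(128)] for _ in range(10_000)]
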
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

SELECT 1 FROM (
    SELECT ARRAY_AGG(random()) AS vector
    FROM generate_series(1, 1280000) i
    GROUP BY i % 10000
) f LIMIT 0;

-- CREATE TABLE embeddings AS
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

-- COPY embeddings TO '/tmp/embeddings.csv' DELIMITER ',' CSV HEADER;
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

embeddings = numpy.random.rand(10_000, 128)
print(embeddings)
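
The SQL script above copies its vectors to /tmp/embeddings.csv; the same dump could be added to this numpy variant with numpy.savetxt. A rough sketch (the output path simply mirrors that SQL script and is an assumption):

import numpy

embeddings = numpy.random.rand(10_000, 128)
# write one 128-dimensional vector per row, comma separated
numpy.savetxt("/tmp/embeddings.csv", embeddings, delimiter=",")
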
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pgml.sql
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import os
import requests
from time import time
from rich import print
from datasets import load_dataset
from tqdm.auto import tqdm
from datasets import Dataset
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 1
passages = passages[:total_documents]

start = time()
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for batch via endpoints
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import os
import requests
from time import time
from rich import print
from datasets import load_dataset
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 64
passages = passages[:total_documents]

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # NOTE: dim was not defined in the original script; 1024 assumes the intfloat/e5-large embeddings used elsewhere in this benchmark

# check if the movie-emb index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to movie-emb index we created
index = pinecone.Index(index_name)

start = time()
# we will use batches of 64
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for batch via endpoints
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )
    emb = res.json()['embeddings']
    # get metadata (just the original text)
    meta = [{'text': text} for text in batch]
    # create IDs
    ids = [str(x) for x in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset
from dotenv import load_dotenv
from statistics import mean

load_dotenv(".env")
api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # NOTE: dim was not defined in the original script; 1024 assumes the intfloat/e5-large embeddings used elsewhere in this benchmark

# check if the movie-emb index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to movie-emb index we created
index = pinecone.Index(index_name)

run_times = []
for query in data["context"][0:100]:
    start = time()
    # encode with HF endpoints
    res = requests.post(endpoint, headers=headers, json={"inputs": query})
    xq = res.json()['embeddings']
    # query and return top 5
    xc = index.query(xq, top_k=5, include_metadata=True)
    _end = time()
    run_times.append(_end - start)
print("HF + Pinecone Average query time: %0.3f" % (mean(run_times)))
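
Mean latency hides tail behaviour; with the run_times list already collected above, percentiles are a one-liner from the standard library. A small sketch (the literal values are placeholders, to be replaced by the run_times list from the script):

from statistics import quantiles

run_times = [0.12, 0.10, 0.31, 0.11, 0.09]  # placeholder; use the run_times collected above
cuts = quantiles(run_times, n=100)
print("p50: %0.3f  p95: %0.3f" % (cuts[49], cuts[94]))
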
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)
    model_id = await collection.register_model(model_name="intfloat/e5-large")
    await collection.generate_embeddings(model_id=model_id)

if __name__ == "__main__":
    asyncio.run(main())
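
The script above imports time but never measures the embedding step. If a wall-clock figure is wanted to compare against the HF + Pinecone runs, the call could be wrapped as in this sketch, which reuses the same pgml API as the script and only adds the timing lines:

import asyncio
import os
from time import time

from dotenv import load_dotenv
from pgml import Database

async def main():
    load_dotenv()
    db = Database(os.environ.get("DATABASE_URL"))
    collection = await db.create_or_get_collection("squad_collection_benchmark")
    model_id = await collection.register_model(model_name="intfloat/e5-large")

    start = time()
    await collection.generate_embeddings(model_id=model_id)
    # report in the same format as the HF benchmark scripts
    print("Time taken for PGML embeddings = %0.3f" % (time() - start))

if __name__ == "__main__":
    asyncio.run(main())
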
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
DO $$
DECLARE
    curr_id integer := 0;
    batch_size integer := 2;
    total_records integer := 10000;
    curr_val text[]; -- Use "text[]" instead of "varchar[]"
    embed_result json; -- Store the result of the pgml.embed function
BEGIN
    LOOP
        -- BEGIN RAISE NOTICE 'updating % to %', curr_id, curr_id + batch_size; END;
        SELECT ARRAY(SELECT chunk::text
                     FROM squad_collection_benchmark.chunks
                     WHERE id BETWEEN curr_id + 1 AND curr_id + batch_size)
        INTO curr_val;

        -- Use the correct syntax to call pgml.embed and store the result
        PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

        curr_id := curr_id + batch_size;
        EXIT WHEN curr_id >= total_records;
    END LOOP;

    SELECT ARRAY(SELECT chunk::text
                 FROM squad_collection_benchmark.chunks
                 WHERE id BETWEEN curr_id - batch_size AND total_records)
    INTO curr_val;

    -- Use the correct syntax to call pgml.embed and store the result
    PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

END;
$$;
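
This DO block is plain SQL and can be run from psql against the same database the pgml scripts connect to; enabling timing first with the psql meta-command \timing on and then executing the file with \i gives a rough wall-clock figure to compare with the other benchmarks. The file name to pass to \i is whatever this repository actually uses, which is not shown here.
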
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])

    documents = [
        {"id": r["id"], "text": r["context"], "title": r["title"]}
        for r in data.to_dict(orient="records")
    ]

    print("Ingesting and chunking documents ..")
    total_documents = 10000
    batch_size = 64
    embedding_times = []
    total_time = 0
    documents = documents[:total_documents]
    for i in tqdm(range(0, len(documents), batch_size)):
        i_end = min(i + batch_size, len(documents))
        batch = documents[i:i_end]
        await collection.upsert_documents(batch)
    await collection.generate_chunks()
    print("Ingesting and chunking completed")

if __name__ == "__main__":
    asyncio.run(main())
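
The embedding_times and total_time variables above are declared but never filled. A sketch of how per-batch upsert timings could be recorded; this fragment is meant to replace the loop inside main() above (it reuses documents, batch_size, collection, and tqdm from that script rather than standing alone):

from statistics import mean
from time import time

embedding_times = []
for i in tqdm(range(0, len(documents), batch_size)):
    i_end = min(i + batch_size, len(documents))
    batch = documents[i:i_end]
    batch_start = time()
    await collection.upsert_documents(batch)
    embedding_times.append(time() - batch_start)

print("Average upsert time per batch of %d docs = %0.3f" % (batch_size, mean(embedding_times)))
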
