May 23, 2023 · May 12, 2023 · May 13, 2023 · May 16, 2023 · May 16, 2023 · May 16, 2023
diff --git a/pgml-sdks/python/pgml/README.md b/pgml-sdks/python/pgml/README.md
 # PostgresML Python SDK
 This Python SDK provides an easy interface to use PostgresML generative AI capabilities.

 ## Table of Contents

 - [Quickstart](#quickstart)

 ### Quickstart
 1. Install Python 3.11. The SDK should work with Python >=3.8; however, at this time it has only been tested with Python 3.11.
 2. Clone the repository and check out the SDK branch (required until the PR is merged)
 ```
 git clone https://github.com/postgresml/postgresml
 cd postgresml
 git checkout santi-pgml-memory-sdk-python
 cd pgml-sdks/python/pgml
 ```
 3. Install poetry `pip install poetry`
 4. Initialize Python environment

 ```
 poetry env use python3.11
 poetry shell
 poetry install
 poetry build
 ```
 5. The SDK uses your local PostgresML database by default:
 `postgres://postgres@127.0.0.1:5433/pgml_development`

 If your local database does not have an up-to-date `pgml.embed`, please [sign up for a free database](https://postgresml.org/signup) and set the `PGML_CONNECTION` environment variable to the connection string of your serverless hosted database.

 ```
 export PGML_CONNECTION="postgres://<username>:<password>@<hostname>:<port>/<database>"
 ```
 6. Run a **vector search** example
 ```
 python examples/vector_search.py
 ```

diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pgml import Database\n",
    "import os\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n",
    "\n",
    "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n",
    "db = Database(conninfo,min_connections=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection_name = \"test_pgml_sdk_1\"\n",
    "collection = db.create_or_get_collection(collection_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "data = load_dataset(\"squad\", split=\"train\")\n",
    "data = data.to_pandas()\n",
    "data.head()\n",
    "\n",
    "data = data.drop_duplicates(subset=[\"context\"])\n",
    "print(len(data))\n",
    "data.head()\n",
    "\n",
    "documents = [\n",
    "    {\n",
    "        'text': r['context'],\n",
    "        'metadata': {\n",
    "            'title': r['title']\n",
    "        }\n",
    "    } for r in data.to_dict(orient='records')\n",
    "]\n",
    "documents[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.upsert_documents(documents[0:200])\n",
    "collection.generate_chunks()\n",
    "collection.generate_embeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2)\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.register_model(model_name=\"paraphrase-MiniLM-L6-v2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.get_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(json.dumps(collection.get_models(),indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_embeddings(model_id=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=2)\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.register_model(model_name=\"hkunlp/instructor-xl\", model_params={\"instruction\": \"Represent the Wikipedia document for retrieval: \"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.get_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_embeddings(model_id=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=3, query_parameters={\"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"})\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.register_text_splitter(splitter_name=\"RecursiveCharacterTextSplitter\",splitter_params={\"chunk_size\": 100,\"chunk_overlap\": 20})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_chunks(splitter_id=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_embeddings(splitter_id=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, splitter_id=2)\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db.delete_collection(collection_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pgml-zoggicR5-py3.11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
diff --git a/pgml-sdks/python/pgml/examples/vector_search.py b/pgml-sdks/python/pgml/examples/vector_search.py
 from pgml import Database
 import os
 import json
 from datasets import load_dataset
 from time import time
 from rich import print as rprint

 # Connection string used when PGML_CONNECTION is not set in the environment.
 local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development"

 conninfo = os.environ.get("PGML_CONNECTION", local_pgml)
 db = Database(conninfo)

 collection_name = "test_pgml_sdk_1"
 collection = db.create_or_get_collection(collection_name)

 # Number of documents uploaded by this example.
 max_documents = 200

 data = load_dataset("squad", split="train")
 data = data.to_pandas()
 # Keep a single row per distinct context.
 data = data.drop_duplicates(subset=["context"])

 # Convert only the rows that are actually uploaded instead of building a
 # dict for every row and slicing afterwards.
 documents = [
     {"id": r["id"], "text": r["context"], "title": r["title"]}
     for r in data.head(max_documents).to_dict(orient="records")
 ]

 # Store the documents, then run the SDK's chunking and embedding steps.
 # NOTE(review): presumably the default splitter and model are used when no
 # ids are passed -- confirm against the SDK.
 collection.upsert_documents(documents)
 collection.generate_chunks()
 collection.generate_embeddings()

 start = time()
 results = collection.vector_search("Who won 20 grammy awards?", top_k=2)
 # Read the clock before serializing/printing so that only the search itself
 # is reported as query time.
 elapsed = time() - start

 rprint(json.dumps(results, indent=2))
 rprint(f"Query time {elapsed:0.3f}")
 db.archive_collection(collection_name)
diff --git a/pgml-sdks/python/pgml/pgml/__init__.py b/pgml-sdks/python/pgml/pgml/__init__.py
 from .database import Database
 from .collection import Collection
 from .dbutils import (
    run_create_or_insert_statement,
    run_select_statement,
    run_drop_or_delete_statement,
 )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,38 @@
		# PostgresML Python SDK
		This Python SDK provides an easy interface to use PostgresML generative AI capabilities.

		## Table of Contents

		- [Quickstart](#quickstart)

		### Quickstart
		1. Install Python 3.11. The SDK should work with Python >=3.8; however, at this time it has only been tested with Python 3.11.
		2. Clone the repository and check out the SDK branch (required until the PR is merged)
		```
		git clone https://github.com/postgresml/postgresml
		cd postgresml
		git checkout santi-pgml-memory-sdk-python
		cd pgml-sdks/python/pgml
		```
		3. Install poetry `pip install poetry`
		4. Initialize Python environment

		```
		poetry env use python3.11
		poetry shell
		poetry install
		poetry build
		```
		5. The SDK uses your local PostgresML database by default:
		`postgres://postgres@127.0.0.1:5433/pgml_development`

		If your local database does not have an up-to-date `pgml.embed`, please [sign up for a free database](https://postgresml.org/signup) and set the `PGML_CONNECTION` environment variable to the connection string of your serverless hosted database.

		```
		export PGML_CONNECTION="postgres://<username>:<password>@<hostname>:<port>/<database>"
		```
		6. Run a vector search example
		```
		python examples/vector_search.py
		```
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,236 @@
		{
		"cells": [
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"from pgml import Database\n",
		"import os\n",
		"import json"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n",
		"\n",
		"conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n",
		"db = Database(conninfo,min_connections=4)"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection_name = \"test_pgml_sdk_1\"\n",
		"collection = db.create_or_get_collection(collection_name)"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"from datasets import load_dataset\n",
		"\n",
		"data = load_dataset(\"squad\", split=\"train\")\n",
		"data = data.to_pandas()\n",
		"data.head()\n",
		"\n",
		"data = data.drop_duplicates(subset=[\"context\"])\n",
		"print(len(data))\n",
		"data.head()\n",
		"\n",
		"documents = [\n",
		" {\n",
		" 'text': r['context'],\n",
		" 'metadata': {\n",
		" 'title': r['title']\n",
		" }\n",
		" } for r in data.to_dict(orient='records')\n",
		"]\n",
		"documents[:3]"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.upsert_documents(documents[0:200])\n",
		"collection.generate_chunks()\n",
		"collection.generate_embeddings()"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2)\n",
		"print(json.dumps(results,indent=2))"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.register_model(model_name=\"paraphrase-MiniLM-L6-v2\")"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.get_models()"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"print(json.dumps(collection.get_models(),indent=2))"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.generate_embeddings(model_id=2)"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=2)\n",
		"print(json.dumps(results,indent=2))"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.register_model(model_name=\"hkunlp/instructor-xl\", model_params={\"instruction\": \"Represent the Wikipedia document for retrieval: \"})"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.get_models()"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.generate_embeddings(model_id=3)"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=3, query_parameters={\"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"})\n",
		"print(json.dumps(results,indent=2))"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.register_text_splitter(splitter_name=\"RecursiveCharacterTextSplitter\",splitter_params={\"chunk_size\": 100,\"chunk_overlap\": 20})"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.generate_chunks(splitter_id=2)"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"collection.generate_embeddings(splitter_id=2)"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, splitter_id=2)\n",
		"print(json.dumps(results,indent=2))"
		]
		},
		{
		"cell_type": "code",
		"execution_count": null,
		"metadata": {},
		"outputs": [],
		"source": [
		"db.delete_collection(collection_name)"
		]
		}
		],
		"metadata": {
		"kernelspec": {
		"display_name": "pgml-zoggicR5-py3.11",
		"language": "python",
		"name": "python3"
		},
		"language_info": {
		"codemirror_mode": {
		"name": "ipython",
		"version": 3
		},
		"file_extension": ".py",
		"mimetype": "text/x-python",
		"name": "python",
		"nbconvert_exporter": "python",
		"pygments_lexer": "ipython3",
		"version": "3.11.3"
		},
		"orig_nbformat": 4
		},
		"nbformat": 4,
		"nbformat_minor": 2
		}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,34 @@
		from pgml import Database
		import os
		import json
		from datasets import load_dataset
		from time import time
		from rich import print as rprint

		# Connection string used when PGML_CONNECTION is not set in the environment.
		local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development"

		conninfo = os.environ.get("PGML_CONNECTION", local_pgml)
		db = Database(conninfo)

		collection_name = "test_pgml_sdk_1"
		collection = db.create_or_get_collection(collection_name)

		# Number of documents uploaded by this example.
		max_documents = 200

		data = load_dataset("squad", split="train")
		data = data.to_pandas()
		# Keep a single row per distinct context.
		data = data.drop_duplicates(subset=["context"])

		# Convert only the rows that are actually uploaded instead of building a
		# dict for every row and slicing afterwards.
		documents = [
		    {"id": r["id"], "text": r["context"], "title": r["title"]}
		    for r in data.head(max_documents).to_dict(orient="records")
		]

		# Store the documents, then run the SDK's chunking and embedding steps.
		# NOTE(review): presumably the default splitter and model are used when no
		# ids are passed -- confirm against the SDK.
		collection.upsert_documents(documents)
		collection.generate_chunks()
		collection.generate_embeddings()

		start = time()
		results = collection.vector_search("Who won 20 grammy awards?", top_k=2)
		# Read the clock before serializing/printing so that only the search itself
		# is reported as query time.
		elapsed = time() - start

		rprint(json.dumps(results, indent=2))
		rprint(f"Query time {elapsed:0.3f}")
		db.archive_collection(collection_name)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,7 @@
		from .database import Database
		from .collection import Collection
		from .dbutils import (
		run_create_or_insert_statement,
		run_select_statement,
		run_drop_or_delete_statement,
		)