HNSW and Migrations Done #988


Merged
SilasMarvin merged 11 commits into master from silas-hnsw-and-migrations on Sep 6, 2023
1 change: 1 addition & 0 deletions pgml-sdks/pgml/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pgml-sdks/pgml/Cargo.toml
@@ -20,7 +20,7 @@ serde_json = "1.0.9"
anyhow = "1.0.9"
tokio = { version = "1.28.2", features = [ "macros" ] }
chrono = "0.4.9"
- pyo3 = { version = "0.18.3", optional = true, features = ["extension-module"] }
+ pyo3 = { version = "0.18.3", optional = true, features = ["extension-module", "anyhow"] }
pyo3-asyncio = { version = "0.18", features = ["attributes", "tokio-runtime"], optional = true }
neon = { version = "0.10", optional = true, default-features = false, features = ["napi-6", "promise-api", "channel-api"] }
itertools = "0.10.5"
6 changes: 4 additions & 2 deletions pgml-sdks/pgml/build.rs
@@ -3,14 +3,16 @@ use std::fs::OpenOptions;
use std::io::Write;

const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#"
- def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+ def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+ async def migrate() -> None

Json = Any
DateTime = int
"#;

const ADDITIONAL_DEFAULTS_FOR_JAVASCRIPT: &[u8] = br#"
- export function js_init_logger(level?: string, format?: string): void;
+ export function init_logger(level?: string, format?: string): void;
+ export function migrate(): Promise<void>;

export type Json = { [key: string]: any };
export type DateTime = Date;
18 changes: 18 additions & 0 deletions pgml-sdks/pgml/javascript/README.md
@@ -519,6 +519,24 @@ const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
await collection.add_pipeline(pipeline)
```

### Configuring HNSW Indexing Parameters

Our SDK uses [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing because it offers the best trade-off between speed and recall.

Our SDK allows configuring `m` (the maximum number of connections per layer; 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph; 64 by default) per pipeline.

```javascript
const model = pgml.newModel()
const splitter = pgml.newSplitter()
const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
hnsw: {
m: 100,
ef_construction: 200
}
})
await collection.add_pipeline(pipeline)
```
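
In addition to the index-build parameters above, `ef_search` (the size of the dynamic candidate list used during search) can be overridden per query through the query builder's `filter` method, as the tests added in this PR demonstrate. A minimal sketch, reusing the `collection` and `pipeline` from above (the value 100 is illustrative; larger values trade query speed for recall):

```javascript
const results = await collection
  .query()
  .vector_recall("Here is some query", pipeline)
  .filter({
    hnsw: {
      ef_search: 100, // applies to this search only
    },
  })
  .limit(10)
  .fetch_all();
```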

### Searching with Pipelines

Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
@@ -1,7 +1,6 @@
const pgml = require("pgml");
require("dotenv").config();

- pgml.js_init_logger();

const main = async () => {
// Initialize the collection
@@ -1,8 +1,6 @@
const pgml = require("pgml");
require("dotenv").config();

- pgml.js_init_logger();

const main = async () => {
// Initialize the collection
const collection = pgml.newCollection("my_javascript_sqa_collection");
56 changes: 55 additions & 1 deletion pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts
@@ -10,7 +10,7 @@ import pgml from "../../index.js";
////////////////////////////////////////////////////////////////////////////////////

const LOG_LEVEL = process.env.LOG_LEVEL ? process.env.LOG_LEVEL : "ERROR";
- pgml.js_init_logger(LOG_LEVEL);
+ pgml.init_logger(LOG_LEVEL);

const generate_dummy_documents = (count: number) => {
let docs = [];
@@ -143,6 +143,52 @@ it("can vector search with query builder and metadata filtering", async () => {
await collection.archive();
});

it("can vector search with query builder and custom hnsfw ef_search value", async () => {
let model = pgml.newModel();
let splitter = pgml.newSplitter();
let pipeline = pgml.newPipeline("test_j_p_cvswqbachesv_0", model, splitter);
let collection = pgml.newCollection("test_j_c_cvswqbachesv_0");
await collection.upsert_documents(generate_dummy_documents(3));
await collection.add_pipeline(pipeline);
let results = await collection
.query()
.vector_recall("Here is some query", pipeline)
.filter({
hnsw: {
ef_search: 2,
},
})
.limit(10)
.fetch_all();
expect(results).toHaveLength(3);
await collection.archive();
});

it("can vector search with query builder and custom hnsfw ef_search value and remote embeddings", async () => {
let model = pgml.newModel("text-embedding-ada-002", "openai");
let splitter = pgml.newSplitter();
let pipeline = pgml.newPipeline(
"test_j_p_cvswqbachesvare_0",
model,
splitter,
);
let collection = pgml.newCollection("test_j_c_cvswqbachesvare_0");
await collection.upsert_documents(generate_dummy_documents(3));
await collection.add_pipeline(pipeline);
let results = await collection
.query()
.vector_recall("Here is some query", pipeline)
.filter({
hnsw: {
ef_search: 2,
},
})
.limit(10)
.fetch_all();
expect(results).toHaveLength(3);
await collection.archive();
});

///////////////////////////////////////////////////
// Test user output facing functions //////////////
///////////////////////////////////////////////////
@@ -220,3 +266,11 @@ it("can delete documents", async () => {

await collection.archive();
});

///////////////////////////////////////////////////
// Test migrations ////////////////////////////////
///////////////////////////////////////////////////

it("can migrate", async () => {
await pgml.migrate();
});
18 changes: 18 additions & 0 deletions pgml-sdks/pgml/python/README.md
@@ -530,6 +530,24 @@ pipeline = Pipeline("test_pipeline", model, splitter, {
await collection.add_pipeline(pipeline)
```

### Configuring HNSW Indexing Parameters

Our SDK uses [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing because it offers the best trade-off between speed and recall.

Our SDK allows configuring `m` (the maximum number of connections per layer; 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph; 64 by default) per pipeline.

```python
model = Model()
splitter = Splitter()
pipeline = Pipeline("test_pipeline", model, splitter, {
"hnsw": {
"m": 100,
"ef_construction": 200
}
})
await collection.add_pipeline(pipeline)
```
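
The `ef_search` parameter (the size of the dynamic candidate list used during search) can likewise be overridden per query through the query builder's `filter` method, as the Python tests added in this PR demonstrate. A minimal sketch, reusing the `collection` and `pipeline` from above (the value 100 is illustrative; larger values trade query speed for recall):

```python
results = (
    await collection.query()
    .vector_recall("Here is some query", pipeline)
    .filter({"hnsw": {"ef_search": 100}})  # applies to this search only
    .limit(10)
    .fetch_all()
)
```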

### Searching with Pipelines

Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
@@ -1,4 +1,4 @@
- from pgml import Collection, Model, Splitter, Pipeline, Builtins, py_init_logger
+ from pgml import Collection, Model, Splitter, Pipeline, Builtins
import json
from datasets import load_dataset
from time import time
@@ -7,9 +7,6 @@
import asyncio


- py_init_logger()


async def main():
load_dotenv()
console = Console()
89 changes: 2 additions & 87 deletions pgml-sdks/pgml/python/pgml/pgml.pyi
@@ -1,91 +1,6 @@

- def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+ def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
+ async def migrate() -> None

Json = Any
DateTime = int

- # Top of file key: A12BECOD!
- from typing import List, Dict, Optional, Self, Any
-
-
- class Builtins:
- def __init__(self, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
- ...
- def query(self, query: str) -> QueryRunner
- ...
- async def transform(self, task: Json, inputs: List[str], args: Optional[Json] = Any) -> Json
- ...
-
- class Collection:
- def __init__(self, name: str, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
- ...
- async def add_pipeline(self, pipeline: Pipeline) -> None
- ...
- async def remove_pipeline(self, pipeline: Pipeline) -> None
- ...
- async def enable_pipeline(self, pipeline: Pipeline) -> None
- ...
- async def disable_pipeline(self, pipeline: Pipeline) -> None
- ...
- async def upsert_documents(self, documents: List[Json]) -> None
- ...
- async def get_documents(self, args: Optional[Json] = Any) -> List[Json]
- ...
- async def delete_documents(self, filter: Json) -> None
- ...
- async def vector_search(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any, top_k: Optional[int] = 1) -> List[tuple[float, str, Json]]
- ...
- async def archive(self) -> None
- ...
- def query(self) -> QueryBuilder
- ...
- async def get_pipelines(self) -> List[Pipeline]
- ...
- async def get_pipeline(self, name: str) -> Pipeline
- ...
- async def exists(self) -> bool
- ...
-
- class Model:
- def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", source: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
- ...
-
- class Pipeline:
- def __init__(self, name: str, model: Optional[Model] = Any, splitter: Optional[Splitter] = Any, parameters: Optional[Json] = Any) -> Self
- ...
- async def get_status(self) -> PipelineSyncData
- ...
- async def to_dict(self) -> Json
- ...
-
- class QueryBuilder:
- def limit(self, limit: int) -> Self
- ...
- def filter(self, filter: Json) -> Self
- ...
- def vector_recall(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any) -> Self
- ...
- async def fetch_all(self) -> List[tuple[float, str, Json]]
- ...
- def to_full_string(self) -> str
- ...
-
- class QueryRunner:
- async def fetch_all(self) -> Json
- ...
- async def execute(self) -> None
- ...
- def bind_string(self, bind_value: str) -> Self
- ...
- def bind_int(self, bind_value: int) -> Self
- ...
- def bind_float(self, bind_value: float) -> Self
- ...
- def bind_bool(self, bind_value: bool) -> Self
- ...
- def bind_json(self, bind_value: Json) -> Self
- ...
-
- class Splitter:
- def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
- ...
50 changes: 49 additions & 1 deletion pgml-sdks/pgml/python/tests/test.py
@@ -19,7 +19,7 @@
print("No DATABASE_URL environment variable found. Please set one")
exit(1)

- pgml.py_init_logger()
+ pgml.init_logger()


def generate_dummy_documents(count: int) -> List[Dict[str, Any]]:
@@ -164,6 +164,44 @@ async def test_can_vector_search_with_query_builder_and_metadata_filtering():
await collection.archive()


@pytest.mark.asyncio
async def test_can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value():
model = pgml.Model()
splitter = pgml.Splitter()
pipeline = pgml.Pipeline("test_p_p_tcvswqbachesv_0", model, splitter)
collection = pgml.Collection(name="test_p_c_tcvswqbachesv_0")
await collection.upsert_documents(generate_dummy_documents(3))
await collection.add_pipeline(pipeline)
results = (
await collection.query()
.vector_recall("Here is some query", pipeline)
.filter({"hnsw": {"ef_search": 2}})
.limit(10)
.fetch_all()
)
assert len(results) == 3
await collection.archive()


@pytest.mark.asyncio
async def test_can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value_and_remote_embeddings():
model = pgml.Model(name="text-embedding-ada-002", source="openai")
splitter = pgml.Splitter()
pipeline = pgml.Pipeline("test_p_p_tcvswqbachesvare_0", model, splitter)
collection = pgml.Collection(name="test_p_c_tcvswqbachesvare_0")
await collection.upsert_documents(generate_dummy_documents(3))
await collection.add_pipeline(pipeline)
results = (
await collection.query()
.vector_recall("Here is some query", pipeline)
.filter({"hnsw": {"ef_search": 2}})
.limit(10)
.fetch_all()
)
assert len(results) == 3
await collection.archive()


###################################################
## Test user output facing functions ##############
###################################################
@@ -250,6 +288,16 @@ async def test_delete_documents():
await collection.archive()


###################################################
## Migration tests ################################
###################################################


@pytest.mark.asyncio
async def test_migrate():
await pgml.migrate()


###################################################
## Test with multiprocessing ######################
###################################################
8 changes: 4 additions & 4 deletions pgml-sdks/pgml/src/builtins.rs
@@ -92,21 +92,21 @@ impl Builtins {
#[cfg(test)]
mod tests {
use super::*;
- use crate::init_logger;
+ use crate::internal_init_logger;

#[sqlx::test]
async fn can_query() -> anyhow::Result<()> {
- init_logger(None, None).ok();
+ internal_init_logger(None, None).ok();
let builtins = Builtins::new(None);
let query = "SELECT10";
let query = "SELECT* from pgml.collections";
let results = builtins.query(query).fetch_all().await?;
assert!(results.as_array().is_some());
Ok(())
}

#[sqlx::test]
async fn can_transform() -> anyhow::Result<()> {
- init_logger(None, None).ok();
+ internal_init_logger(None, None).ok();
let builtins = Builtins::new(None);
let task = Json::from(serde_json::json!("translation_en_to_fr"));
let inputs = vec!["test1".to_string(), "test2".to_string()];