May 31, 2023 · May 24, 2023 · May 25, 2023 · May 25, 2023 · May 25, 2023 · May 25, 2023
diff --git a/pgml-extension/examples/chunking.sql b/pgml-extension/examples/chunking.sql
 -- Chunk text for LLM embeddings and vectorization.
 --
 -- Creates the documents, splitters and document_chunks tables, loads one
 -- sample document and one splitter, then stores the rows returned by
 -- pgml.chunk() for that pair.

 -- IF EXISTS lets the script run on a database where the tables do not exist yet.
 DROP TABLE IF EXISTS documents CASCADE;
 CREATE TABLE documents (
 id BIGSERIAL PRIMARY KEY,
 document TEXT NOT NULL,
 created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
 );

 DROP TABLE IF EXISTS splitters CASCADE;
 CREATE TABLE splitters (
 id BIGSERIAL PRIMARY KEY,
 splitter VARCHAR NOT NULL DEFAULT 'recursive_character'
 );

 DROP TABLE IF EXISTS document_chunks CASCADE;
 CREATE TABLE document_chunks(
 id BIGSERIAL PRIMARY KEY,
 document_id BIGINT NOT NULL REFERENCES documents(id),
 splitter_id BIGINT NOT NULL REFERENCES splitters(id),
 chunk_index BIGINT NOT NULL,
 chunk VARCHAR
 );

 -- Explicit id so the CTE below can select the document by id = 1.
 INSERT INTO documents VALUES (
 1,
 'It was the best of times, it was the worst of times, it was the age of wisdom,
 it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light,
 it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us,
 we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like
 the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.',
 NOW()
 );

 -- Explicit id so the CTE below can select the splitter by id = 1.
 INSERT INTO splitters VALUES (1, 'recursive_character');

 -- Pick the document and the splitter, then store one row per chunk returned by pgml.chunk().
 WITH document AS (
 SELECT id, document
 FROM documents
 WHERE id = 1
 ),

 splitter AS (
 SELECT id, splitter
 FROM splitters
 WHERE id = 1
 )

 INSERT INTO document_chunks SELECT
 nextval('document_chunks_id_seq'::regclass),
 (SELECT id FROM document),
 (SELECT id FROM splitter),
 chunk_index,
 chunk
 FROM
 pgml.chunk(
 (SELECT splitter FROM splitter),
 (SELECT document FROM document),
 '{"chunk_size": 2, "chunk_overlap": 2}'
 );

 -- ORDER BY makes the five rows shown well-defined.
 SELECT * FROM document_chunks ORDER BY chunk_index LIMIT 5;
diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt
 tqdm==4.65.0
 transformers==4.29.2
 xgboost==1.7.5
 langchain==0.0.180
diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs

 /// Returns the `Vec<f32>` produced by `crate::bindings::transformers::embed`
 /// for `transformer` and `text`, forwarding the JSON value held in `kwargs`.
 #[pg_extern(immutable, parallel_safe)]
 pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<f32> {
    // NOTE(review): the two calls below look like the removed/added pair of a
    // diff hunk (`&text` vs `text`); presumably only the second one belongs in
    // the file — confirm against the real source.
    crate::bindings::transformers::embed(transformer, &text, &kwargs.0)
    crate::bindings::transformers::embed(transformer, text, &kwargs.0)
 }

 /// Splits `text` into chunks using the splitter named by `splitter`.
 ///
 /// The JSON value held in `kwargs` is forwarded unchanged to
 /// `crate::bindings::langchain::chunk`. One `(chunk_index, chunk)` row is
 /// produced per returned chunk; `chunk_index` starts at 1 and follows the
 /// order in which the chunks were returned.
 ///
 /// Gated on the `python` feature because `bindings::langchain` is only
 /// compiled when that feature is enabled; without the gate a build that
 /// disables the feature cannot resolve the call below.
 #[cfg(feature = "python")]
 #[pg_extern(immutable, parallel_safe)]
 pub fn chunk(
    splitter: &str,
    text: &str,
    kwargs: default!(JsonB, "'{}'"),
 ) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
    let chunks = crate::bindings::langchain::chunk(splitter, text, &kwargs.0)
        .into_iter()
        .enumerate()
        // `enumerate` is 0-based; the SQL-facing index is 1-based.
        .map(|(i, chunk)| (i as i64 + 1, chunk))
        .collect::<Vec<(i64, String)>>();

    TableIterator::new(chunks.into_iter())
 }

 #[cfg(feature = "python")]
    task: JsonB,
    args: default!(JsonB, "'{}'"),
    inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
    cache: default!(bool, false)
    cache: default!(bool, false),
 ) -> JsonB {
    JsonB(crate::bindings::transformers::transform(
        &task.0, &args.0, &inputs,
    task: String,
    args: default!(JsonB, "'{}'"),
    inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
    cache: default!(bool, false)
    cache: default!(bool, false),
 ) -> JsonB {
    let mut task_map = HashMap::new();
    task_map.insert("task", task);
diff --git a/pgml-extension/src/bindings/langchain.py b/pgml-extension/src/bindings/langchain.py
 from langchain.text_splitter import (
    CharacterTextSplitter,
    LatexTextSplitter,
    MarkdownTextSplitter,
    NLTKTextSplitter,
    PythonCodeTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
 )
 import json

 # Maps the splitter names accepted by chunk() to the langchain text splitter
 # class that is instantiated for each name.
 SPLITTERS = {
    "character": CharacterTextSplitter,
    "latex": LatexTextSplitter,
    "markdown": MarkdownTextSplitter,
    "nltk": NLTKTextSplitter,
    "python": PythonCodeTextSplitter,
    "recursive_character": RecursiveCharacterTextSplitter,
    "spacy": SpacyTextSplitter,
 }


 def chunk(splitter, text, args):
    """Split ``text`` with the splitter registered under ``splitter``.

    ``args`` is a JSON string; the decoded object is passed as keyword
    arguments to the splitter class constructor. Returns whatever the
    splitter's ``split_text`` returns.

    Raises ``ValueError`` when ``splitter`` is not a key of ``SPLITTERS``.
    """
    # Decode first so malformed JSON is reported before the splitter lookup.
    options = json.loads(args)

    if splitter not in SPLITTERS:
        raise ValueError("Unknown splitter: {}".format(splitter))

    splitter_cls = SPLITTERS[splitter]
    return splitter_cls(**options).split_text(text)
diff --git a/pgml-extension/src/bindings/langchain.rs b/pgml-extension/src/bindings/langchain.rs
 use once_cell::sync::Lazy;
 use pgrx::*;
 use pyo3::prelude::*;
 use pyo3::types::PyTuple;

 /// Python module built on first access from the `langchain.py` source that
 /// is embedded into the binary at compile time.
 ///
 /// Initialization panics if the Python source fails to load as a module.
 static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
    // The path is resolved relative to the crate manifest at compile time.
    const SOURCE: &str = include_str!(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/src/bindings/langchain.py"
    ));

    Python::with_gil(|py| -> Py<PyModule> {
        let module = PyModule::from_code(py, SOURCE, "", "").unwrap();
        module.into()
    })
 });

 /// Calls the Python `chunk` function of the embedded langchain module.
 ///
 /// `kwargs` is serialized to a JSON string before it is handed to Python;
 /// the Python return value is extracted as a `Vec<String>`.
 ///
 /// # Panics
 ///
 /// Panics if `kwargs` cannot be serialized, the module has no `chunk`
 /// attribute, the Python call fails, or the result cannot be extracted as
 /// `Vec<String>`.
 pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec<String> {
    crate::bindings::venv::activate();

    let kwargs_json = serde_json::to_string(kwargs).unwrap();

    Python::with_gil(|py| -> Vec<String> {
        let py_chunk: Py<PyAny> = PY_MODULE.getattr(py, "chunk").unwrap().into();

        // Positional arguments, in the order the Python function declares them.
        let py_args = PyTuple::new(
            py,
            &[
                splitter.into_py(py),
                text.into_py(py),
                kwargs_json.into_py(py),
            ],
        );

        let py_result = py_chunk.call1(py, py_args).unwrap();
        py_result.extract(py).unwrap()
    })
 }
diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs

 use crate::orm::*;

 #[cfg(feature = "python")]
 pub mod langchain;
 pub mod lightgbm;
 pub mod linfa;
 #[cfg(feature = "python")]
diff --git a/pgml-extension/src/bindings/transformers.rs b/pgml-extension/src/bindings/transformers.rs
                py,
                PyTuple::new(
                    py,
                    &[
                        task.into_py(py),
                        args.into_py(py),
                        inputs.into_py(py),
                    ],
                    &[task.into_py(py), args.into_py(py), inputs.into_py(py)],
                ),
            )
            .unwrap()
diff --git a/pgml-extension/tests/test.sql b/pgml-extension/tests/test.sql
 \i examples/multi_classification.sql
 \i examples/regression.sql
 \i examples/vectors.sql
 \i examples/chunking.sql
 -- transformers are generally too slow to run in the test suite
 --\i examples/transformers.sql
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,62 @@
		-- Chunk text for LLM embeddings and vectorization.
		--
		-- Creates the documents, splitters and document_chunks tables, loads one
		-- sample document and one splitter, then stores the rows returned by
		-- pgml.chunk() for that pair.

		-- IF EXISTS lets the script run on a database where the tables do not exist yet.
		DROP TABLE IF EXISTS documents CASCADE;
		CREATE TABLE documents (
		id BIGSERIAL PRIMARY KEY,
		document TEXT NOT NULL,
		created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
		);

		DROP TABLE IF EXISTS splitters CASCADE;
		CREATE TABLE splitters (
		id BIGSERIAL PRIMARY KEY,
		splitter VARCHAR NOT NULL DEFAULT 'recursive_character'
		);

		DROP TABLE IF EXISTS document_chunks CASCADE;
		CREATE TABLE document_chunks(
		id BIGSERIAL PRIMARY KEY,
		document_id BIGINT NOT NULL REFERENCES documents(id),
		splitter_id BIGINT NOT NULL REFERENCES splitters(id),
		chunk_index BIGINT NOT NULL,
		chunk VARCHAR
		);

		-- Explicit id so the CTE below can select the document by id = 1.
		INSERT INTO documents VALUES (
		1,
		'It was the best of times, it was the worst of times, it was the age of wisdom,
		it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light,
		it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us,
		we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like
		the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.',
		NOW()
		);

		-- Explicit id so the CTE below can select the splitter by id = 1.
		INSERT INTO splitters VALUES (1, 'recursive_character');

		-- Pick the document and the splitter, then store one row per chunk returned by pgml.chunk().
		WITH document AS (
		SELECT id, document
		FROM documents
		WHERE id = 1
		),

		splitter AS (
		SELECT id, splitter
		FROM splitters
		WHERE id = 1
		)

		INSERT INTO document_chunks SELECT
		nextval('document_chunks_id_seq'::regclass),
		(SELECT id FROM document),
		(SELECT id FROM splitter),
		chunk_index,
		chunk
		FROM
		pgml.chunk(
		(SELECT splitter FROM splitter),
		(SELECT document FROM document),
		'{"chunk_size": 2, "chunk_overlap": 2}'
		);

		-- ORDER BY makes the five rows shown well-defined.
		SELECT * FROM document_chunks ORDER BY chunk_index LIMIT 5;
Original file line number	Diff line number	Diff line change
Expand Up		@@ -18,3 +18,4 @@ torchvision==0.15.2
		tqdm==4.65.0
		transformers==4.29.2
		xgboost==1.7.5
		langchain==0.0.180
Original file line number	Diff line number	Diff line change
Expand Up		@@ -565,7 +565,23 @@ fn load_dataset(

		/// Returns the `Vec<f32>` produced by `crate::bindings::transformers::embed`
		/// for `transformer` and `text`, forwarding the JSON value held in `kwargs`.
		#[pg_extern(immutable, parallel_safe)]
		pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<f32> {
		// NOTE(review): the two calls below look like the removed/added pair of a
		// diff hunk (`&text` vs `text`); presumably only the second one belongs in
		// the file — confirm against the real source.
		crate::bindings::transformers::embed(transformer, &text, &kwargs.0)
		crate::bindings::transformers::embed(transformer, text, &kwargs.0)
		}

		/// Splits `text` into chunks using the splitter named by `splitter`.
		///
		/// The JSON value held in `kwargs` is forwarded unchanged to
		/// `crate::bindings::langchain::chunk`. One `(chunk_index, chunk)` row is
		/// produced per returned chunk; `chunk_index` starts at 1 and follows the
		/// order in which the chunks were returned.
		///
		/// Gated on the `python` feature because `bindings::langchain` is only
		/// compiled when that feature is enabled; without the gate a build that
		/// disables the feature cannot resolve the call below.
		#[cfg(feature = "python")]
		#[pg_extern(immutable, parallel_safe)]
		pub fn chunk(
		    splitter: &str,
		    text: &str,
		    kwargs: default!(JsonB, "'{}'"),
		) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
		    let chunks = crate::bindings::langchain::chunk(splitter, text, &kwargs.0)
		        .into_iter()
		        .enumerate()
		        // `enumerate` is 0-based; the SQL-facing index is 1-based.
		        .map(|(i, chunk)| (i as i64 + 1, chunk))
		        .collect::<Vec<(i64, String)>>();

		    TableIterator::new(chunks.into_iter())
		}

		#[cfg(feature = "python")]
Expand All		@@ -575,7 +591,7 @@ pub fn transform_json(
		task: JsonB,
		args: default!(JsonB, "'{}'"),
		inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
		cache: default!(bool, false)
		cache: default!(bool, false),
		) -> JsonB {
		JsonB(crate::bindings::transformers::transform(
		&task.0, &args.0, &inputs,
Expand All		@@ -589,7 +605,7 @@ pub fn transform_string(
		task: String,
		args: default!(JsonB, "'{}'"),
		inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
		cache: default!(bool, false)
		cache: default!(bool, false),
		) -> JsonB {
		let mut task_map = HashMap::new();
		task_map.insert("task", task);
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,29 @@
		from langchain.text_splitter import (
		CharacterTextSplitter,
		LatexTextSplitter,
		MarkdownTextSplitter,
		NLTKTextSplitter,
		PythonCodeTextSplitter,
		RecursiveCharacterTextSplitter,
		SpacyTextSplitter,
		)
		import json

		# Maps the splitter names accepted by chunk() to the langchain text splitter
		# class that is instantiated for each name.
		SPLITTERS = {
		"character": CharacterTextSplitter,
		"latex": LatexTextSplitter,
		"markdown": MarkdownTextSplitter,
		"nltk": NLTKTextSplitter,
		"python": PythonCodeTextSplitter,
		"recursive_character": RecursiveCharacterTextSplitter,
		"spacy": SpacyTextSplitter,
		}


		def chunk(splitter, text, args):
		    """Split ``text`` with the splitter registered under ``splitter``.

		    ``args`` is a JSON string; the decoded object is passed as keyword
		    arguments to the splitter class constructor. Returns whatever the
		    splitter's ``split_text`` returns.

		    Raises ``ValueError`` when ``splitter`` is not a key of ``SPLITTERS``.
		    """
		    # Decode first so malformed JSON is reported before the splitter lookup.
		    options = json.loads(args)

		    if splitter not in SPLITTERS:
		        raise ValueError("Unknown splitter: {}".format(splitter))

		    splitter_cls = SPLITTERS[splitter]
		    return splitter_cls(**options).split_text(text)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,37 @@
		use once_cell::sync::Lazy;
		use pgrx::*;
		use pyo3::prelude::*;
		use pyo3::types::PyTuple;

		/// Python module built on first access from the `langchain.py` source that
		/// is embedded into the binary at compile time.
		///
		/// Initialization panics if the Python source fails to load as a module.
		static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
		    // The path is resolved relative to the crate manifest at compile time.
		    const SOURCE: &str = include_str!(concat!(
		        env!("CARGO_MANIFEST_DIR"),
		        "/src/bindings/langchain.py"
		    ));

		    Python::with_gil(|py| -> Py<PyModule> {
		        let module = PyModule::from_code(py, SOURCE, "", "").unwrap();
		        module.into()
		    })
		});

		/// Calls the Python `chunk` function of the embedded langchain module.
		///
		/// `kwargs` is serialized to a JSON string before it is handed to Python;
		/// the Python return value is extracted as a `Vec<String>`.
		///
		/// # Panics
		///
		/// Panics if `kwargs` cannot be serialized, the module has no `chunk`
		/// attribute, the Python call fails, or the result cannot be extracted as
		/// `Vec<String>`.
		pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec<String> {
		    crate::bindings::venv::activate();

		    let kwargs_json = serde_json::to_string(kwargs).unwrap();

		    Python::with_gil(|py| -> Vec<String> {
		        let py_chunk: Py<PyAny> = PY_MODULE.getattr(py, "chunk").unwrap().into();

		        // Positional arguments, in the order the Python function declares them.
		        let py_args = PyTuple::new(
		            py,
		            &[
		                splitter.into_py(py),
		                text.into_py(py),
		                kwargs_json.into_py(py),
		            ],
		        );

		        let py_result = py_chunk.call1(py, py_args).unwrap();
		        py_result.extract(py).unwrap()
		    })
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,6 +5,8 @@ use pgrx::*;

		use crate::orm::*;

		#[cfg(feature = "python")]
		pub mod langchain;
		pub mod lightgbm;
		pub mod linfa;
		#[cfg(feature = "python")]
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -40,11 +40,7 @@ pub fn transform(
		py,
		PyTuple::new(
		py,
		&[
		task.into_py(py),
		args.into_py(py),
		inputs.into_py(py),
		],
		&[task.into_py(py), args.into_py(py), inputs.into_py(py)],
		),
		)
		.unwrap()
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine');
		\i examples/multi_classification.sql
		\i examples/regression.sql
		\i examples/vectors.sql
		\i examples/chunking.sql
		-- transformers are generally too slow to run in the test suite
		--\i examples/transformers.sql