Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Add LangChain splitters#655

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
levkk merged 5 commits into master from levkk-langchain
May 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions pgml-extension/examples/chunking.sql
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
--- Chunk text for LLM embeddings and vectorization.

-- IF EXISTS makes the script idempotent: a plain DROP TABLE raises an
-- error the first time this example runs against a fresh database.
DROP TABLE IF EXISTS documents CASCADE;
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    document TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

DROP TABLE IF EXISTS splitters CASCADE;
CREATE TABLE splitters (
    id BIGSERIAL PRIMARY KEY,
    splitter VARCHAR NOT NULL DEFAULT 'recursive_character'
);

-- One row per (document, splitter, chunk ordinal).
DROP TABLE IF EXISTS document_chunks CASCADE;
CREATE TABLE document_chunks(
    id BIGSERIAL PRIMARY KEY,
    document_id BIGINT NOT NULL REFERENCES documents(id),
    splitter_id BIGINT NOT NULL REFERENCES splitters(id),
    chunk_index BIGINT NOT NULL,
    chunk VARCHAR
);

INSERT INTO documents VALUES (
    1,
    'It was the best of times, it was the worst of times, it was the age of wisdom,
it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light,
it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us,
we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like
the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.',
    NOW()
);

INSERT INTO splitters VALUES (1, 'recursive_character');

-- Chunk document 1 with splitter 1 and store the numbered chunks.
WITH document AS (
    SELECT id, document
    FROM documents
    WHERE id = 1
),

splitter AS (
    SELECT id, splitter
    FROM splitters
    WHERE id = 1
)

INSERT INTO document_chunks SELECT
    nextval('document_chunks_id_seq'::regclass),
    (SELECT id FROM document),
    (SELECT id FROM splitter),
    chunk_index,
    chunk
FROM
    pgml.chunk(
        (SELECT splitter FROM splitter),
        (SELECT document FROM document),
        '{"chunk_size": 2, "chunk_overlap": 2}'
    );

SELECT * FROM document_chunks LIMIT 5;
1 change: 1 addition & 0 deletions pgml-extension/requirements.txt
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -18,3 +18,4 @@ torchvision==0.15.2
tqdm==4.65.0
transformers==4.29.2
xgboost==1.7.5
langchain==0.0.180
22 changes: 19 additions & 3 deletions pgml-extension/src/api.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -565,7 +565,23 @@ fn load_dataset(

/// Compute an embedding vector for `text` with the given transformer model.
/// `kwargs` is forwarded untouched to the Python transformers bindings.
#[pg_extern(immutable, parallel_safe)]
pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<f32> {
    // The page scrape kept both the old (`&text`, a needless &&str) and new
    // diff lines; only the corrected call belongs in the function body.
    crate::bindings::transformers::embed(transformer, text, &kwargs.0)
}

/// Split `text` into chunks with the named langchain splitter and return a
/// set of `(chunk_index, chunk)` rows, with `chunk_index` starting at 1
/// (SQL convention for row ordinals).
///
/// `kwargs` is passed through to the splitter's constructor
/// (e.g. `{"chunk_size": ..., "chunk_overlap": ...}`).
#[pg_extern(immutable, parallel_safe)]
pub fn chunk(
    splitter: &str,
    text: &str,
    kwargs: default!(JsonB, "'{}'"),
) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
    // The enumerate/map chain is already an owned 'static iterator, so it can
    // feed TableIterator directly — no intermediate Vec collect needed
    // (clippy: needless_collect).
    TableIterator::new(
        crate::bindings::langchain::chunk(splitter, text, &kwargs.0)
            .into_iter()
            .enumerate()
            .map(|(i, chunk)| (i as i64 + 1, chunk)),
    )
}

#[cfg(feature = "python")]
Expand All@@ -575,7 +591,7 @@ pub fn transform_json(
task: JsonB,
args: default!(JsonB, "'{}'"),
inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
cache: default!(bool, false)
cache: default!(bool, false),
) -> JsonB {
JsonB(crate::bindings::transformers::transform(
&task.0, &args.0, &inputs,
Expand All@@ -589,7 +605,7 @@ pub fn transform_string(
task: String,
args: default!(JsonB, "'{}'"),
inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
cache: default!(bool, false)
cache: default!(bool, false),
) -> JsonB {
let mut task_map = HashMap::new();
task_map.insert("task", task);
Expand Down
29 changes: 29 additions & 0 deletions pgml-extension/src/bindings/langchain.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
from langchain.text_splitter import (
CharacterTextSplitter,
LatexTextSplitter,
MarkdownTextSplitter,
NLTKTextSplitter,
PythonCodeTextSplitter,
RecursiveCharacterTextSplitter,
SpacyTextSplitter,
)
import json

# Maps the splitter name accepted by pgml.chunk() to the langchain
# text-splitter class that implements it.
SPLITTERS = {
    "character": CharacterTextSplitter,
    "latex": LatexTextSplitter,
    "markdown": MarkdownTextSplitter,
    "nltk": NLTKTextSplitter,
    "python": PythonCodeTextSplitter,
    "recursive_character": RecursiveCharacterTextSplitter,
    "spacy": SpacyTextSplitter,
}


def chunk(splitter, text, args):
    """Split ``text`` into chunks with the named langchain splitter.

    ``args`` is a JSON string of keyword arguments forwarded to the
    splitter class constructor (e.g. chunk_size, chunk_overlap).
    Raises ValueError when ``splitter`` is not a registered name.
    """
    kwargs = json.loads(args)

    splitter_class = SPLITTERS.get(splitter)
    if splitter_class is None:
        raise ValueError("Unknown splitter: {}".format(splitter))

    return splitter_class(**kwargs).split_text(text)
37 changes: 37 additions & 0 deletions pgml-extension/src/bindings/langchain.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
use once_cell::sync::Lazy;
use pgrx::*;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

// Lazily-initialized handle to the embedded langchain.py module: the Python
// source is baked into the binary at compile time and compiled once, under
// the GIL, on first access.
static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
    Python::with_gil(|py| -> Py<PyModule> {
        // Embed the sibling Python file's source at compile time.
        let src = include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/src/bindings/langchain.py"
        ));

        // NOTE(review): file name and module name are passed as "" — works,
        // but Python-side tracebacks will show an anonymous module; consider
        // naming them for easier debugging.
        PyModule::from_code(py, src, "", "").unwrap().into()
    })
});

/// Call the `chunk` entry point of the embedded langchain.py module and
/// return the resulting list of text chunks.
///
/// `kwargs` is serialized to a JSON string because the Python side expects
/// a string it can `json.loads()`.
pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec<String> {
    crate::bindings::venv::activate();

    let kwargs_json = serde_json::to_string(kwargs).unwrap();

    Python::with_gil(|py| -> Vec<String> {
        let chunk_fn: Py<PyAny> = PY_MODULE.getattr(py, "chunk").unwrap().into();

        let args = PyTuple::new(
            py,
            &[
                splitter.into_py(py),
                text.into_py(py),
                kwargs_json.into_py(py),
            ],
        );

        chunk_fn.call1(py, args).unwrap().extract(py).unwrap()
    })
}
2 changes: 2 additions & 0 deletions pgml-extension/src/bindings/mod.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -5,6 +5,8 @@ use pgrx::*;

use crate::orm::*;

// Python-backed text splitters (langchain); only built with the Python feature.
#[cfg(feature = "python")]
pub mod langchain;
pub mod lightgbm;
pub mod linfa;
#[cfg(feature = "python")]
Expand Down
6 changes: 1 addition & 5 deletions pgml-extension/src/bindings/transformers.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -40,11 +40,7 @@ pub fn transform(
py,
PyTuple::new(
py,
&[
task.into_py(py),
args.into_py(py),
inputs.into_py(py),
],
&[task.into_py(py), args.into_py(py), inputs.into_py(py)],
),
)
.unwrap()
Expand Down
1 change: 1 addition & 0 deletions pgml-extension/tests/test.sql
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine');
\i examples/multi_classification.sql
\i examples/regression.sql
\i examples/vectors.sql
\i examples/chunking.sql
-- transformers are generally too slow to run in the test suite
--\i examples/transformers.sql

[8]ページ先頭

©2009-2025 Movatter.jp