Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

vllm support#1063

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
kczimm wants to merge 9 commits into master
base:master
Choose a base branch
Loading
from kczimm-vllm-support
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
PrevPrevious commit
NextNext commit
swap out vLLM model if new
  • Loading branch information
@kczimm
kczimm committed Oct 19, 2023
commit d017cd6bd9544d14e27eb8ef00b900e5b64a6c89
27 changes: 1 addition & 26 deletionspgml-extension/src/api.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -2,15 +2,13 @@ use std::fmt::Write;
use std::str::FromStr;

use ndarray::Zip;
use once_cell::sync::OnceCell;
use pgrx::iter::{SetOfIterator, TableIterator};
use pgrx::*;
use serde_json::Value;

#[cfg(feature = "python")]
use serde_json::json;

use crate::bindings::vllm::{LLMBuilder, LLM};
#[cfg(feature = "python")]
use crate::orm::*;

Expand DownExpand Up@@ -642,30 +640,7 @@ fn transform(mut task: Value, args: Value, inputs: Vec<&str>) -> anyhow::Result<
});

if use_vllm {
crate::bindings::python::activate().unwrap();

static LAZY_LLM: OnceCell<LLM> = OnceCell::new();
let llm = LAZY_LLM.get_or_init(move || {
let builder = match LLMBuilder::try_from(task) {
Ok(b) => b,
Err(e) => error!("{e}"),
};
builder.build().unwrap()
});

let outputs = llm
.generate(&inputs, None)?
.iter()
.map(|o| {
o.outputs()
.unwrap()
.iter()
.map(|o| o.text().unwrap())
.collect::<Vec<_>>()
})
.collect::<Vec<Vec<_>>>();

Ok(json!(outputs))
Ok(crate::bindings::vllm::vllm_inference(&task, &inputs)?)
} else {
if let Some(map) = task.as_object_mut() {
// pop backend keyword, if present
Expand Down
75 changes: 75 additions & 0 deletionspgml-extension/src/bindings/vllm/inference.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
use parking_lot::Mutex;
use pyo3::prelude::*;
use serde_json::{json, Value};

use super::LLM;

// Process-wide cache of the most recently built vLLM model (`None` until
// first use); guarded so a model swap cannot race concurrent inference.
static MODEL: Mutex<Option<LLM>> = Mutex::new(None);

pub fn vllm_inference(task: &Value, inputs: &[&str]) -> PyResult<Value> {
crate::bindings::python::activate().expect("python venv activate");
let mut model = MODEL.lock();

let llm = match get_model_name(&model, task) {
ModelName::Same => model.as_mut().expect("ModelName::Same as_mut"),
ModelName::Different(name) => {
if let Some(llm) = model.take() {
// delete old model, exists
destroy_model_parallel(llm)?;
}
// make new model
let llm = LLM::new(&name)?;
model.insert(llm)
}
};

let outputs = llm
.generate(&inputs, None)?
.iter()
.map(|o| {
o.outputs()
.expect("RequestOutput::outputs()")
.iter()
.map(|o| o.text().expect("CompletionOutput::text()"))
.collect::<Vec<_>>()
})
.collect::<Vec<Vec<_>>>();

Ok(json!(outputs))
}

fn get_model_name<M>(model: &M, task: &Value) -> ModelName
where
M: std::ops::Deref<Target = Option<LLM>>,
{
match task
.as_object()
.and_then(|obj| obj.get("model").and_then(|m| m.as_str()))
{
Some(name) => match model.as_ref() {
Some(llm) if llm.model() == name => ModelName::Same,
_ => ModelName::Different(name.to_string()),
},
None => ModelName::Same,
}
}

/// Result of comparing the task's requested model against the cached one.
enum ModelName {
    /// Reuse the cached model (also returned when the task names no model).
    Same,
    /// Build a new model with this name, replacing the cached one.
    Different(String),
}

// Work around vLLM keeping global model-parallel state alive after the `LLM`
// handle is dropped; see
// https://github.com/vllm-project/vllm/issues/565#issuecomment-1725174811
fn destroy_model_parallel(llm: LLM) -> PyResult<()> {
    Python::with_gil(|py| {
        // Tear down vLLM's global parallel state first...
        PyModule::import(py, "vllm")?
            .getattr("model_executor")?
            .getattr("parallel_utils")?
            .getattr("parallel_state")?
            .getattr("destroy_model_parallel")?
            .call0()?;
        // ...then drop our handle while still holding the GIL and force a
        // Python garbage-collection pass so the underlying objects are freed.
        drop(llm);
        PyModule::import(py, "gc")?.getattr("collect")?.call0()?;
        Ok(())
    })
}
12 changes: 10 additions & 2 deletionspgml-extension/src/bindings/vllm/llm.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -29,6 +29,7 @@ pub enum Quantization {
}

pub struct LLM {
    // Model identifier this instance was built from; used to detect when a
    // task requests a different model and the cached instance must be swapped.
    model: String,
    // Handle to the underlying Python `vllm.LLM` object.
    inner: PyObject,
}

Expand DownExpand Up@@ -133,7 +134,7 @@ impl LLMBuilder {
pub fn build(self) -> PyResult<LLM> {
let inner = Python::with_gil(|py| -> PyResult<PyObject> {
let kwargs = PyDict::new(py);
kwargs.set_item("model", self.model)?;
kwargs.set_item("model", self.model.clone())?;
kwargs.set_item("tokenizer", self.tokenizer)?;
kwargs.set_item("tokenizer_mode", self.tokenizer_mode)?;
kwargs.set_item("trust_remote_code", self.trust_remote_code)?;
Expand All@@ -149,7 +150,10 @@ impl LLMBuilder {
vllm.getattr("LLM")?.call((), Some(kwargs))?.extract()
})?;

Ok(LLM { inner })
Ok(LLM {
inner,
model: self.model,
})
}
}

Expand DownExpand Up@@ -184,6 +188,10 @@ impl LLM {
.extract(py)
})
}

/// The model identifier this `LLM` was constructed with.
pub fn model(&self) -> &str {
    self.model.as_str()
}
}

impl ToPyObject for TokenizerMode {
Expand Down
2 changes: 2 additions & 0 deletionspgml-extension/src/bindings/vllm/mod.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
//! Rust bindings to the Python package `vllm`.

mod inference;
mod llm;
mod outputs;
mod params;

pub use inference::*;
pub use llm::*;
pub use outputs::*;
pub use params::*;

[8]ページ先頭

©2009-2025 Movatter.jp