Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

vllm support#1063

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Open
kczimm wants to merge 9 commits into master
base:master
Choose a base branch
Loading
from kczimm-vllm-support
Open
Show file tree
Hide file tree
Changes fromall commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions — pgml-extension/requirements.txt
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -26,3 +26,4 @@ xgboost==2.0.0
langchain==0.0.287
einops==0.6.1
pynvml==11.5.0
vllm==0.2.0
20 changes: 19 additions & 1 deletionpgml-extension/src/api.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -4,6 +4,7 @@ use std::str::FromStr;
use ndarray::Zip;
use pgrx::iter::{SetOfIterator, TableIterator};
use pgrx::*;
use serde_json::Value;

#[cfg(feature = "python")]
use serde_json::json;
Expand DownExpand Up@@ -610,7 +611,7 @@ pub fn transform_json(
inputs: default!(Vec<&str>, "ARRAY[]::TEXT[]"),
cache: default!(bool, false),
) -> JsonB {
matchcrate::bindings::transformers::transform(&task.0,&args.0, inputs) {
match transform(task.0, args.0, inputs) {
Ok(output) => JsonB(output),
Err(e) => error!("{e}"),
}
Expand All@@ -632,6 +633,23 @@ pub fn transform_string(
}
}

/// Route a `transform` call to either the vLLM backend or the default
/// Hugging Face transformers backend.
///
/// vLLM is selected only when the task object contains a `"model"` key AND a
/// `"backend"` key whose string value is `"vllm"` (case-insensitive).
/// Otherwise the `"backend"` key (if present) is stripped — the transformers
/// binding does not understand it — and the task is forwarded unchanged.
fn transform(mut task: Value, args: Value, inputs: Vec<&str>) -> anyhow::Result<Value> {
    // Only an immutable view is needed to inspect the task; avoid borrowing
    // mutably here. `eq_ignore_ascii_case` compares without allocating,
    // unlike `to_string().to_ascii_lowercase()`.
    let use_vllm = task.as_object().is_some_and(|obj| {
        obj.contains_key("model")
            && matches!(obj.get("backend"), Some(Value::String(backend)) if backend.eq_ignore_ascii_case("vllm"))
    });

    if use_vllm {
        Ok(crate::bindings::vllm::vllm_inference(&task, &inputs)?)
    } else {
        // Pop the backend keyword, if present, before handing off.
        if let Some(map) = task.as_object_mut() {
            map.remove("backend");
        }
        crate::bindings::transformers::transform(&task, &args, inputs)
    }
}

#[cfg(feature = "python")]
#[pg_extern(immutable, parallel_safe, name = "generate")]
fn generate(project_name: &str, inputs: &str, config: default!(JsonB, "'{}'")) -> String {
Expand Down
2 changes: 2 additions & 0 deletionspgml-extension/src/bindings/mod.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -46,6 +46,8 @@ pub mod python;
pub mod sklearn;
#[cfg(feature = "python")]
pub mod transformers;
#[cfg(feature = "python")]
pub mod vllm;
pub mod xgboost;

pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Result<Box<dyn Bindings>>;
Expand Down
86 changes: 86 additions & 0 deletionspgml-extension/src/bindings/vllm/inference.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
use parking_lot::Mutex;
use pyo3::prelude::*;
use serde_json::{json, Value};

use super::LLM;

/// Cache a single model per client process. vLLM does not allow multiple, simultaneous models to be loaded.
/// See GH issue, https://github.com/vllm-project/vllm/issues/565
static MODEL: Mutex<Option<LLM>> = Mutex::new(None);

/// Run generation for `inputs` with vLLM, loading (or reusing) the model
/// named in `task["model"]`. Returns a JSON array of per-input output texts.
pub fn vllm_inference(task: &Value, inputs: &[&str]) -> PyResult<Value> {
    crate::bindings::python::activate().expect("python venv activate");
    let mut cached = MODEL.lock();

    // Reuse the cached model when the requested name matches; otherwise tear
    // down the old one (vLLM allows only one live model) and load the new one.
    let llm = match get_model_name(&cached, task) {
        ModelName::Different(name) => {
            if let Some(old) = cached.take() {
                destroy_model_parallel(old)?;
            }
            cached.insert(LLM::new(&name)?)
        }
        ModelName::Same => cached.as_mut().expect("ModelName::Same as_mut"),
    };

    // Collect the generated text of every completion for every input.
    let mut outputs: Vec<Vec<_>> = Vec::new();
    for request in llm.generate(&inputs, None)?.iter() {
        let texts = request
            .outputs()
            .expect("RequestOutput::outputs()")
            .iter()
            .map(|completion| completion.text().expect("CompletionOutput::text()"))
            .collect();
        outputs.push(texts);
    }

    Ok(json!(outputs))
}

/// Compare the "model" named in `task` against the currently cached model.
///
/// Returns [`ModelName::Same`] when the cached model's name matches, and
/// [`ModelName::Different`] (carrying the requested name) otherwise.
///
/// # Panic
/// This function panics if:
/// - `task` is not an object
/// - "model" key is missing from `task` object
/// - "model" value is not a str
fn get_model_name<M>(model: &M, task: &Value) -> ModelName
where
    M: std::ops::Deref<Target = Option<LLM>>,
{
    let requested = task
        .as_object()
        .expect("`task` is an object")
        .get("model")
        .expect("model key is present")
        .as_str()
        .expect("model value is a str");

    match model.as_ref() {
        Some(cached) if cached.model() == requested => ModelName::Same,
        _ => ModelName::Different(requested.to_string()),
    }
}

/// Outcome of comparing the task's requested model against the cached one.
enum ModelName {
    /// The cached model matches the requested name and can be reused.
    Same,
    /// No model is cached, or a different one is; carries the requested name.
    Different(String),
}

// Tear down vLLM's parallel state and release the model's resources.
// See https://github.com/vllm-project/vllm/issues/565#issuecomment-1725174811
fn destroy_model_parallel(llm: LLM) -> PyResult<()> {
    Python::with_gil(|py| {
        // vllm.model_executor.parallel_utils.parallel_state.destroy_model_parallel()
        let parallel_state = PyModule::import(py, "vllm")?
            .getattr("model_executor")?
            .getattr("parallel_utils")?
            .getattr("parallel_state")?;
        parallel_state.getattr("destroy_model_parallel")?.call0()?;
        // Drop the Rust-side handle, then force a Python GC pass so the
        // interpreter actually releases the model's memory.
        drop(llm);
        let gc = PyModule::import(py, "gc")?;
        gc.getattr("collect")?.call0()?;
        Ok(())
    })
}
Loading

[8]ページ先頭

©2009-2025 Movatter.jp