Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

fix and test preprocessing examples#1520

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
montanalow merged 2 commits into master from montana/sd
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ubuntu-packages-and-docker-image.yml
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -4,7 +4,7 @@ on:
workflow_dispatch:
inputs:
packageVersion:
default: "2.8.2"
default: "2.9.1"
jobs:
#
# PostgresML extension.
Expand Down
2 changes: 1 addition & 1 deletion pgml-cms/docs/resources/developer-docs/contributing.md
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -127,7 +127,7 @@ SELECT pgml.version();
postgres=# select pgml.version();
version
-------------------
2.7.4
2.9.1
(1 row)
```
{% endtab %}
Expand Down
2 changes: 1 addition & 1 deletion pgml-cms/docs/resources/developer-docs/installation.md
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -132,7 +132,7 @@ CREATE EXTENSION
pgml_test=# SELECT pgml.version();
version
---------
2.7.4
2.9.1
(1 row)
```

Expand Down
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -80,7 +80,7 @@ Time: 41.520 ms
postgresml=# SELECT pgml.version();
version
---------
2.7.13
2.9.1
(1 row)
```

Expand Down
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -115,6 +115,6 @@ Type "help" for help.
postgresml=> SELECT pgml.version();
version
---------
2.7.9
2.9.1
(1 row)
```
2 changes: 1 addition & 1 deletion pgml-extension/Cargo.lock
View file
Open in desktop

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pgml-extension/Cargo.toml
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
[package]
name = "pgml"
version = "2.9.0"
version = "2.9.1"
edition = "2021"

[lib]
Expand Down
33 changes: 33 additions & 0 deletions pgml-extension/examples/preprocessing.sql
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
-- Example: preprocessing text categorical variables before training.
-- Load the diamonds dataset, which contains text categorical variables.
SELECT pgml.load_dataset('jdxcosta/diamonds');

-- View a sample of the data.
SELECT * FROM pgml."jdxcosta/diamonds" LIMIT 10;

-- Drop the "Unnamed: 0" index column, since it's not useful for training (you could create a view instead).
ALTER TABLE pgml."jdxcosta/diamonds" DROP COLUMN "Unnamed: 0";

-- Train a model using preprocessors to scale the numeric variables, and target encode the categoricals.
SELECT pgml.train(
project_name => 'Diamond prices',
task => 'regression',
relation_name => 'pgml.jdxcosta/diamonds',
y_column_name => 'price',
algorithm => 'lightgbm',
preprocess => '{
"carat": {"scale": "standard"},
"depth": {"scale": "standard"},
"table": {"scale": "standard"},
"cut": {"encode": "target", "scale": "standard"},
"color": {"encode": "target", "scale": "standard"},
"clarity": {"encode": "target", "scale": "standard"}
}'
);

-- Run some predictions. Notice we're passing a heterogeneous row (tuple) as input,
-- rather than a homogeneous ARRAY[].
SELECT price, pgml.predict('Diamond prices', (carat, cut, color, clarity, depth, "table", x, y, z)) AS prediction
FROM pgml."jdxcosta/diamonds"
LIMIT 10;

-- This is a difficult dataset for most algorithms, which makes it a good challenge for preprocessing and additional
-- feature engineering. What's next?
View file
Open in desktop
Empty file.
7 changes: 4 additions & 3 deletions pgml-extension/src/bindings/transformers/mod.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -380,7 +380,7 @@ pub fn load_dataset(
.ok_or(anyhow!("dataset `data` key is not an object"))?;
let column_names = types
.iter()
.map(|(name, _type)|name.clone())
.map(|(name, _type)|format!("\"{}\"", name))
.collect::<Vec<String>>()
.join(", ");
let column_types = types
Expand All@@ -393,13 +393,14 @@ pub fn load_dataset(
"int64" => "INT8",
"int32" => "INT4",
"int16" => "INT2",
"int8" => "INT2",
"float64" => "FLOAT8",
"float32" => "FLOAT4",
"float16" => "FLOAT4",
"bool" => "BOOLEAN",
_ => bail!("unhandled dataset feature while reading dataset: {type_}"),
};
Ok(format!("{name} {type_}"))
Ok(format!("\"{name}\" {type_}"))
})
.collect::<Result<Vec<String>>>()?
.join(", ");
Expand DownExpand Up@@ -455,7 +456,7 @@ pub fn load_dataset(
.into_datum(),
)),
"dict" | "list" => row.push((PgBuiltInOids::JSONBOID.oid(), JsonB(value.clone()).into_datum())),
"int64" | "int32" | "int16" => row.push((
"int64" | "int32" | "int16"| "int8"=> row.push((
PgBuiltInOids::INT8OID.oid(),
value
.as_i64()
Expand Down
13 changes: 6 additions & 7 deletions pgml-extension/src/orm/model.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -344,13 +344,12 @@ impl Model {
).unwrap().first();

if !result.is_empty() {
let project_id = result.get(2).unwrap().unwrap();
let project = Project::find(project_id).unwrap();
let snapshot_id = result.get(3).unwrap().unwrap();
let snapshot = Snapshot::find(snapshot_id).unwrap();
let algorithm = Algorithm::from_str(result.get(4).unwrap().unwrap()).unwrap();
let runtime = Runtime::from_str(result.get(5).unwrap().unwrap()).unwrap();

let project_id = result.get(2).unwrap().expect("project_id is i64");
let project = Project::find(project_id).expect("project doesn't exist");
let snapshot_id = result.get(3).unwrap().expect("snapshot_id is i64");
let snapshot = Snapshot::find(snapshot_id).expect("snapshot doesn't exist");
let algorithm = Algorithm::from_str(result.get(4).unwrap().unwrap()).expect("algorithm is malformed");
let runtime = Runtime::from_str(result.get(5).unwrap().unwrap()).expect("runtime is malformed");
let data = Spi::get_one_with_args::<Vec<u8>>(
"
SELECT data
Expand Down
4 changes: 2 additions & 2 deletions pgml-extension/src/orm/sampling.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -55,7 +55,7 @@ impl Sampling {
Sampling::stratified => {
format!(
"
SELECT*
SELECT{col_string}
FROM (
SELECT
*,
Expand DownExpand Up@@ -125,7 +125,7 @@ mod tests {
let columns = get_column_fixtures();
let sql = sampling.get_sql("my_table", columns);
let expected_sql = "
SELECT*
SELECT\"col1\", \"col2\"
FROM (
SELECT
*,
Expand Down
24 changes: 17 additions & 7 deletions pgml-extension/src/orm/snapshot.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -230,16 +230,24 @@ impl Column {
if self.preprocessor.encode == Encode::target {
let categories = self.statistics.categories.as_mut().unwrap();
let mut sums = vec![0_f32; categories.len() + 1];
let mut total = 0.;
Zip::from(array).and(target).for_each(|&value, &target| {
total += target;
sums[value as usize] += target;
});
let avg_target = total / categories.len() as f32;
for category in categories.values_mut() {
let sum = sums[category.value as usize];
category.value = sum / category.members as f32;
if category.members > 0 {
let sum = sums[category.value as usize];
category.value = sum / category.members as f32;
} else {
// use avg target for categories w/ no members, e.g. __NULL__ category in a complete dataset
category.value = avg_target;
}
}
}

// Data is filtered for NaN because it is not welldefined statistically, and they are counted as separate stat
// Data is filtered for NaN because it is not well-defined statistically, and they are counted as separate stat
let mut data = array
.iter()
.filter_map(|n| if n.is_nan() { None } else { Some(*n) })
Expand DownExpand Up@@ -404,7 +412,8 @@ impl Snapshot {
.first();
if !result.is_empty() {
let jsonb: JsonB = result.get(7).unwrap().unwrap();
let columns: Vec<Column> = serde_json::from_value(jsonb.0).unwrap();
let columns: Vec<Column> =
serde_json::from_value(jsonb.0).expect("invalid json description of columns");
// let jsonb: JsonB = result.get(8).unwrap();
// let analysis: Option<IndexMap<String, f32>> = Some(serde_json::from_value(jsonb.0).unwrap());
let mut s = Snapshot {
Expand DownExpand Up@@ -500,9 +509,10 @@ impl Snapshot {

let preprocessors: HashMap<String, Preprocessor> = serde_json::from_value(preprocess.0).expect("is valid");

let mut position = 0; // Postgres column positions are not updated when other columns are dropped, but we expect consecutive positions when we read the table.
Spi::connect(|mut client| {
let mut columns: Vec<Column> = Vec::new();
client.select("SELECT column_name::TEXT, udt_name::TEXT, is_nullable::BOOLEAN, ordinal_position::INTEGER FROM information_schema.columns WHERE table_schema = $1 AND table_name = $2 ORDER BY ordinal_position ASC",
client.select("SELECT column_name::TEXT, udt_name::TEXT, is_nullable::BOOLEAN FROM information_schema.columns WHERE table_schema = $1 AND table_name = $2 ORDER BY ordinal_position ASC",
None,
Some(vec![
(PgBuiltInOids::TEXTOID.oid(), schema_name.into_datum()),
Expand All@@ -520,7 +530,7 @@ impl Snapshot {
pg_type = pg_type[1..].to_string() + "[]";
}
let nullable = row[3].value::<bool>().unwrap().unwrap();
letposition= row[4].value::<i32>().unwrap().unwrap() as usize;
position+= 1;
let label = match y_column_name {
Some(ref y_column_name) => y_column_name.contains(&name),
None => false,
Expand DownExpand Up@@ -1158,7 +1168,7 @@ impl Snapshot {
pub fn numeric_encoded_dataset(&mut self) -> Dataset {
let mut data = None;
Spi::connect(|client| {
// PostgresArraysarrays are 1 indexed and so are SPI tuples...
// Postgres arrays are 1 indexed and so are SPI tuples...
let result = client.select(&self.select_sql(), None, None).unwrap();
let num_rows = result.len();
let (num_train_rows, num_test_rows) = self.train_test_split(num_rows);
Expand Down
1 change: 1 addition & 0 deletions pgml-extension/tests/test.sql
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -30,5 +30,6 @@ SELECT pgml.load_dataset('wine');
\i examples/regression.sql
\i examples/vectors.sql
\i examples/chunking.sql
\i examples/preprocessing.sql
-- transformers are generally too slow to run in the test suite
--\i examples/transformers.sql
Loading

[8]ページ先頭

©2009-2025 Movatter.jp