Apr 18, 2022 · Apr 17, 2022
diff --git a/README.md b/README.md
 psql -f sql/test.sql
 ```

 One liner:
 ```
 cd pgml; sudo python3 setup.py install; cd ../; sudo -u postgres psql -f sql/test.sql
 ```

 Make sure to run it exactly like this, from the root directory of the repo.
diff --git a/examples/california_housing/run.sql b/examples/california_housing/run.sql
 -- This example trains models on the sklean california_housing dataset
 -- which is a copy of the test set from the StatLib repository
 -- https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
 --
 -- This demonstrates using a table with individual columns as features
 -- for regression.
 SELECT pgml.load_dataset('california_housing');

 -- view the dataset
 SELECT * from pgml.california_housing;

 -- train a simple model to classify the data
 SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target');

 -- check out the predictions
 SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
 FROM pgml.california_housing
 LIMIT 10;

 -- -- train some more models with different algorithms
 SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'svm');
 SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'random_forest');
 SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees');
 -- TODO SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'dense_neural_network');
 -- -- check out all that hard work
 SELECT * FROM pgml.trained_models;

 -- deploy the random_forest model for prediction use
 SELECT pgml.deploy('California Housing Prediction', 'random_forest');
 -- check out that throughput
 SELECT * FROM pgml.deployed_models;

 -- do some hyper param tuning
 -- TODO SELECT pgml.hypertune(100, 'California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosted_trees');
 -- deploy the "best" model for prediction use
 SELECT pgml.deploy('California Housing Prediction', 'best_fit');

 -- check out the improved predictions
 SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
 FROM pgml.california_housing
 LIMIT 10;
diff --git a/examples/digits/run.sql b/examples/digits/run.sql
 -- which is a copy of the test set of the UCI ML hand-written digits datasets
 -- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
 --
 -- This demonstrates using a table with a single array feature column
 -- for classification.
 --
 -- The final result after a few seconds of training is not terrible. Maybe not perfect
 -- enough for mission critical applications, but it's telling how quickly "off the shelf"
 -- solutions can solve problems these days.
diff --git a/pgml/pgml/datasets.py b/pgml/pgml/datasets.py
 import plpy
 from sklearn.datasets import load_digits as d
 import sklearn.datasets

 from pgml.sql import q
 from pgml.exceptions import PgMLException

 def load(source: str):
    if source == "digits":
        load_digits()
    elif source == "california_housing":
        load_california_housing()
    else:
        raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
    return "OK"

 def load_digits():
    dataset =d()
    dataset =sklearn.datasets.load_digits()
    a = plpy.execute("DROP TABLE IF EXISTS pgml.digits")
    a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
    a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
    for X, y in zip(dataset["data"], dataset["target"]):
        X = ",".join("%i" % x for x in list(X))
        plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")

 def load_california_housing():
    dataset = sklearn.datasets.fetch_california_housing()
    a = plpy.execute("DROP TABLE IF EXISTS pgml.california_housing")
    a = plpy.execute("""
        CREATE TABLE pgml.california_housing (
            median_income FLOAT4, -- median income in block group
            house_age FLOAT4, -- median house age in block group
            avg_rooms FLOAT4, -- average number of rooms per household
            avg_bedrooms FLOAT4, -- average number of bedrooms per household
            population FLOAT4, -- block group population
            avg_occupants FLOAT4, -- average number of household members
            latitude FLOAT4, -- block group latitude
            longitude FLOAT4, -- block group longitudetarget INTEGER
            target FLOAT4
        )""")
    a = plpy.execute(f"""COMMENT ON TABLE pgml.california_housing IS {q(dataset["DESCR"])}""")
    for X, y in zip(dataset["data"], dataset["target"]):
        plpy.execute(f"""
            INSERT INTO pgml.california_housing (median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude, target)
            VALUES ({q(X[0])}, {q(X[1])}, {q(X[2])}, {q(X[3])}, {q(X[4])}, {q(X[5])}, {q(X[6])}, {q(X[7])}, {q(y)})""")

diff --git a/sql/install.sql b/sql/install.sql
 ---
 --- Predict
 ---
 CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, featuresNUMERIC[])
 CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, featuresDOUBLE PRECISION[])
 RETURNS DOUBLE PRECISION
 AS $$
 from pgml.model import Project
Original file line number	Diff line number	Diff line change
Expand Up		@@ -211,4 +211,9 @@ Run the test:
		psql -f sql/test.sql
		```

		One liner:
		```
		cd pgml; sudo python3 setup.py install; cd ../; sudo -u postgres psql -f sql/test.sql
		```

		Make sure to run it exactly like this, from the root directory of the repo.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,41 @@
		-- This example trains models on the sklean california_housing dataset
		-- which is a copy of the test set from the StatLib repository
		-- https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
		--
		-- This demonstrates using a table with individual columns as features
		-- for regression.
		SELECT pgml.load_dataset('california_housing');

		-- view the dataset
		SELECT * from pgml.california_housing;

		-- train a simple model to classify the data
		SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target');

		-- check out the predictions
		SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
		FROM pgml.california_housing
		LIMIT 10;

		-- -- train some more models with different algorithms
		SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'svm');
		SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'random_forest');
		SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees');
		-- TODO SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'dense_neural_network');
		-- -- check out all that hard work
		SELECT * FROM pgml.trained_models;

		-- deploy the random_forest model for prediction use
		SELECT pgml.deploy('California Housing Prediction', 'random_forest');
		-- check out that throughput
		SELECT * FROM pgml.deployed_models;

		-- do some hyper param tuning
		-- TODO SELECT pgml.hypertune(100, 'California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosted_trees');
		-- deploy the "best" model for prediction use
		SELECT pgml.deploy('California Housing Prediction', 'best_fit');

		-- check out the improved predictions
		SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
		FROM pgml.california_housing
		LIMIT 10;
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,6 +2,9 @@
		-- which is a copy of the test set of the UCI ML hand-written digits datasets
		-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
		--
		-- This demonstrates using a table with a single array feature column
		-- for classification.
		--
		-- The final result after a few seconds of training is not terrible. Maybe not perfect
		-- enough for mission critical applications, but it's telling how quickly "off the shelf"
		-- solutions can solve problems these days.
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,21 +1,45 @@
		import plpy
		from sklearn.datasets import load_digits as d
		import sklearn.datasets

		from pgml.sql import q
		from pgml.exceptions import PgMLException

		def load(source: str):
		if source == "digits":
		load_digits()
		elif source == "california_housing":
		load_california_housing()
		else:
		raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
		return "OK"

		def load_digits():
		dataset =d()
		dataset =sklearn.datasets.load_digits()
		a = plpy.execute("DROP TABLE IF EXISTS pgml.digits")
		a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
		a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
		for X, y in zip(dataset["data"], dataset["target"]):
		X = ",".join("%i" % x for x in list(X))
		plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")

		def load_california_housing():
		dataset = sklearn.datasets.fetch_california_housing()
		a = plpy.execute("DROP TABLE IF EXISTS pgml.california_housing")
		a = plpy.execute("""
		CREATE TABLE pgml.california_housing (
		median_income FLOAT4, -- median income in block group
		house_age FLOAT4, -- median house age in block group
		avg_rooms FLOAT4, -- average number of rooms per household
		avg_bedrooms FLOAT4, -- average number of bedrooms per household
		population FLOAT4, -- block group population
		avg_occupants FLOAT4, -- average number of household members
		latitude FLOAT4, -- block group latitude
		longitude FLOAT4, -- block group longitudetarget INTEGER
		target FLOAT4
		)""")
		a = plpy.execute(f"""COMMENT ON TABLE pgml.california_housing IS {q(dataset["DESCR"])}""")
		for X, y in zip(dataset["data"], dataset["target"]):
		plpy.execute(f"""
		INSERT INTO pgml.california_housing (median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude, target)
		VALUES ({q(X[0])}, {q(X[1])}, {q(X[2])}, {q(X[3])}, {q(X[4])}, {q(X[5])}, {q(X[6])}, {q(X[7])}, {q(y)})""")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -136,7 +136,7 @@ $$ LANGUAGE plpython3u;
		---
		--- Predict
		---
		CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, featuresNUMERIC[])
		CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, featuresDOUBLE PRECISION[])
		RETURNS DOUBLE PRECISION
		AS $$
		from pgml.model import Project
Expand Down