NotificationsYou must be signed in to change notification settings
Fork328
Star6.4k

Commit458008a

Montana Low

committed

add a new example

1 parent415f2e1 commit458008aCopy full SHA for 458008a

File tree

5 files changed

+251

-51

lines changed

examples/digits
- run.sql
pgml/pgml
- datasets.py
- model.py
sql
- install.sql
- test.sql

5 files changed

+251

-51

lines changed

`‎examples/digits/run.sql`

Lines changed: 42 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,42 @@`
	`1`	`+-- This example trains models on the sklearn digits dataset`
	`2`	`+-- which is a copy of the test set of the UCI ML hand-written digits datasets`
	`3`	`+-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits`
	`4`	`+--`
	`5`	`+-- The final result after a few seconds of training is not terrible. Maybe not perfect`
	`6`	`+-- enough for mission critical applications, but it's telling how quickly "off the shelf"`
	`7`	`+-- solutions can solve problems these days.`
	`8`	`+SELECTpgml.load_dataset('digits');`
	`9`	`+`
	`10`	`+-- view the dataset`
	`11`	`+SELECT*frompgml.digits;`
	`12`	`+`
	`13`	`+-- train a simple model to classify the data`
	`14`	`+SELECTpgml.train('Handwritten Digit Image Classifier','classification','pgml.digits','target');`
	`15`	`+`
	`16`	`+-- check out the predictions`
	`17`	`+SELECT target,pgml.predict('Handwritten Digit Image Classifier', image)AS prediction`
	`18`	`+FROMpgml.digits`
	`19`	`+LIMIT10;`
	`20`	`+`
	`21`	`+-- -- train some more models with different algorithms`
	`22`	`+SELECTpgml.train('Handwritten Digit Image Classifier','classification','pgml.digits','target','svm');`
	`23`	`+SELECTpgml.train('Handwritten Digit Image Classifier','classification','pgml.digits','target','random_forest');`
	`24`	`+SELECTpgml.train('Handwritten Digit Image Classifier','classification','pgml.digits','target','gradient_boosting_trees');`
	`25`	`+-- TODO SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'dense_neural_network');`
	`26`	`+-- -- check out all that hard work`
	`27`	`+SELECT*FROMpgml.trained_models;`
	`28`	`+`
	`29`	`+-- deploy the random_forest model for prediction use`
	`30`	`+SELECTpgml.deploy('Handwritten Digit Image Classifier','random_forest');`
	`31`	`+-- check out that throughput`
	`32`	`+SELECT*FROMpgml.deployed_models;`
	`33`	`+`
	`34`	`+-- do some hyper param tuning`
	`35`	`+-- TODO SELECT pgml.hypertune(100, 'Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'gradient_boosting_trees');`
	`36`	`+-- deploy the "best" model for prediction use`
	`37`	`+SELECTpgml.deploy('Handwritten Digit Image Classifier','best_fit');`
	`38`	`+`
	`39`	`+-- check out the improved predictions`
	`40`	`+SELECT target,pgml.predict('Handwritten Digit Image Classifier', image)AS prediction`
	`41`	`+FROMpgml.digits`
	`42`	`+LIMIT10;`

`‎pgml/pgml/datasets.py`

Lines changed: 21 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,21 @@`
	`1`	`+importplpy`
	`2`	`+fromsklearn.datasetsimportload_digitsasd`
	`3`	`+`
	`4`	`+frompgml.sqlimportq`
	`5`	`+frompgml.exceptionsimportPgMLException`
	`6`	`+`
	`7`	`+defload(source:str):`
	`8`	`+ifsource=="digits":`
	`9`	`+load_digits()`
	`10`	`+else:`
	`11`	`+raisePgMLException(f"Invalid dataset name:{source}. Valid values are ['digits'].")`
	`12`	`+return"OK"`
	`13`	`+`
	`14`	`+defload_digits():`
	`15`	`+dataset=d()`
	`16`	`+a=plpy.execute("DROP TABLE IF EXISTS pgml.digits")`
	`17`	`+a=plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")`
	`18`	`+a=plpy.execute(f"""COMMENT ON TABLE pgml.digits IS{q(dataset["DESCR"])}""")`
	`19`	`+forX,yinzip(dataset["data"],dataset["target"]):`
	`20`	`+X=",".join("%i"%xforxinlist(X))`
	`21`	`+plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}',{y})""")`

`‎pgml/pgml/model.py`

Lines changed: 107 additions & 34 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,14 +1,23 @@`
	`1`	`+fromreimportM`
`1`	`2`	`importplpy`
`2`		`-fromsklearn.linear_modelimportLinearRegression`
`3`		`-fromsklearn.ensembleimportRandomForestRegressor,RandomForestClassifier`
	`3`	`+fromsklearn.linear_modelimportLinearRegression,LogisticRegression`
	`4`	`+fromsklearn.svmimportSVR,SVC`
	`5`	`+fromsklearn.ensembleimportRandomForestRegressor,RandomForestClassifier,GradientBoostingRegressor,GradientBoostingClassifier`
`4`	`6`	`fromsklearn.model_selectionimporttrain_test_split`
`5`		`-fromsklearn.metricsimportmean_squared_error,r2_score`
	`7`	`+fromsklearn.metricsimportmean_squared_error,r2_score,f1_score,precision_score,recall_score`
`6`	`8`
`7`	`9`	`importpickle`
	`10`	`+importjson`
`8`	`11`
`9`	`12`	`frompgml.exceptionsimportPgMLException`
`10`	`13`	`frompgml.sqlimportq`
`11`	`14`
	`15`	`+defflatten(S):`
	`16`	`+ifS== []:`
	`17`	`+returnS`
	`18`	`+ifisinstance(S[0],list):`
	`19`	`+returnflatten(S[0])+flatten(S[1:])`
	`20`	`+returnS[:1]+flatten(S[1:])`
`12`	`21`
`13`	`22`	`classProject(object):`
`14`	`23`	`"""`
`@@ -124,6 +133,14 @@ def deployed_model(self):`
`124`	`133`	`self._deployed_model=Model.find_deployed(self.id)`
`125`	`134`	`returnself._deployed_model`
`126`	`135`
	`136`	`+defdeploy(self,algorithm_name):`
	`137`	`+model=None`
	`138`	`+ifalgorithm_name=="best_fit":`
	`139`	`+model=Model.find_by_project_and_best_fit(self)`
	`140`	`+else:`
	`141`	`+model=Model.find_by_project_id_and_algorithm_name(self.id,algorithm_name)`
	`142`	`+model.deploy()`
	`143`	`+returnmodel`
`127`	`144`
`128`	`145`	`classSnapshot(object):`
`129`	`146`	`"""`
`@@ -178,7 +195,7 @@ def create(`
`178`	`195`	`plpy.execute(`
`179`	`196`	`f"""`
`180`	`197`	`CREATE TABLE pgml."snapshot_{snapshot.id}" AS`
`181`		`- SELECT * FROM"{snapshot.relation_name}";`
	`198`	`+ SELECT * FROM{snapshot.relation_name};`
`182`	`199`	`"""`
`183`	`200`	`)`
`184`	`201`	`snapshot.__dict__=dict(`
`@@ -232,6 +249,7 @@ def data(self):`
`232`	`249`	`forcolumnincolumns:`
`233`	`250`	`x_.append(row[column])`
`234`	`251`
	`252`	`+x_=flatten(x_)# TODO be smart about flattening X depending on algorithm`
`235`	`253`	`X.append(x_)`
`236`	`254`	`y.append(y_)`
`237`	`255`
`@@ -262,8 +280,7 @@ class Model(object):`
`262`	`280`	`status (str): The current status of the model, e.g. 'new', 'training' or 'successful'`
`263`	`281`	`created_at (Timestamp): when this model was created`
`264`	`282`	`updated_at (Timestamp): when this model was last updated`
`265`		`- mean_squared_error (float):`
`266`		`- r2_score (float):`
	`283`	`+ metrics (dict): key performance indicators for the model`
`267`	`284`	`pickle (bytes): the serialized version of the model parameters`
`268`	`285`	`algorithm: the in memory version of the model parameters that can make predictions`
`269`	`286`	`"""`
`@@ -320,6 +337,63 @@ def find_deployed(cls, project_id: int):`
`320`	`337`	`model.__init__()`
`321`	`338`	`returnmodel`
`322`	`339`
	`340`	`+@classmethod`
	`341`	`+deffind_by_project_id_and_algorithm_name(cls,project_id:int,algorithm_name:str):`
	`342`	`+"""`
	`343`	`+ Args:`
	`344`	`+ project_id (int): The project id`
	`345`	`+ algorithm_name (str): The algorithm`
	`346`	`+ Returns:`
	`347`	`+ Model: most recently created model that fits the criteria`
	`348`	`+ """`
	`349`	`+result=plpy.execute(`
	`350`	`+f"""`
	`351`	`+ SELECT models.*`
	`352`	`+ FROM pgml.models`
	`353`	`+ WHERE algorithm_name ={q(algorithm_name)}`
	`354`	`+ AND project_id ={q(project_id)}`
	`355`	`+ ORDER by models.created_at DESC`
	`356`	`+ LIMIT 1`
	`357`	`+ """`
	`358`	`+ )`
	`359`	`+iflen(result)==0:`
	`360`	`+returnNone`
	`361`	`+`
	`362`	`+model=Model()`
	`363`	`+model.__dict__=dict(result[0])`
	`364`	`+model.__init__()`
	`365`	`+returnmodel`
	`366`	`+`
	`367`	`+@classmethod`
	`368`	`+deffind_by_project_and_best_fit(cls,project:Project):`
	`369`	`+"""`
	`370`	`+ Args:`
	`371`	`+ project (Project): The project`
	`372`	`+ Returns:`
	`373`	`+ Model: the model with the best metrics for the project`
	`374`	`+ """`
	`375`	`+ifproject.objective=="regression":`
	`376`	`+metric="mean_squared_error"`
	`377`	`+elifproject.objective=="classification":`
	`378`	`+metric="f1"`
	`379`	`+`
	`380`	`+result=plpy.execute(`
	`381`	`+f"""`
	`382`	`+ SELECT models.*`
	`383`	`+ FROM pgml.models`
	`384`	`+ WHERE project_id ={q(project.id)}`
	`385`	`+ ORDER by models.metrics->>{q(metric)} DESC`
	`386`	`+ LIMIT 1`
	`387`	`+ """`
	`388`	`+ )`
	`389`	`+iflen(result)==0:`
	`390`	`+returnNone`
	`391`	`+`
	`392`	`+model=Model()`
	`393`	`+model.__dict__=dict(result[0])`
	`394`	`+model.__init__()`
	`395`	`+returnmodel`
	`396`	`+`
`323`	`397`	`def__init__(self):`
`324`	`398`	`self._algorithm=None`
`325`	`399`	`self._project=None`
`@@ -342,8 +416,13 @@ def algorithm(self):`
`342`	`416`	`else:`
`343`	`417`	`self._algorithm= {`
`344`	`418`	`"linear_regression":LinearRegression,`
	`419`	`+"linear_classification":LogisticRegression,`
	`420`	`+"svm_regression":SVR,`
	`421`	`+"svm_classification":SVC,`
`345`	`422`	`"random_forest_regression":RandomForestRegressor,`
`346`	`423`	`"random_forest_classification":RandomForestClassifier,`
	`424`	`+"gradient_boosting_trees_regression":GradientBoostingRegressor,`
	`425`	`+"gradient_boosting_trees_classification":GradientBoostingClassifier,`
`347`	`426`	`}[self.algorithm_name+"_"+self.project.objective]()`
`348`	`427`
`349`	`428`	`returnself._algorithm`
`@@ -362,8 +441,14 @@ def fit(self, snapshot: Snapshot):`
`362`	`441`
`363`	`442`	`# Test`
`364`	`443`	`y_pred=self.algorithm.predict(X_test)`
`365`		`-msq=mean_squared_error(y_test,y_pred)`
`366`		`-r2=r2_score(y_test,y_pred)`
	`444`	`+metrics= {}`
	`445`	`+ifself.project.objective=="regression":`
	`446`	`+metrics["mean_squared_error"]=mean_squared_error(y_test,y_pred)`
	`447`	`+metrics["r2"]=r2_score(y_test,y_pred)`
	`448`	`+elifself.project.objective=="classification":`
	`449`	`+metrics["f1"]=f1_score(y_test,y_pred,average="weighted")`
	`450`	`+metrics["precision"]=precision_score(y_test,y_pred,average="weighted")`
	`451`	`+metrics["recall"]=recall_score(y_test,y_pred,average="weighted")`
`367`	`452`
`368`	`453`	`# Save the model`
`369`	`454`	`self.__dict__=dict(`
`@@ -372,8 +457,7 @@ def fit(self, snapshot: Snapshot):`
`372`	`457`	`UPDATE pgml.models`
`373`	`458`	`SET pickle = '\\x{pickle.dumps(self.algorithm).hex()}',`
`374`	`459`	`status = 'successful',`
`375`		`- mean_squared_error ={q(msq)},`
`376`		`- r2_score ={q(r2)}`
	`460`	`+ metrics ={q(json.dumps(metrics))}`
`377`	`461`	`WHERE id ={q(self.id)}`
`378`	`462`	`RETURNING *`
`379`	`463`	`"""`
`@@ -398,6 +482,7 @@ def predict(self, data: list):`
`398`	`482`	`Returns:`
`399`	`483`	`float or int: scores for regressions or ints for classifications`
`400`	`484`	`"""`
	`485`	`+# TODO: add metrics for tracking prediction volume/accuracy by model`
`401`	`486`	`returnself.algorithm.predict(data)`
`402`	`487`
`403`	`488`
`@@ -406,6 +491,7 @@ def train(`
`406`	`491`	`objective:str,`
`407`	`492`	`relation_name:str,`
`408`	`493`	`y_column_name:str,`
	`494`	`+algorithm_name:str="linear",`
`409`	`495`	`test_size:floatorint=0.1,`
`410`	`496`	`test_sampling:str="random",`
`411`	`497`	`):`
`@@ -416,15 +502,14 @@ def train(`
`416`	`502`	`objective (str): Defaults to "regression". Valid values are ["regression", "classification"].`
`417`	`503`	`relation_name (str): the table or view that stores the training data`
`418`	`504`	`y_column_name (str): the column in the training data that acts as the label`
`419`		`-algorithm (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "random_forest"].`
	`505`	`+algorithm_name (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "svm", "random_forest", "gradient_boosting_trees"].`
`420`	`506`	`test_size (float or int, optional): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If train_size is also None, it will be set to 0.25.`
`421`	`507`	`test_sampling: (str, optional): How to sample to create the test data. Defaults to "random". Valid values are ["first", "last", "random"].`
`422`	`508`	`"""`
`423`		`-ifobjective=="regression":`
`424`		`-algorithms= ["linear","random_forest"]`
`425`		`-elifobjective=="classification":`
`426`		`-algorithms= ["random_forest"]`
`427`		`-else:`
	`509`	`+ifalgorithm_nameisNone:`
	`510`	`+algorithm_name="linear"`
	`511`	`+`
	`512`	`+ifobjectivenotin ["regression","classification"]:`
`428`	`513`	`raisePgMLException(`
`429`	`514`	f"Unknown objective `{objective}`, available options are: regression, classification."
`430`	`515`	`)`
`@@ -440,23 +525,11 @@ def train(`
`440`	`525`	`)`
`441`	`526`
`442`	`527`	`snapshot=Snapshot.create(relation_name,y_column_name,test_size,test_sampling)`
`443`		`-deployed=Model.find_deployed(project.id)`
`444`		`-`
`445`		`-# Let's assume that the deployed model is better for now.`
`446`		`-best_model=deployed`
`447`		`-best_error=best_model.mean_squared_errorifbest_modelelseNone`
`448`		`-`
`449`		`-foralgorithm_nameinalgorithms:`
`450`		`-model=Model.create(project,snapshot,algorithm_name)`
`451`		`-model.fit(snapshot)`
	`528`	`+model=Model.create(project,snapshot,algorithm_name)`
	`529`	`+model.fit(snapshot)`
`452`	`530`
`453`		`-# Find the better model and deploy that.`
`454`		`-ifbest_errorisNoneormodel.mean_squared_error<best_error:`
`455`		`-best_error=model.mean_squared_error`
`456`		`-best_model=model`
`457`		`-`
`458`		`-ifdeployedanddeployed.id==best_model.id:`
`459`		`-return"rolled back"`
`460`		`-else:`
`461`		`-best_model.deploy()`
	`531`	`+ifproject.deployed_modelisNone:`
	`532`	`+model.deploy()`
`462`	`533`	`return"deployed"`
	`534`	`+else:`
	`535`	`+return"not deployed"`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit458008a

File tree

5 files changed

5 files changed

`‎examples/digits/run.sql`

`‎pgml/pgml/datasets.py`

`‎pgml/pgml/model.py`

0 commit comments