|
| 1 | +""" |
| 2 | +.. _l-example-benchmark-tree-implementation: |
| 3 | +
|
| 4 | +Benchmark of TreeEnsemble implementation |
| 5 | +======================================== |
| 6 | +
|
| 7 | +The following example compares the inference time between |
| 8 | +:epkg:`onnxruntime` and :class:`sklearn.ensemble.RandomForestRegressor`, |
| 9 | +fow different number of estimators, max depth, and parallelization. |
| 10 | +It does it for a fixed number of rows and features. |
| 11 | +
|
| 12 | +import and registration of necessary converters |
| 13 | +++++++++++++++++++++++++++++++++++++++++++++++++ |
| 14 | +""" |
| 15 | +importpickle |
| 16 | +importos |
| 17 | +importtime |
| 18 | +fromitertoolsimportproduct |
| 19 | + |
| 20 | +importmatplotlib.pyplotasplt |
| 21 | +importnumpy |
| 22 | +importpandas |
| 23 | +fromlightgbmimportLGBMRegressor |
| 24 | +fromonnxmltools.convert.lightgbm.operator_converters.LightGbmimportconvert_lightgbm |
| 25 | +fromonnxmltools.convert.xgboost.operator_converters.XGBoostimportconvert_xgboost |
| 26 | +fromonnxruntimeimportInferenceSession,SessionOptions |
| 27 | +frompsutilimportcpu_count |
| 28 | +frompyquickhelper.loghelperimportrun_cmd |
| 29 | +fromskl2onnximportto_onnx,update_registered_converter |
| 30 | +fromskl2onnx.common.shape_calculatorimportcalculate_linear_regressor_output_shapes |
| 31 | +fromsklearnimportset_config |
| 32 | +fromsklearn.ensembleimportRandomForestRegressor |
| 33 | +fromtqdmimporttqdm |
| 34 | +fromxgboostimportXGBRegressor |
| 35 | + |
| 36 | + |
| 37 | +defskl2onnx_convert_lightgbm(scope,operator,container): |
| 38 | +options=scope.get_options(operator.raw_operator) |
| 39 | +if"split"inoptions: |
| 40 | +operator.split=options["split"] |
| 41 | +else: |
| 42 | +operator.split=None |
| 43 | +convert_lightgbm(scope,operator,container) |
| 44 | + |
| 45 | + |
| 46 | +update_registered_converter( |
| 47 | +LGBMRegressor, |
| 48 | +"LightGbmLGBMRegressor", |
| 49 | +calculate_linear_regressor_output_shapes, |
| 50 | +skl2onnx_convert_lightgbm, |
| 51 | +options={"split":None}, |
| 52 | +) |
| 53 | +update_registered_converter( |
| 54 | +XGBRegressor, |
| 55 | +"XGBoostXGBRegressor", |
| 56 | +calculate_linear_regressor_output_shapes, |
| 57 | +convert_xgboost, |
| 58 | +) |
| 59 | + |
| 60 | +# The following instruction reduces the time spent by scikit-learn |
| 61 | +# to validate the data. |
| 62 | +set_config(assume_finite=True) |
| 63 | + |
| 64 | +########################################## |
| 65 | +# Machine details |
| 66 | +# +++++++++++++++ |
| 67 | + |
| 68 | + |
| 69 | +print(f"Number of cores:{cpu_count()}") |
| 70 | + |
| 71 | +############################################### |
| 72 | +# But this information is not usually enough. |
| 73 | +# Let's extract the cache information. |
| 74 | + |
| 75 | +try: |
| 76 | +out,err=run_cmd("lscpu") |
| 77 | +print(out) |
| 78 | +exceptExceptionase: |
| 79 | +print(f"lscpu not available:{e}") |
| 80 | + |
| 81 | +############################################### |
| 82 | +# Or with the following command. |
| 83 | +out,err=run_cmd("cat /proc/cpuinfo") |
| 84 | + |
| 85 | +############################################### |
| 86 | +# Fonction to measure inference time |
| 87 | +# ++++++++++++++++++++++++++++++++++ |
| 88 | + |
| 89 | + |
| 90 | +defmeasure_inference(fct,X,repeat,max_time=5,quantile=1): |
| 91 | +""" |
| 92 | + Run *repeat* times the same function on data *X*. |
| 93 | +
|
| 94 | + :param fct: fonction to run |
| 95 | + :param X: data |
| 96 | + :param repeat: number of times to run |
| 97 | + :param max_time: maximum time to use to measure the inference |
| 98 | + :return: number of runs, sum of the time, average, median |
| 99 | + """ |
| 100 | +times= [] |
| 101 | +forninrange(repeat): |
| 102 | +perf=time.perf_counter() |
| 103 | +fct(X) |
| 104 | +delta=time.perf_counter()-perf |
| 105 | +times.append(delta) |
| 106 | +iflen(times)<3: |
| 107 | +continue |
| 108 | +ifmax_timeisnotNoneandsum(times)>=max_time: |
| 109 | +break |
| 110 | +times.sort() |
| 111 | +quantile=0if (len(times)-quantile*2)<3elsequantile |
| 112 | +ifquantile==0: |
| 113 | +tt=times |
| 114 | +else: |
| 115 | +tt=times[quantile:-quantile] |
| 116 | +return (len(times),sum(times),sum(tt)/len(tt),times[len(times)//2]) |
| 117 | + |
| 118 | + |
| 119 | +############################################### |
| 120 | +# Benchmark |
| 121 | +# +++++++++ |
| 122 | +# |
| 123 | +# The following script benchmarks the inference for the same |
| 124 | +# model for a random forest and onnxruntime after it was converted |
| 125 | +# into ONNX and for the following configurations. |
| 126 | + |
| 127 | +legend="parallel-batch-4096-block" |
| 128 | + |
| 129 | +small=cpu_count()<12 |
| 130 | +ifsmall: |
| 131 | +N=1000 |
| 132 | +n_features=10 |
| 133 | +n_jobs= [1,cpu_count()//2,cpu_count()] |
| 134 | +n_ests= [10,20,30] |
| 135 | +depth= [4,6,8,10] |
| 136 | +Regressor=RandomForestRegressor |
| 137 | +else: |
| 138 | +N=100000 |
| 139 | +n_features=50 |
| 140 | +n_jobs= [cpu_count(),cpu_count()//2,1] |
| 141 | +n_ests= [100,200,400] |
| 142 | +depth= [6,8,10,12,14] |
| 143 | +Regressor=RandomForestRegressor |
| 144 | + |
| 145 | +# avoid duplicates on machine with 1 or 2 cores. |
| 146 | +n_jobs=list(sorted(set(n_jobs),reverse=True)) |
| 147 | + |
| 148 | +############################################## |
| 149 | +# Benchmark parameters |
| 150 | +repeat=7# repeat n times the same inference |
| 151 | +quantile=1# exclude extreme times |
| 152 | +max_time=5# maximum number of seconds to spend on one configuration |
| 153 | + |
| 154 | +############################################## |
| 155 | +# Data |
| 156 | + |
| 157 | + |
| 158 | +X=numpy.random.randn(N,n_features).astype(numpy.float32) |
| 159 | +noise= (numpy.random.randn(X.shape[0])/ (n_features//5)).astype(numpy.float32) |
| 160 | +y=X.mean(axis=1)+noise |
| 161 | +n_train=min(N,N//3) |
| 162 | + |
| 163 | + |
| 164 | +data= [] |
| 165 | +couples=list(product(n_jobs,depth,n_ests)) |
| 166 | +bar=tqdm(couples) |
| 167 | +cache_dir="_cache" |
| 168 | +ifnotos.path.exists(cache_dir): |
| 169 | +os.mkdir(cache_dir) |
| 170 | + |
| 171 | +forn_j,max_depth,n_estimatorsinbar: |
| 172 | +ifn_j==1andn_estimators>n_ests[0]: |
| 173 | +# skipping |
| 174 | +continue |
| 175 | + |
| 176 | +# parallelization |
| 177 | +cache_name=os.path.join( |
| 178 | +cache_dir,f"rf-J-{n_j}-E-{n_estimators}-D-{max_depth}.pkl" |
| 179 | + ) |
| 180 | +ifos.path.exists(cache_name): |
| 181 | +withopen(cache_name,"rb")asf: |
| 182 | +rf=pickle.load(f) |
| 183 | +else: |
| 184 | +bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} train rf") |
| 185 | +ifn_j==1andissubclass(Regressor,RandomForestRegressor): |
| 186 | +rf=Regressor(max_depth=max_depth,n_estimators=n_estimators,n_jobs=-1) |
| 187 | +rf.fit(X[:n_train],y[:n_train]) |
| 188 | +rf.n_jobs=1 |
| 189 | +else: |
| 190 | +rf=Regressor(max_depth=max_depth,n_estimators=n_estimators,n_jobs=n_j) |
| 191 | +rf.fit(X[:n_train],y[:n_train]) |
| 192 | +withopen(cache_name,"wb")asf: |
| 193 | +pickle.dump(rf,f) |
| 194 | + |
| 195 | +bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} ISession") |
| 196 | +so=SessionOptions() |
| 197 | +so.intra_op_num_threads=n_j |
| 198 | +cache_name=os.path.join( |
| 199 | +cache_dir,f"rf-J-{n_j}-E-{n_estimators}-D-{max_depth}.onnx" |
| 200 | + ) |
| 201 | +ifos.path.exists(cache_name): |
| 202 | +sess=InferenceSession(cache_name,so) |
| 203 | +else: |
| 204 | +bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} cvt onnx") |
| 205 | +onx=to_onnx(rf,X[:1]) |
| 206 | +withopen(cache_name,"wb")asf: |
| 207 | +f.write(onx.SerializeToString()) |
| 208 | +sess=InferenceSession(cache_name,so) |
| 209 | +onx_size=os.stat(cache_name).st_size |
| 210 | + |
| 211 | +# run once to avoid counting the first run |
| 212 | +bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predict1") |
| 213 | +rf.predict(X) |
| 214 | +sess.run(None, {"X":X}) |
| 215 | + |
| 216 | +# fixed data |
| 217 | +obs=dict( |
| 218 | +n_jobs=n_j, |
| 219 | +max_depth=max_depth, |
| 220 | +n_estimators=n_estimators, |
| 221 | +repeat=repeat, |
| 222 | +max_time=max_time, |
| 223 | +name=rf.__class__.__name__, |
| 224 | +n_rows=X.shape[0], |
| 225 | +n_features=X.shape[1], |
| 226 | +onnx_size=onx_size, |
| 227 | + ) |
| 228 | + |
| 229 | +# baseline |
| 230 | +bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predictB") |
| 231 | +r,t,mean,med=measure_inference(rf.predict,X,repeat=repeat,max_time=max_time) |
| 232 | +o1=obs.copy() |
| 233 | +o1.update(dict(avg=mean,med=med,n_runs=r,ttime=t,name="base")) |
| 234 | +data.append(o1) |
| 235 | + |
| 236 | +# baseline |
| 237 | +bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predictO") |
| 238 | +r,t,mean,med=measure_inference( |
| 239 | +lambdax:sess.run(None, {"X":x}),X,repeat=repeat,max_time=max_time |
| 240 | + ) |
| 241 | +o2=obs.copy() |
| 242 | +o2.update(dict(avg=mean,med=med,n_runs=r,ttime=t,name="ort_")) |
| 243 | +data.append(o2) |
| 244 | + |
| 245 | + |
| 246 | +################################################### |
| 247 | +# Saving data |
| 248 | +# +++++++++++ |
| 249 | + |
| 250 | +name=os.path.join(cache_dir,"plot_beanchmark_rf") |
| 251 | +print(f"Saving data into{name!r}") |
| 252 | + |
| 253 | +df=pandas.DataFrame(data) |
| 254 | +df2=df.copy() |
| 255 | +df2["legend"]=legend |
| 256 | +df2.to_csv(f"{name}-{legend}.csv",index=False) |
| 257 | + |
| 258 | +####################################################### |
| 259 | +# Printing the data |
| 260 | +print(df) |
| 261 | + |
| 262 | +##################################################### |
| 263 | +# Plot |
| 264 | +# ++++ |
| 265 | + |
| 266 | +n_rows=len(n_jobs) |
| 267 | +n_cols=len(n_ests) |
| 268 | + |
| 269 | + |
| 270 | +fig,axes=plt.subplots(n_rows,n_cols,figsize=(4*n_cols,4*n_rows)) |
| 271 | +fig.suptitle(f"{rf.__class__.__name__}") |
| 272 | + |
| 273 | +forn_j,n_estimatorsintqdm(product(n_jobs,n_ests)): |
| 274 | +i=n_jobs.index(n_j) |
| 275 | +j=n_ests.index(n_estimators) |
| 276 | +ax=axes[i,j] |
| 277 | + |
| 278 | +subdf=df[(df.n_estimators==n_estimators)& (df.n_jobs==n_j)] |
| 279 | +ifsubdf.shape[0]==0: |
| 280 | +continue |
| 281 | +piv=subdf.pivot(index="max_depth",columns="name",values=["avg","med"]) |
| 282 | +piv.plot(ax=ax,title=f"jobs={n_j}, trees={n_estimators}") |
| 283 | +ax.set_ylabel(f"n_jobs={n_j}",fontsize="small") |
| 284 | +ax.set_xlabel("max_depth",fontsize="small") |
| 285 | + |
| 286 | +# ratio |
| 287 | +ax2=ax.twinx() |
| 288 | +piv1=subdf.pivot(index="max_depth",columns="name",values="avg") |
| 289 | +piv1["speedup"]=piv1.base/piv1.ort_ |
| 290 | +ax2.plot(piv1.index,piv1.speedup,"b--",label="speedup avg") |
| 291 | + |
| 292 | +piv1=subdf.pivot(index="max_depth",columns="name",values="med") |
| 293 | +piv1["speedup"]=piv1.base/piv1.ort_ |
| 294 | +ax2.plot(piv1.index,piv1.speedup,"y--",label="speedup med") |
| 295 | +ax2.legend(fontsize="x-small") |
| 296 | + |
| 297 | +# 1 |
| 298 | +ax2.plot(piv1.index, [1for_inpiv1.index],"k--",label="no speedup") |
| 299 | + |
| 300 | +foriinrange(axes.shape[0]): |
| 301 | +forjinrange(axes.shape[1]): |
| 302 | +axes[i,j].legend(fontsize="small") |
| 303 | + |
| 304 | +fig.tight_layout() |
| 305 | +fig.savefig(f"{name}-{legend}.png") |
| 306 | +# plt.show() |