```r
## Load some required libraries
library(extraDistr)      # functions related to the inverse-gamma distribution
library(LaplacesDemon)   # functions related to the multivariate Student distribution
```

```
## Attaching package: 'LaplacesDemon'
## The following objects are masked from 'package:extraDistr':
##
##     dbern, dcat, ddirichlet, dgpd, dgpois, dinvchisq, dinvgamma,
##     dlaplace, dpareto, pbern, plaplace, ppareto, qbern, qcat, qlaplace,
##     qpareto, rbern, rcat, rdirichlet, rgpd, rinvchisq, rinvgamma,
##     rlaplace, rpareto
```

The `Auto` dataset of the `ISLR` library contains information on several characteristics of 392 vehicles. Specifically, we want to model the relationship between the efficiency of these cars, measured as the number of miles they are able to travel per gallon of fuel (variable `mpg`), and their power, quantified by the variable `horsepower`.
We begin by visualizing the relationship between the two variables.
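A minimal sketch of such a plot follows (it assumes the `ISLR` package is installed; the original document's graphical options may differ):

```r
library(ISLR)   # provides the Auto dataset

# Scatterplot of fuel efficiency against engine power
plot(mpg ~ horsepower, data = Auto,
     xlab = "horsepower", ylab = "mpg")
```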
The relationship between both variables is clearly nonlinear, possibly quadratic, so we will fit a (Bayesian) quadratic regression model to this data set to describe this relationship.
We first fit a regression model that describes a quadratic function of `horsepower`, following a frequentist approach and tools. Beforehand, we standardize `horsepower` in order to obtain more manageable estimates of the model coefficients and of their variability. Let's do it:
```r
# Standardization of horsepower
Auto$horse.std <- (Auto$horsepower - mean(Auto$horsepower)) / sd(Auto$horsepower)

# Quadratic fit
Fit1 <- lm(mpg ~ poly(horse.std, degree = 2, raw = TRUE), data = Auto)
summary(Fit1)
```

```
## Call:
## lm(formula = mpg ~ poly(horse.std, degree = 2, raw = TRUE), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                               21.6274     0.2852   75.83   <2e-16 ***
## poly(horse.std, degree = 2, raw = TRUE)1  -8.0478     0.2953  -27.25   <2e-16 ***
## poly(horse.std, degree = 2, raw = TRUE)2   1.8231     0.1809   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16
```

```r
# Plot of the fitted curve
plot(mpg ~ horse.std, data = Auto)
where <- seq(min(Auto$horse.std), max(Auto$horse.std), length = 100)
lines(where, predict(Fit1, data.frame(horse.std = where)), col = 2, lwd = 2)
```

The fitted curve adequately describes the relationship between both variables. Moreover, every term of the polynomial seems to have a significant effect; that is, all of them seem necessary to appropriately describe the variation of `mpg` as a function of `horse.std`.
Let us now fit the Bayesian version of the above model. We will use, for convenience, Jeffreys' prior, \(\pi(\boldsymbol{\beta},\sigma^2)\propto\sigma^{-2}\). Therefore, we have: \[\pi(\sigma^2\mid \mathcal{D})\sim IG(\sigma^2\mid (n-(p+1))/2,\, s_e^2/2)\]
```r
plot(seq(13, 25, by = 0.1),
     dinvgamma(seq(13, 25, by = 0.1), (nrow(Auto) - 3)/2, sum(residuals(Fit1)^2)/2),
     type = "l", ylab = "Posterior density", xlab = "sigma2")
```

This distribution yields a posterior mean of \((s^2_e/2)/((n-(p+1))/2-1)\) and a mode of \((s^2_e/2)/((n-(p+1))/2+1)\), that is:
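These quantities, together with the frequentist estimate \(s_e^2/(n-(p+1))\) for comparison, can be computed directly from the fitted model (a minimal sketch; the object names `n`, `p` and `s2.e` are our own):

```r
n <- nrow(Auto)                    # number of observations
p <- 2                             # polynomial terms, besides the intercept
s2.e <- sum(residuals(Fit1)^2)     # sum of squared residuals
(s2.e/2)/((n - (p + 1))/2 - 1)     # posterior mean of sigma2
(s2.e/2)/((n - (p + 1))/2 + 1)     # posterior mode of sigma2
s2.e/(n - (p + 1))                 # frequentist estimate of sigma2
```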
```
## [1] 19.23005
## [1] 19.03332
## [1] 19.13118
```

The Bayesian posterior point estimates are quite close to the frequentist estimate, although the posterior distribution of \(\sigma^2\) also allows us to explore the uncertainty about this parameter.
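For instance, since the posterior is a known inverse-gamma distribution, a 95% credible interval for \(\sigma^2\) follows directly from its quantile function (a sketch using `qinvgamma` from `extraDistr`, which is not among the functions masked by `LaplacesDemon`):

```r
# 95% credible interval for sigma2 from its inverse-gamma posterior
qinvgamma(c(0.025, 0.975),
          alpha = (nrow(Auto) - 3)/2,
          beta  = sum(residuals(Fit1)^2)/2)
```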
As mentioned in the theoretical session, \(\boldsymbol{\beta}\) has the following marginal posterior distribution: \[\pi(\boldsymbol{\beta}\mid \boldsymbol{y})\sim t_{n-(p+1)}\big(\hat{\boldsymbol{\beta}},\, s_e^2(\boldsymbol{X}'\boldsymbol{X})^{-1}/(n-(p+1))\big)\] The mean of this distribution coincides with the coefficients of the linear model `Fit1`, and its variance-covariance matrix is:
```r
X <- cbind(rep(1, dim(Auto)[1]), Auto$horse.std, Auto$horse.std^2)
VarCov <- solve(t(X) %*% X) * sum(residuals(Fit1)^2) / (nrow(Auto) - 3)
VarCov
```

```
##             [,1]        [,2]        [,3]
## [1,]  0.08134909  0.03529659 -0.03262829
## [2,]  0.03529659  0.08720960 -0.03538686
## [3,] -0.03262829 -0.03538686  0.03271174
```

The corresponding correlation matrix (as given, e.g., by `cov2cor(VarCov)`) shows the dependence between the coefficients:

```
##            [,1]       [,2]       [,3]
## [1,]  1.0000000  0.4190581 -0.6325082
## [2,]  0.4190581  1.0000000 -0.6625338
## [3,] -0.6325082 -0.6625338  1.0000000
```

Moreover, we can also plot the bivariate posterior distribution of each pair of coefficients, for example, the joint posterior distribution of the intercept and the linear term:
```r
# Joint posterior distribution of the coefficients of the intercept and the
# linear term
xlims <- c(Fit1$coefficients[1] - 3*sqrt(VarCov[1, 1]),
           Fit1$coefficients[1] + 3*sqrt(VarCov[1, 1]))
ylims <- c(Fit1$coefficients[2] - 3*sqrt(VarCov[2, 2]),
           Fit1$coefficients[2] + 3*sqrt(VarCov[2, 2]))
gridPoints <- expand.grid(seq(xlims[1], xlims[2], by = 0.01),
                          seq(ylims[1], ylims[2], by = 0.01))
resul <- matrix(dmvt(as.matrix(gridPoints), mu = Fit1$coefficients[1:2],
                     S = round(VarCov[1:2, 1:2], 5), df = nrow(Auto) - 3),
                nrow = length(seq(xlims[1], xlims[2], by = 0.01)))
image(x = seq(xlims[1], xlims[2], by = 0.01),
      y = seq(ylims[1], ylims[2], by = 0.01),
      z = resul, xlab = "Intercept", ylab = "beta, linear term",
      main = "Bivariate posterior density")
```

or the joint posterior distribution of the coefficients of the linear and the quadratic terms:
```r
# Plot the linear and quadratic component
xlims <- c(Fit1$coefficients[2] - 3*sqrt(VarCov[2, 2]),
           Fit1$coefficients[2] + 3*sqrt(VarCov[2, 2]))
ylims <- c(Fit1$coefficients[3] - 3*sqrt(VarCov[3, 3]),
           Fit1$coefficients[3] + 3*sqrt(VarCov[3, 3]))
gridPoints <- expand.grid(seq(xlims[1], xlims[2], by = 0.01),
                          seq(ylims[1], ylims[2], by = 0.01))
resul <- matrix(dmvt(as.matrix(gridPoints), mu = Fit1$coefficients[c(2, 3)],
                     S = round(VarCov[c(2, 3), c(2, 3)], 5), df = nrow(Auto) - 3),
                nrow = length(seq(xlims[1], xlims[2], by = 0.01)))
image(x = seq(xlims[1], xlims[2], by = 0.01),
      y = seq(ylims[1], ylims[2], by = 0.01),
      z = resul, xlab = "beta, linear term", ylab = "beta, quadratic term",
      main = "Bivariate posterior density")
```

That is, all three parameters are mutually dependent. Furthermore, we can also quantify the probability that each of these parameters is negative:
```r
Probs <- vector()
for (i in 1:3) {
  Probs[i] <- pst(0, mu = Fit1$coefficients[i], sigma = sqrt(VarCov[i, i]),
                  nu = nrow(Auto) - 3)
}
Probs
```

```
## [1] 1.913574e-235  1.000000e+00  1.098170e-21
```

These probabilities may be used as Bayesian measures of evidence of the contribution of each variable to the fit. Values of these probabilities close to 0 or 1 point to the appropriateness of keeping that component in the model.
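In the same spirit, marginal 95% credible intervals for each coefficient can be obtained from the quantile function of its marginal Student t posterior (a sketch using `qst` from `LaplacesDemon`; the object `CredInt` is our own):

```r
# Marginal 95% credible intervals for the three coefficients
CredInt <- matrix(nrow = 3, ncol = 2,
                  dimnames = list(c("Intercept", "linear", "quadratic"),
                                  c("2.5%", "97.5%")))
for (i in 1:3) {
  CredInt[i, ] <- qst(c(0.025, 0.975), mu = Fit1$coefficients[i],
                      sigma = sqrt(VarCov[i, i]), nu = nrow(Auto) - 3)
}
CredInt
```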
Finally, the posterior mean of every component of \(\boldsymbol{\beta}\) coincides with the frequentist estimate of that coefficient. Therefore, the point estimate (posterior mean) of the quadratic function resulting from the Bayesian analysis exactly coincides with the corresponding curve from the frequentist analysis.
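Unlike the frequentist fit, however, the Bayesian analysis also describes the uncertainty around that curve, for instance by drawing coefficient vectors from their joint Student t posterior and plotting the corresponding curves (a sketch using `rmvt` from `LaplacesDemon`; the number of draws, 50, is arbitrary):

```r
# Draw curves from the joint posterior of the coefficients
set.seed(1)
betas <- rmvt(n = 50, mu = Fit1$coefficients, S = VarCov, df = nrow(Auto) - 3)
plot(mpg ~ horse.std, data = Auto)
where <- seq(min(Auto$horse.std), max(Auto$horse.std), length = 100)
for (i in 1:50) {
  lines(where, betas[i, 1] + betas[i, 2]*where + betas[i, 3]*where^2,
        col = "grey")
}
# Posterior mean curve, identical to the frequentist fit
lines(where, predict(Fit1, data.frame(horse.std = where)), col = 2, lwd = 2)
```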
We propose below an individual exercise aimed at consolidating the basic concepts that we learned in the previous theoretical session and have been practicing here.
Exercise
The data set `Weights`, included in the `VIBASS` package, is the data set used for the example in the theoretical session. It contains a categorical variable, `race`, recording the race of each child in the study. Fit a Bayesian linear regression model (ANCOVA) that explains each child's weight as a function of age, race and the interaction of these factors. Explore the posterior distribution of the coefficients of the model and thereby assess their relevance in the regression model.