Load the library and attach the Wage dataset; we will be using this dataset for all three of the guided labs.
library(ISLR)
attach(Wage)
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value
## (Intercept) 111.70361 0.7287409 153.283015
## poly(age, 4)1 447.06785 39.9147851 11.200558
## poly(age, 4)2 -478.31581 39.9147851 -11.983424
## poly(age, 4)3 125.52169 39.9147851 3.144742
## poly(age, 4)4 -77.91118 39.9147851 -1.951938
## Pr(>|t|)
## (Intercept) 0.00000000000000000000000000000000000000
## poly(age, 4)1 0.00000000000000000000000000014846042765
## poly(age, 4)2 0.00000000000000000000000000000002355831
## poly(age, 4)3 0.00167862178126833294644626448643975891
## poly(age, 4)4 0.05103864982784254988867900237892172299
fit2=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit2))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -184.1541797743 60.04037718327 -3.067172 0.0021802539
## poly(age, 4, raw = T)1 21.2455205321 5.88674824448 3.609042 0.0003123618
## poly(age, 4, raw = T)2 -0.5638593126 0.20610825640 -2.735743 0.0062606446
## poly(age, 4, raw = T)3 0.0068106877 0.00306593115 2.221409 0.0263977518
## poly(age, 4, raw = T)4 -0.0000320383 0.00001641359 -1.951938 0.0510386498
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
## (Intercept) age I(age^2) I(age^3) I(age^4)
## -184.1541797743 21.2455205321 -0.5638593126 0.0068106877 -0.0000320383
This code chunk accomplishes the same result as poly(age, 4, raw = TRUE), simply creating the polynomial basis functions directly with the wrapper function I() and the ^ operator.
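The I() wrapper is needed because ^ has a special meaning inside a model formula. A quick illustration (an aside, not part of the original lab):
# In a formula, age^2 denotes interaction crossing (age:age = age), so
# without I() the quadratic term is silently dropped:
coef(lm(wage~age+age^2,data=Wage)) # same coefficients as lm(wage~age)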
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
summary(fit2b)
##
## Call:
## lm(formula = wage ~ cbind(age, age^2, age^3, age^4), data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.707 -24.626 -4.993 15.217 203.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -184.15417977 60.04037718 -3.067 0.002180
## cbind(age, age^2, age^3, age^4)age 21.24552053 5.88674824 3.609 0.000312
## cbind(age, age^2, age^3, age^4) -0.56385931 0.20610826 -2.736 0.006261
## cbind(age, age^2, age^3, age^4) 0.00681069 0.00306593 2.221 0.026398
## cbind(age, age^2, age^3, age^4) -0.00003204 0.00001641 -1.952 0.051039
##
## (Intercept) **
## cbind(age, age^2, age^3, age^4)age ***
## cbind(age, age^2, age^3, age^4) **
## cbind(age, age^2, age^3, age^4) *
## cbind(age, age^2, age^3, age^4) .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared: 0.08626, Adjusted R-squared: 0.08504
## F-statistic: 70.69 on 4 and 2995 DF, p-value: < 0.00000000000000022
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1),mar=c(4.5, 4.5, 1, 1),oma=c(2,2,2,2))
plot(age,wage,xlim=agelims,cex=.5, col="darkgrey")
title ("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
preds2=predict(fit2,newdata=list(age=age.grid),se=TRUE)
max(abs(preds$fit-preds2$fit))
## [1] 0.0000000000781597
fit.1= lm(wage~age,data=Wage)
fit.2= lm(wage~poly(age,2),data=Wage)
fit.3= lm(wage~poly(age,3),data=Wage)
fit.4= lm(wage~poly(age,4),data=Wage)
fit.5= lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
|   | Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
|---|---|---|---|---|---|---|
| 1 | 2998 | 5022216 | NA | NA | NA | NA |
| 2 | 2997 | 4793430 | 1 | 228786.010 | 143.5931074 | 0.00000000000000000000000000000002367734 |
| 3 | 2996 | 4777674 | 1 | 15755.694 | 9.8887559 | 0.00167921282630841234122243132986795899 |
| 4 | 2995 | 4771604 | 1 | 6070.152 | 3.8098134 | 0.05104623133270855955823108729418891016 |
| 5 | 2994 | 4770322 | 1 | 1282.563 | 0.8049758 | 0.36968196597431635108677028256352059543 |
coef(summary(fit.5))
## Estimate Std. Error t value
## (Intercept) 111.70361 0.7287647 153.2780243
## poly(age, 5)1 447.06785 39.9160847 11.2001930
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341
## poly(age, 5)3 125.52169 39.9160847 3.1446392
## poly(age, 5)4 -77.91118 39.9160847 -1.9518743
## poly(age, 5)5 -35.81289 39.9160847 -0.8972045
## Pr(>|t|)
## (Intercept) 0.00000000000000000000000000000000000000
## poly(age, 5)1 0.00000000000000000000000000014911107825
## poly(age, 5)2 0.00000000000000000000000000000002367734
## poly(age, 5)3 0.00167921282630795762683129090930833627
## poly(age, 5)4 0.05104623133271749685357931980433932040
## poly(age, 5)5 0.36968196597439939576901224427274428308
# Squaring the t-statistics from the degree-5 fit reproduces the ANOVA
# F-statistics above: when nested models differ by one degree of freedom, F = t^2.
print("model 2 F-stat:"); (-11.983)^2
## [1] "model 2 F-stat:"
## [1] 143.5923
print("model 3 F-stat:"); (3.144)^2
## [1] "model 3 F-stat:"
## [1] 9.884736
print("model 4 F-stat:"); (-1.951)^2
## [1] "model 4 F-stat:"
## [1] 3.806401
print("model 5 F-stat:"); (-0.897)^2
## [1] "model 5 F-stat:"
## [1] 0.804609
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
fit.4=lm(wage~education+poly(age,4),data=Wage)
fit.5=lm(wage~education+poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3, fit.4, fit.5)
|   | Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
|---|---|---|---|---|---|---|
| 1 | 2994 | 3867992 | NA | NA | NA | NA |
| 2 | 2993 | 3725395 | 1 | 142597.09701 | 114.7076992 | 0.00000000000000000000000002716626 |
| 3 | 2992 | 3719809 | 1 | 5586.66032 | 4.4940112 | 0.03409594091521635228492215219376 |
| 4 | 2991 | 3719777 | 1 | 31.74108 | 0.0255331 | 0.87305665726968351147974090054049 |
| 5 | 2990 | 3716972 | 1 | 2804.78556 | 2.2562205 | 0.13318409169395115099199244923511 |
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial)
preds=predict(fit,newdata=list(age=age.grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands=exp(se.bands.logit)/(1+exp(se.bands.logit))
# ^ Transformation in order to get confidence intervals for the glm()
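As an aside (not part of the original lab), base R's plogis() carries out the same inverse-logit transformation:
all.equal(pfit,plogis(preds$fit)) # TRUE, since plogis(x)=exp(x)/(1+exp(x))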
preds=predict(fit,newdata=list(age=age.grid),type="response",se=T)
# ^ directly computes the probabilities. Unfortunately, the corresponding confidence intervals would not have been sensible, as they would have included negative probabilities!
plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
points(jitter(age),I((wage>250)/5),cex =.5,pch="|",col="darkgrey")
lines(age.grid,pfit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
We have drawn the age values corresponding to the observations with wage values above 250 (i.e., above $250,000) as gray marks at the top of the plot, while those with wage values below 250 are shown as gray marks at the bottom of the plot. The jitter() function was used to jitter the age values so that observations with the same age do not cover each other up.
table(cut(age,4))
##
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value
## (Intercept) 94.158392 1.476069 63.789970
## cut(age, 4)(33.5,49] 24.053491 1.829431 13.148074
## cut(age, 4)(49,64.5] 23.664559 2.067958 11.443444
## cut(age, 4)(64.5,80.1] 7.640592 4.987424 1.531972
## Pr(>|t|)
## (Intercept) 0.00000000000000000000000000000000000000000000
## cut(age, 4)(33.5,49] 0.00000000000000000000000000000000000001982315
## cut(age, 4)(49,64.5] 0.00000000000000000000000000001040749536333972
## cut(age, 4)(64.5,80.1] 0.12563503865595057606974194186477689072489738
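As a side note (a sketch, not part of the original lab), cut() also accepts explicit cutpoints through its breaks argument when you want to choose the bins yourself:
table(cut(age,breaks=c(17,25,40,60,81))) # illustrative cutpoints covering the age range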
Important: bs() fits a cubic spline by default (degree = 3).
# 1st section
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
# 2nd section
dim(bs(age,knots=c(25,40,60)))
## [1] 3000 6
dim(bs(age,df=6))
## [1] 3000 6
attr(bs(age,df=6),"knots")
## 25% 50% 75%
## 33.75 42.00 51.00
# 3rd section
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid,pred2$fit,col="red",lwd=2)
Recall that a cubic spline with three knots has seven degrees of freedom; these are made up of the intercept plus six basis functions.
Additional info: the bs() function has a degree argument, which allows one to fit splines of any degree, not just the default degree of 3.
As with the bs() function, we could instead specify the knots directly using the knots option, as in the sketch below.
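A minimal sketch of both options, reusing the knots from above (the object names fit.q and fit.n are illustrative):
fit.q=lm(wage~bs(age,knots=c(25,40,60),degree=2),data=Wage) # quadratic spline
fit.n=lm(wage~ns(age,knots=c(25,40,60)),data=Wage) # natural cubic spline with explicit knots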
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-unique
## 'x' values seems doubtful
fit2$df
## [1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit2=loess(wage~age,span =.5,data=Wage)
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend ("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
gam1=lm(wage~ns(year,4)+ns(age,5)+education,data=Wage)
Since this is just a large linear regression model using an appropriate choice of basis functions, we can fit it with the lm() function.
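Because gam1 is an ordinary lm object, all of the usual generics apply; a quick check (not part of the original lab):
class(gam1) # "lm"
length(coef(gam1)) # intercept plus the basis-function and dummy coefficients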
library(gam)
## Loading required package: foreach
## Loaded gam 1.20
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
par(mfrow=c(1,3))
plot(gam.m3,se=TRUE,col="blue")
plot.Gam(gam1,se=TRUE,col="red") # VERY Important - plot.Gam, not plot.gam
The plotting of gam1 gives us a reproduction of Figure 7.11 on page 284. Note that depending on your text version, the given code may read "plot.gam", with a lower-case g. Whether an innocent typo or an update to the package since the text was published, the proper working function now has an upper-case G. If you defined the GAM using the gam() function, you can just call plot() and it works perfectly fine (the se=TRUE argument requests confidence-interval bands where appropriate). If you defined the GAM as a regular lm(), you can still plot it as a GAM; you just have to use the plot.Gam() construction. Six of one, half a dozen of the other.
gam.m1=gam(wage~s(age,5)+education,data=Wage)
gam.m2=gam(wage~year+s(age,5)+education,data=Wage)
anova(gam.m1,gam.m2,gam.m3,test="F")
|   | Resid. Df | Resid. Dev | Df | Deviance | F | Pr(>F) |
|---|---|---|---|---|---|---|
| 1 | 2990 | 3711731 | NA | NA | NA | NA |
| 2 | 2989 | 3693842 | 1.000000 | 17889.243 | 14.477130 | 0.0001447167 |
| 3 | 2986 | 3689770 | 2.999989 | 4071.134 | 1.098212 | 0.3485661430 |
summary(gam.m3)
##
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.43 -19.70 -3.33 14.17 213.48
##
## (Dispersion Parameter for gaussian family taken to be 1235.69)
##
## Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(year, 4) 1 27162 27162 21.981 0.000002877 ***
## s(age, 5) 1 195338 195338 158.081 < 0.00000000000000022 ***
## education 4 1069726 267432 216.423 < 0.00000000000000022 ***
## Residuals 2986 3689770 1236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(year, 4) 3 1.086 0.3537
## s(age, 5) 4 32.380 <0.0000000000000002 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
preds=predict(gam.m2,newdata=Wage)
Here we make predictions from our preferred model, gam.m2, which treats year as a linear function. These predictions are not used in any later part of the lab; this is simply an illustration that predict() works for a fitted GAM just as it does for a linear model.
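For instance (an illustrative addition, not in the original lab), the in-sample mean squared error of gam.m2 follows directly from these predictions:
mean((Wage$wage-preds)^2) # training-set MSE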
gam.lo=gam(wage~s(year,df=4)+lo(age,span=0.7)+education,data=Wage)
plot.Gam(gam.lo,se=TRUE,col="green")
gam.lo.i=gam(wage~lo(year,age,span=0.5)+education,data=Wage)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
library(akima)
plot(gam.lo.i)
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage)
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")
table(education,I(wage>250))
##
## education FALSE TRUE
## 1. < HS Grad 268 0
## 2. HS Grad 966 5
## 3. Some College 643 7
## 4. College Grad 663 22
## 5. Advanced Degree 381 45
gam.lr.s=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage,subset=(education!="1. < HS Grad"))
plot(gam.lr.s,se=T,col="green")
rm(list=ls())
library(ISLR)
attach(College)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(leaps)
set.seed(5082)
Part_data <- createDataPartition(Outstate , p=.8, list = FALSE)
College.train <- College[Part_data,]
College.test <- College[-Part_data, ]
varfit<-regsubsets(Outstate~.,data=College.train ,method ="forward", nvmax=17)
var.summary<-summary(varfit)
names(var.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
which.max(var.summary$adjr2) #15
## [1] 15
which.min(var.summary$cp) #13
## [1] 13
which.min(var.summary$bic) #12
## [1] 12
par(mfrow=c(1,3))
plot(var.summary$cp, xlab="Variable Count", ylab = "Cp", type= "l")
plot(var.summary$bic, xlab="Variable Count", ylab = "BIC", type= "l")
plot(var.summary$adjr2, xlab="Variable Count", ylab = "Adj. R2", type= "l")
var.summary$cp
## [1] 820.94633 409.77064 205.25484 116.89272 70.45428 43.96096 40.99799
## [8] 38.93914 38.28257 28.99663 20.14375 13.74255 13.28235 13.29288
## [15] 14.05269 16.02524 18.00000
coefident<-coef(varfit, id=6)
names(coefident)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "PhD" "perc.alumni"
## [6] "Expend" "Grad.Rate"
r2_all=c()
for (i in 1:20) {
gm.1<-gam(Outstate~Private+s(Room.Board, df=i)+s(PhD, df=i)+s(perc.alumni, df=i)+s(Expend,df=i)+s(Grad.Rate, df=i),data=College.train, cv=TRUE)
preds<-predict(gm.1,newdata=College.test)
gamMSE<-mean((College.test$Outstate-preds)^2)
gamTSS<-mean((College.test$Outstate-mean(College.test$Outstate))^2)
testR2<-1-gamMSE/gamTSS
adjustedR2<-1-((1-testR2)*(length(College.train$Outstate)-1))/(length(College.train$Outstate)-6-1) # adj. R2 = 1-(1-R2)(n-1)/(n-p-1)
r2_all<-append(r2_all, adjustedR2)
}
which.max(r2_all) #7
## [1] 7
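An optional visualization (not part of the original lab) of how the test-set adjusted R2 changes with the degrees of freedom:
plot(1:20,r2_all,type="b",xlab="Degrees of freedom",ylab="Adjusted R2")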
gm.1<-gam(Outstate~Private+s(Room.Board, df=7)+s(PhD, df=7)+s(perc.alumni, df=7)+s(Expend,df=7)+s(Grad.Rate, df=7),data=College.train, cv=TRUE)
par(mfrow=c(2,3))
plot(gm.1, se=TRUE, col='blue')
preds<-predict(gm.1,newdata=College.test)
gamMSE <- mean((College.test$Outstate-preds)^2)
gamMSE #3,861,987
## [1] 3861987
gamTSS<-mean((College.test$Outstate-mean(College.test$Outstate))^2)
testR2 <- 1-gamMSE/gamTSS
adjustedR2<- 1-((1-testR2)*(length(College.test$Outstate)-1))/(length(College.test$Outstate)-6-1)
adjustedR2 #0.7866438
## [1] 0.7866438
totalLM<-lm(Outstate~.,data=College.train)
summary(totalLM)$r.squared #Multiple R2 is 0.7675387
## [1] 0.7675387
summary(totalLM)$adj.r.squared #Adjusted R2 is 0.7610067
## [1] 0.7610067
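Note that these R2 values for the linear model are computed on the training data; for a like-for-like comparison with the GAM's test MSE, one could also score totalLM on the held-out test set (a sketch, not in the original lab; lm.preds is an illustrative name):
lm.preds<-predict(totalLM,newdata=College.test)
mean((College.test$Outstate-lm.preds)^2) # test MSE for the full linear model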
summary(gm.1)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 7) + s(PhD,
## df = 7) + s(perc.alumni, df = 7) + s(Expend, df = 7) + s(Grad.Rate,
## df = 7), data = College.train, cv = TRUE)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6997.30 -1027.74 30.46 1224.10 4537.89
##
## (Dispersion Parameter for gaussian family taken to be 3320524)
##
## Null Deviance: 9634290635 on 622 degrees of freedom
## Residual Deviance: 1945830784 on 586.001 degrees of freedom
## AIC: 11160.59
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2768059391 2768059391 833.621 < 0.00000000000000022
## s(Room.Board, df = 7) 1 2011357524 2011357524 605.735 < 0.00000000000000022
## s(PhD, df = 7) 1 588428440 588428440 177.209 < 0.00000000000000022
## s(perc.alumni, df = 7) 1 373112457 373112457 112.365 < 0.00000000000000022
## s(Expend, df = 7) 1 777564551 777564551 234.169 < 0.00000000000000022
## s(Grad.Rate, df = 7) 1 116596134 116596134 35.114 0.000000005308
## Residuals 586 1945830784 3320524
##
## Private ***
## s(Room.Board, df = 7) ***
## s(PhD, df = 7) ***
## s(perc.alumni, df = 7) ***
## s(Expend, df = 7) ***
## s(Grad.Rate, df = 7) ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 7) 6 1.7868 0.09946 .
## s(PhD, df = 7) 6 1.7704 0.10285
## s(perc.alumni, df = 7) 6 1.9573 0.06982 .
## s(Expend, df = 7) 6 14.0520 0.000000000000005662 ***
## s(Grad.Rate, df = 7) 6 2.5261 0.02014 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1