# Setup: clear any stale copy of Wage and load the ISLR data set.
# The warning below just means no prior copy existed -- harmless.
rm(Wage)
## Warning in rm(Wage): object 'Wage' not found
# NOTE(review): require() returns FALSE instead of erroring when a package
# is missing; library(ISLR) is the safer call. Left as-is because the
# "Loading required package" transcript line below was produced by require().
require(ISLR)
## Loading required package: ISLR
# NOTE(review): attach() is generally discouraged (it masks names on the
# search path), but the rest of this transcript uses the bare column names
# `age` and `wage`, so it must stay.
attach(Wage)

Polynomials

  1. FIRST focus on a single predictor, AGE

NOTES FROM VIDEO: We will fit a fourth-degree polynomial to the Age/Wage regression. The function poly() does the matrix-making for us!

# Fit a degree-4 orthogonal polynomial in age to wage.
# poly() builds the orthogonal-polynomial basis matrix for us, so the
# coefficients for each degree can be tested separately.
fit=lm(wage~poly(age,4),data=Wage)
# Linear and quadratic terms are highly significant; the cubic is
# significant at the 1% level and the quartic is borderline (p ~ 0.051).
summary(fit)
## 
## Call:
## lm(formula = wage ~ poly(age, 4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.707 -24.626  -4.993  15.217 203.693 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    111.7036     0.7287 153.283  < 2e-16 ***
## poly(age, 4)1  447.0679    39.9148  11.201  < 2e-16 ***
## poly(age, 4)2 -478.3158    39.9148 -11.983  < 2e-16 ***
## poly(age, 4)3  125.5217    39.9148   3.145  0.00168 ** 
## poly(age, 4)4  -77.9112    39.9148  -1.952  0.05104 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared:  0.08626,    Adjusted R-squared:  0.08504 
## F-statistic: 70.69 on 4 and 2995 DF,  p-value: < 2.2e-16
# Get limits from the observed age range.
agelims=range(age)
# Create an integer sequence of ages spanning that range (default step = 1).
age.grid=seq(from=agelims[1],to=agelims[2])
# Predict on the grid; se=TRUE returns pointwise standard errors as well.
# predict.lm() names the standard errors `se.fit` -- the original code's
# preds$se only worked via $ partial matching, which is a known footgun,
# so we spell the component name out in full below.
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
# Bind fit + 2*SE and fit - 2*SE into a two-column matrix: these are
# approximate 95% pointwise standard-error bands.
se.bands=cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)
# First plot the raw data...
plot(age, wage, col="darkgrey")
# ...then overlay the fitted curve and the dashed error bands.
lines(age.grid, preds$fit, lwd=2, col="blue")
matlines(age.grid,se.bands,col="blue",lty=2)

There are other ways we can do this in R

# Wrap ^2 in the "identity function" I() so the caret won't be
# interpreted as part of the formula -- I(age^2) == age squared,
# I(age^3) == age cubed, etc. This fits the same model as poly() above
# but in the raw (non-orthogonal) power basis, so the individual
# coefficients differ even though the fitted values are identical.
fit_v2=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
summary(fit_v2)
## 
## Call:
## lm(formula = wage ~ age + I(age^2) + I(age^3) + I(age^4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.707 -24.626  -4.993  15.217 203.693 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.842e+02  6.004e+01  -3.067 0.002180 ** 
## age          2.125e+01  5.887e+00   3.609 0.000312 ***
## I(age^2)    -5.639e-01  2.061e-01  -2.736 0.006261 ** 
## I(age^3)     6.811e-03  3.066e-03   2.221 0.026398 *  
## I(age^4)    -3.204e-05  1.641e-05  -1.952 0.051039 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared:  0.08626,    Adjusted R-squared:  0.08504 
## F-statistic: 70.69 on 4 and 2995 DF,  p-value: < 2.2e-16

ON TO ANOVA

# Fit a nested sequence of models so anova() can compare them in order:
# education alone, then add linear age, quadratic age, and cubic age.
fita=lm(wage~education, data=Wage)
fitb=lm(wage~education+age,data=Wage)
fitc=lm(wage~education+poly(age,2),data=Wage)
fitd=lm(wage~education+poly(age,3),data=Wage)
# Each row of the table tests whether the newly added term significantly
# improves on the model in the row above.
anova(fita, fitb, fitc, fitd)
## Analysis of Variance Table
## 
## Model 1: wage ~ education
## Model 2: wage ~ education + age
## Model 3: wage ~ education + poly(age, 2)
## Model 4: wage ~ education + poly(age, 3)
##   Res.Df     RSS Df Sum of Sq        F Pr(>F)    
## 1   2995 3995721                                 
## 2   2994 3867992  1    127729 102.7378 <2e-16 ***
## 3   2993 3725395  1    142597 114.6969 <2e-16 ***
## 4   2992 3719809  1      5587   4.4936 0.0341 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

NOTES FROM THE VIDEO:

We use the ANOVA to tell us which variables we need. As we can see by the *** we clearly need Age, and Age ^2. Age ^3 is only statistically significant at the 5% level (see Signif.codes: *)

POLYNOMIAL LOGISTIC REGRESSION

We are going to separate out the “big wage earners” into a binary variable And then fit this to a logistic regression (which only works with binary response variables)

# Run the model: I(wage>250) builds the binary "big earner" response
# on the fly, and family=binomial makes this a logistic regression.
fit=glm(I(wage>250) ~ poly(age,3), data=Wage, family=binomial)
# Get the summary. Coefficients are on the logit (log-odds) scale;
# only the linear and quadratic terms are significant here.
summary(fit)
## 
## Call:
## glm(formula = I(wage > 250) ~ poly(age, 3), family = binomial, 
##     data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2808  -0.2736  -0.2487  -0.1758   3.2868  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.8486     0.1597 -24.100  < 2e-16 ***
## poly(age, 3)1  37.8846    11.4818   3.300 0.000968 ***
## poly(age, 3)2 -29.5129    10.5626  -2.794 0.005205 ** 
## poly(age, 3)3   9.7966     8.9990   1.089 0.276317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 730.53  on 2999  degrees of freedom
## Residual deviance: 707.92  on 2996  degrees of freedom
## AIC: 715.92
## 
## Number of Fisher Scoring iterations: 8
# Get the predictions on the age grid (these are on the logit scale).
# Use TRUE, not the reassignable shorthand T.
preds=predict(fit,list(age=age.grid),se=TRUE)
# Create standard-error bands as columns (fit, fit-2*SE, fit+2*SE).
# predict() names the standard errors `se.fit`; the original preds$se
# relied on $ partial matching, which is fragile -- spell it out.
se.bands=preds$fit + cbind(fit=0,lower=-2*preds$se.fit,upper=2*preds$se.fit)
se.bands[1:5,]
##         fit      lower     upper
## 1 -7.664756 -10.759826 -4.569686
## 2 -7.324776 -10.106699 -4.542852
## 3 -7.001732  -9.492821 -4.510643
## 4 -6.695229  -8.917158 -4.473300
## 5 -6.404868  -8.378691 -4.431045
# Map the logit-scale bands onto the probability scale via the inverse
# logit, exp(x)/(1+exp(x)).
prob.bands=exp(se.bands)/(1+exp(se.bands))
matplot(age.grid,prob.bands,col="blue",lwd=c(2,1,1),lty=c(1,2,2),type="l",ylim=c(0,.1))
# We use the jitter function to show just how much data happened at each
# age. pch="|" draws rug-style tick marks (the original pch="l" was a
# transcription typo for "|"); /10 scales the 0/1 indicator into view.
points(jitter(age), I(wage>250)/10,pch="|",cex=.5)

PURPOSE:

  • Confidence band is first computed on the LOGIT scale (where we are comparing the binary high/low earners)
  • And then APPLIED to our probability scale

UPNEXT: Splines

SPLINES

Splines are more flexible than polynomials (but the idea is similar)

Exploring cubic splines

A spline is a piecewise cubic polynomial. Splines help us fit more flexible functions. The knots break the data down into regions. Splines are more “local” than polynomials.

Here is a “fixed knot regression spline”

# splines ships with R (a recommended package), so this always loads.
require(splines)
## Loading required package: splines
# Cubic regression spline (B-spline basis) with fixed knots at ages
# 25, 40 and 60.
fit=lm(wage~bs(age, knots=c(25,40,60)), data=Wage)
plot(age,wage,col="darkgrey")
# Overlay the fitted spline on the grid built earlier.
lines(age.grid,predict(fit,list(age=age.grid)),col="darkgreen",lwd=2)
# Mark the knot locations with dashed vertical lines.
abline(v=c(25,40,60),lty=2,col="darkgreen")

## SMOOTHING SPLINE

A smoothing spline does not require knot selection. It DOES have a smoothing parameter, which can be specified via the “degrees of freedom” or df.

# Re-plot the fixed-knot regression spline from above as a backdrop...
fit=lm(wage~bs(age, knots=c(25,40,60)), data=Wage)
plot(age,wage,col="darkgrey")
lines(age.grid,predict(fit,list(age=age.grid)),col="darkgreen",lwd=2)
abline(v=c(25,40,60),lty=2,col="darkgreen")
# ----------
# ...then overlay a smoothing spline whose smoothing parameter is set
# via 16 effective degrees of freedom (no knot selection needed).
fit=smooth.spline(age,wage,df=16)
lines(fit,col="red",lwd=2)

Alternately, we can use LOO (leave-one-out) cross-validation, which selects our smoothing parameter for us automatically.

# Backdrop: the fixed-knot regression spline again...
fit=lm(wage~bs(age, knots=c(25,40,60)), data=Wage)
plot(age,wage,col="darkgrey")
lines(age.grid,predict(fit,list(age=age.grid)),col="darkgreen",lwd=2)
abline(v=c(25,40,60),lty=2,col="darkgreen")
# ----------
# ...the df=16 smoothing spline...
fit=smooth.spline(age,wage,df=16)
lines(fit,col="red",lwd=2)
# ----------
# ...and a smoothing spline whose parameter is chosen automatically by
# leave-one-out cross-validation. The warning fires because many ages
# are tied (repeated x values), which makes LOO-CV less reliable.
fit=smooth.spline(age,wage,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-unique
## 'x' values seems doubtful
lines(fit,col="purple",lwd=2)

# Print the fitted object: CV picked roughly 6.8 effective df, far
# fewer than the 16 we chose by hand above.
fit
## Call:
## smooth.spline(x = age, y = wage, cv = TRUE)
## 
## Smoothing Parameter  spar= 0.6988943  lambda= 0.02792303 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.794596
## Penalized Criterion (RSS): 75215.9
## PRESS(l.o.o. CV): 1593.383

What about MULTIPLE non-linear terms?

So far, we’ve only looked at single non-linear terms. Thankfully, the gam package makes working with multiple non-linear terms easier

# NOTE(review): require() returns FALSE instead of erroring when gam is
# missing; library(gam) is safer, but the transcript output below came
# from require().
require(gam)
## Loading required package: gam
## Loading required package: foreach
## Loaded gam 1.20
# We tell "gam" that the response is wage.
# We want a smooth term in age, with four degrees of freedom,
# and a smooth term in year, also with four degrees of freedom.
# As education is a factor variable, gam makes dummy variables for us.
# NOTE: s() is a special function in `gam` that denotes smoothing.
gam1=gam(wage~s(age,df=4)+s(year,df=4)+education,data=Wage)
par(mfrow=c(1,3))
# Excellent way to fit and plot non-linear functions across several
# variables. se=TRUE draws standard-error bands (use TRUE, never the
# reassignable shorthand T).
plot(gam1,se=TRUE)

# Logistic GAM: binary "big earner" response, same smooth terms.
gam2=gam(I(wage>250)~s(age,df=4)+s(year,df=4)+education,data=Wage,family=binomial)
# Left off standard errors on this one.
plot(gam2)

gam also works for logistic regression and other kinds of Generalized Linear Models

Let’s see if we need a nonlinear term for year — we will do some tests in gam.

This one, we won’t have a smooth term for year (just a linear one) and then we use ANOVA to test both models

# gam2a keeps year but only as a LINEAR term (no s() smooth); the
# chi-squared ANOVA then tests whether gam2's smooth term in year adds
# anything beyond the linear one.
gam2a=gam(I(wage>250)~s(age,df=4)+year+education,data=Wage,family=binomial)
anova(gam2a,gam2,test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: I(wage > 250) ~ s(age, df = 4) + year + education
## Model 2: I(wage > 250) ~ s(age, df = 4) + s(year, df = 4) + education
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      2990     603.78                     
## 2      2987     602.87  3  0.90498   0.8242

TEST RESULTS: Because the p-value is 0.8242 it means we don’t need this non-linear term for year

ns means natural-spline

# Three panels, one per term.
par(mfrow=c(1,3))
# ns() means natural spline. This is an ordinary lm() fit, but
# plot.Gam() from the gam package can still plot each term separately.
lm1=lm(wage~ns(age,df=4)+ns(year,df=4)+education,data=Wage)
# se=TRUE adds standard-error bands (use TRUE, not the shorthand T).
plot.Gam(lm1,se=TRUE)

AND THIS CONCLUDES OUR SECTION ON GAM

GENERALIZED ADDITIVE MODELS

FIN.