# Remove any stale copy of the Wage data; guard with exists() so a fresh
# session does not emit an "object 'Wage' not found" warning
if (exists("Wage")) rm(Wage)
# Use library() (errors loudly if missing) rather than require(), which
# only warns and returns FALSE
library(ISLR)
# NOTE(review): attach() is generally discouraged, but the rest of this
# script references `age` and `wage` bare, so it is kept for compatibility
attach(Wage)
NOTES FROM VIDEO: We will fit a fourth-degree polynomial to the age/wage
regression. The function poly() builds the basis matrix for us!
# Fit wage on a degree-4 orthogonal polynomial of age; poly() constructs
# the polynomial basis matrix for us
fit <- lm(wage ~ poly(age, 4), data = Wage)
summary(fit)
##
## Call:
## lm(formula = wage ~ poly(age, 4), data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.707 -24.626 -4.993 15.217 203.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.7036 0.7287 153.283 < 2e-16 ***
## poly(age, 4)1 447.0679 39.9148 11.201 < 2e-16 ***
## poly(age, 4)2 -478.3158 39.9148 -11.983 < 2e-16 ***
## poly(age, 4)3 125.5217 39.9148 3.145 0.00168 **
## poly(age, 4)4 -77.9112 39.9148 -1.952 0.05104 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared: 0.08626, Adjusted R-squared: 0.08504
## F-statistic: 70.69 on 4 and 2995 DF, p-value: < 2.2e-16
# Grid of ages spanning the observed range (defaults to step 1)
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
# Predictions plus standard errors; predict.lm returns a list with
# components $fit and $se.fit
preds <- predict(fit, newdata = list(age = age.grid), se = TRUE)
# Approximate 95% confidence band: fit +/- 2 standard errors.
# Use $se.fit explicitly -- the original `preds$se` only worked through
# `$` partial matching, a silent footgun
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
# Scatter the raw data, then overlay the fitted curve and its band
plot(age, wage, col = "darkgrey")
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, col = "blue", lty = 2)
There are other ways we can fit this model in R.
# Wrap each power in I() (the "as-is" function) so the caret is treated
# as arithmetic exponentiation instead of formula syntax:
# I(age^2) is age squared, I(age^3) is age cubed, and so on
fit_v2 <- lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data = Wage)
summary(fit_v2)
##
## Call:
## lm(formula = wage ~ age + I(age^2) + I(age^3) + I(age^4), data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.707 -24.626 -4.993 15.217 203.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.842e+02 6.004e+01 -3.067 0.002180 **
## age 2.125e+01 5.887e+00 3.609 0.000312 ***
## I(age^2) -5.639e-01 2.061e-01 -2.736 0.006261 **
## I(age^3) 6.811e-03 3.066e-03 2.221 0.026398 *
## I(age^4) -3.204e-05 1.641e-05 -1.952 0.051039 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.91 on 2995 degrees of freedom
## Multiple R-squared: 0.08626, Adjusted R-squared: 0.08504
## F-statistic: 70.69 on 4 and 2995 DF, p-value: < 2.2e-16
# Four nested models of increasing complexity in age
fita <- lm(wage ~ education, data = Wage)
fitb <- lm(wage ~ education + age, data = Wage)
fitc <- lm(wage ~ education + poly(age, 2), data = Wage)
fitd <- lm(wage ~ education + poly(age, 3), data = Wage)
# Sequential F-tests comparing each model with the next
anova(fita, fitb, fitc, fitd)
## Analysis of Variance Table
##
## Model 1: wage ~ education
## Model 2: wage ~ education + age
## Model 3: wage ~ education + poly(age, 2)
## Model 4: wage ~ education + poly(age, 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2995 3995721
## 2 2994 3867992 1 127729 102.7378 <2e-16 ***
## 3 2993 3725395 1 142597 114.6969 <2e-16 ***
## 4 2992 3719809 1 5587 4.4936 0.0341 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
NOTES FROM THE VIDEO:
We use the ANOVA table to tell us which terms we need. As the *** codes show,
we clearly need age and age^2; age^3 is only statistically significant at the
5% level (see Signif. codes: *).
Next we separate out the "big wage earners" into a binary variable and fit a
logistic regression (which requires a binary response variable).
# Logistic regression: model P(wage > 250) with a cubic polynomial in
# age. I(wage > 250) constructs the binary response on the fly.
fit <- glm(I(wage > 250) ~ poly(age, 3), data = Wage, family = binomial)
# Inspect the fit
summary(fit)
##
## Call:
## glm(formula = I(wage > 250) ~ poly(age, 3), family = binomial,
## data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2808 -0.2736 -0.2487 -0.1758 3.2868
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.8486 0.1597 -24.100 < 2e-16 ***
## poly(age, 3)1 37.8846 11.4818 3.300 0.000968 ***
## poly(age, 3)2 -29.5129 10.5626 -2.794 0.005205 **
## poly(age, 3)3 9.7966 8.9990 1.089 0.276317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 730.53 on 2999 degrees of freedom
## Residual deviance: 707.92 on 2996 degrees of freedom
## AIC: 715.92
##
## Number of Fisher Scoring iterations: 8
# Predictions on the logit (link) scale, with standard errors.
# Spell out TRUE -- T is an ordinary, reassignable variable.
preds <- predict(fit, list(age = age.grid), se = TRUE)
# Standard-error bands on the logit scale: column 1 is the fit, then
# fit - 2*SE and fit + 2*SE. Use $se.fit explicitly; the original
# `preds$se` only worked via `$` partial matching.
se.bands <- preds$fit + cbind(fit = 0, lower = -2 * preds$se.fit, upper = 2 * preds$se.fit)
se.bands[1:5, ]
## fit lower upper
## 1 -7.664756 -10.759826 -4.569686
## 2 -7.324776 -10.106699 -4.542852
## 3 -7.001732 -9.492821 -4.510643
## 4 -6.695229 -8.917158 -4.473300
## 5 -6.404868 -8.378691 -4.431045
# Map logits to probabilities. plogis() is the numerically stable
# inverse logit, mathematically identical to exp(x) / (1 + exp(x)).
prob.bands <- plogis(se.bands)
matplot(age.grid, prob.bands, col = "blue", lwd = c(2, 1, 1), lty = c(1, 2, 2), type = "l", ylim = c(0, .1))
# jitter() spreads the ages so overplotted observations are visible; the
# binary response is scaled by 1/10 so the marks fit under ylim.
# NOTE(review): the ISLR lab draws these rug marks with pch = "|";
# pch = "l" in the transcript looks like a lowercase-L transcription
# slip, so "|" is restored here -- confirm against the lecture.
points(jitter(age), I(wage > 250) / 10, pch = "|", cex = .5)
UP NEXT: Splines
Splines are more flexible than polynomials (but the idea is similar).
A cubic spline is a piecewise cubic polynomial. Splines help us fit more
flexible functions; the knots break the data into regions, which makes splines
more "local" than polynomials.
Here is a "fixed-knot regression spline":
# splines ships with base R; use library() so a missing package errors
# loudly instead of require()'s silent FALSE
library(splines)
# Regression spline: cubic B-spline basis via bs(), with knots at ages
# 25, 40 and 60
fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage)
plot(age, wage, col = "darkgrey")
lines(age.grid, predict(fit, list(age = age.grid)), col = "darkgreen", lwd = 2)
# Mark the knot locations with dashed vertical lines
abline(v = c(25, 40, 60), lty = 2, col = "darkgreen")
## SMOOTHING SPLINE
A smoothing spline does not require knot selection, but it DOES have a
smoothing parameter, which can be specified via the effective "degrees of
freedom" or df.
# Re-draw the regression-spline plot so the smoothing spline can be
# overlaid on top of it
fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage)
plot(age, wage, col = "darkgrey")
lines(age.grid, predict(fit, list(age = age.grid)), col = "darkgreen", lwd = 2)
abline(v = c(25, 40, 60), lty = 2, col = "darkgreen")
# ----------
# Smoothing spline with the roughness fixed at 16 effective df
fit <- smooth.spline(age, wage, df = 16)
lines(fit, col = "red", lwd = 2)
Alternatively, we can use leave-one-out (LOO) cross-validation, which selects
the smoothing parameter for us automatically.
# Re-draw the base plot with the regression spline and the df = 16
# smoothing spline, then add the CV-selected smoothing spline
fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage)
plot(age, wage, col = "darkgrey")
lines(age.grid, predict(fit, list(age = age.grid)), col = "darkgreen", lwd = 2)
abline(v = c(25, 40, 60), lty = 2, col = "darkgreen")
# ----------
fit <- smooth.spline(age, wage, df = 16)
lines(fit, col = "red", lwd = 2)
# ----------
# Let leave-one-out cross-validation choose the smoothing parameter
fit <- smooth.spline(age, wage, cv = TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-unique
## 'x' values seems doubtful
lines(fit, col = "purple", lwd = 2)
# Print the chosen smoothing parameter and effective df
fit
## Call:
## smooth.spline(x = age, y = wage, cv = TRUE)
##
## Smoothing Parameter spar= 0.6988943 lambda= 0.02792303 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.794596
## Penalized Criterion (RSS): 75215.9
## PRESS(l.o.o. CV): 1593.383
So far, we’ve only looked at single non-linear terms. Thankfully, the gam
package makes working with multiple non-linear terms easier
# Load gam with library() so a missing package errors instead of
# require()'s warn-and-return-FALSE
library(gam)
# gam1 models wage with:
# - a smooth term in age with four degrees of freedom,
# - a smooth term in year, also with four degrees of freedom,
# - education, which is a factor, so gam builds dummy variables for us.
# NOTE: s() is gam's special function denoting a smoothing spline term.
gam1 <- gam(wage ~ s(age, df = 4) + s(year, df = 4) + education, data = Wage)
par(mfrow = c(1, 3))
# plot.Gam draws each fitted component -- an excellent way to visualize
# non-linear functions across several variables
plot(gam1, se = TRUE)
# Logistic GAM on the binary "big earner" response
gam2 <- gam(I(wage > 250) ~ s(age, df = 4) + s(year, df = 4) + education, data = Wage, family = binomial)
# Standard errors left off this plot
plot(gam2)
gam also works for logistic regression and other kinds of generalized linear
models.
Let's see if we need a nonlinear term for year -- we can do some tests in gam.
For this one, year gets no smooth term (just a linear one), and then we use
ANOVA to compare the two models.
# Same logistic GAM as gam2, but with year entering linearly
gam2a <- gam(I(wage > 250) ~ s(age, df = 4) + year + education, data = Wage, family = binomial)
# Chi-squared (likelihood-ratio) test: linear year vs smooth year
anova(gam2a, gam2, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: I(wage > 250) ~ s(age, df = 4) + year + education
## Model 2: I(wage > 250) ~ s(age, df = 4) + s(year, df = 4) + education
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2990 603.78
## 2 2987 602.87 3 0.90498 0.8242
TEST RESULTS: Because the p-value is 0.8242, we don't need the non-linear
term for year.
ns means natural spline.
# plot.Gam can also plot ordinary lm fits built from spline bases;
# ns() supplies the natural-spline basis for each term
par(mfrow = c(1, 3))
lm1 <- lm(wage ~ ns(age, df = 4) + ns(year, df = 4) + education, data = Wage)
plot.Gam(lm1, se = TRUE)
GAM
FIN.