library(ISLR)
summary(Hitters)
## AtBat Hits HmRun Runs
## Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
## 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
## Median :379.5 Median : 96 Median : 8.00 Median : 48.00
## Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
## Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
##
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
## 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
## Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
## Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
## 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
## Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
##
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
## 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
## Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
## Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
## 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
## Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
##
## CWalks League Division PutOuts Assists
## Min. : 0.00 A:175 E:157 Min. : 0.0 Min. : 0.0
## 1st Qu.: 67.25 N:147 W:165 1st Qu.: 109.2 1st Qu.: 7.0
## Median : 170.50 Median : 212.0 Median : 39.5
## Mean : 260.24 Mean : 288.9 Mean :106.9
## 3rd Qu.: 339.25 3rd Qu.: 325.0 3rd Qu.:166.0
## Max. :1566.00 Max. :1378.0 Max. :492.0
##
## Errors Salary NewLeague
## Min. : 0.00 Min. : 67.5 A:176
## 1st Qu.: 3.00 1st Qu.: 190.0 N:146
## Median : 6.00 Median : 425.0
## Mean : 8.04 Mean : 535.9
## 3rd Qu.:11.00 3rd Qu.: 750.0
## Max. :32.00 Max. :2460.0
## NA's :59
There are 59 missing values in Salary. We’ll quickly deal with this by:
1. Removing every row with an NA, using na.omit
2. Checking our work with sum(is.na(Salary))
Hitters = na.omit(Hitters)
with(Hitters,sum(is.na(Salary)))
## [1] 0
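As a broader (hedged) check, we can also confirm that no missing values remain anywhere in the data frame, not just in Salary:
# sum(is.na()) over the whole data frame counts every remaining NA
sum(is.na(Hitters))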
DEF: Best subset selection looks through all possible regression models of every subset size and finds the best model of each size. PRODUCES: a sequence of models, one best subset for each size.
library(leaps)
regfit.full=regsubsets(Salary~.,data=Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
By default it gives best subsets only up to size 8 – what if we want models using all 19 variables?
regfit.full=regsubsets(Salary~.,data=Hitters, nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp")
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10], pch=20, col="red")
plot(regfit.full, scale="Cp")
coef(regfit.full,10)
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
## CRBI CWalks DivisionW PutOuts Assists
## 0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
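Cp is only one of the criteria stored in reg.summary (see names(reg.summary) above). As a hedged sketch, we could plot BIC and adjusted R-squared alongside Cp using the same object; the layout here is just illustrative:
par(mfrow=c(1,3))   # three panels side by side
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp", type="b")
plot(reg.summary$bic, xlab="Number of Variables", ylab="BIC", type="b")
plot(reg.summary$adjr2, xlab="Number of Variables", ylab="Adjusted R-squared", type="b")
par(mfrow=c(1,1))
Cp and BIC are minimized, while adjusted R-squared is maximized, so the best size is read off differently for each criterion.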
NOTES: * Best subset is aggressive, looking at all possible subsets. * Forward stepwise is a greedy algorithm: each time it includes the next best variable, so it produces a nested sequence. * Each new model includes all the variables that came before, plus one new one. (A quick check of this nesting property appears after the summary output below.)
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19, method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
plot(regfit.fwd,scale="Cp")
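As a quick (hedged) check of the nesting claim above: in forward stepwise, the variables in the size-k model should be a subset of those in the size-(k+1) model. The which component of the summary is a logical matrix with one row per model size:
fwd.which=summary(regfit.fwd)$which
# TRUE only if every variable in model k also appears in model k+1
all(sapply(1:18, function(k) all(fwd.which[k,] <= fwd.which[k+1,])))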
train=sample(seq(263),180,replace=FALSE)
In English: we create a sequence of the numbers 1 to 263 (the number of rows in Hitters) and take a sample of 180 of them, without replacement, to use as a training set.
dim(Hitters)
## [1] 263 20
set.seed(1)
train=sample(seq(263),180,replace=FALSE)
train
## [1] 167 129 187 85 79 213 37 105 217 110 229 165 34 106 126 89 172 207
## [19] 33 84 163 70 74 42 166 111 148 156 20 44 121 87 242 233 40 247
## [37] 25 119 198 122 39 179 240 134 24 160 14 130 45 146 22 206 193 115
## [55] 104 231 208 209 103 75 13 253 176 248 23 254 244 205 29 141 150 236
## [73] 108 48 245 215 149 31 102 145 73 232 83 118 90 190 107 64 196 60
## [91] 51 251 138 262 43 26 143 195 152 178 223 219 202 181 222 169 1 239
## [109] 78 211 246 28 116 257 61 113 86 71 225 99 173 234 49 256 174 194
## [127] 50 135 238 235 230 263 53 100 164 65 142 175 140 124 224 77 159 98
## [145] 66 19 17 228 204 186 35 144 46 180 109 210 16 161 9 137 92 162
## [163] 10 259 32 243 95 154 93 12 255 177 15 2 128 67 183 117 197 5
regfit.fwd=regsubsets(Salary~.,data=Hitters[train,],nvmax=19, method="forward")
Now we will work out the validation errors on the remainder of the data.
We will make predictions on the observations not used for training. (We know there are 19 models, so we set up a vector to record the 19 errors.)
To get the right elements of x.test, we have to index its columns by the names on the coefficient vector.
val.errors=rep(NA,19)
x.test = model.matrix(Salary~.,data=Hitters[-train,])
for(i in 1:19){
# use coef function to extract the coefficients
coefi=coef(regfit.fwd,id=i)
# there is no built-in predict method for regsubsets, so we roll our own:
# take the subset of columns of x.test that correspond to the variables
# in the current coefficient vector, then matrix multiply by that vector
pred=x.test[,names(coefi)]%*%coefi
# And now that we have our predictions, we can calculate the mean squared error
val.errors[i]=mean((Hitters$Salary[-train]-pred)^2)
}
plot(sqrt(val.errors),ylab="Root MSE",ylim=c(300,400), pch=19,type="b")
points(sqrt(regfit.fwd$rss[-1]/180),col="blue", pch=19, type="b")
# pch=19 simply means round, full dot (on the plot)
legend("topright", legend=c("Training", "Validation"), col=c("blue","black"), pch=19)
There is no predict method for regsubsets, so we’ll write one! A regsubsets object has a component called call, and part of call is the formula; we can extract the formula through this little “incantation” here. Using the formula, we make a model matrix from the new data, much like we did before. Then we extract the coefficients and do the matrix multiplication.
Now we have a function we can reuse in our next section!
predict.regsubsets=function(object, newdata,id,...){
  # extract the formula from the call stored in the regsubsets object
  form=as.formula(object$call[[2]])
  # build the model matrix for the new data from that formula
  mat=model.matrix(form,newdata)
  # coefficients of the best model of size id
  coefi=coef(object,id=id)
  # keep only the columns used by this model and matrix multiply
  mat[,names(coefi)]%*%coefi
}
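As a quick (hedged) sanity check that the new method works, the prediction for, say, the size-10 forward-stepwise model on the validation rows should reproduce the corresponding hand-rolled error from the loop above:
pred10=predict(regfit.fwd, Hitters[-train,], id=10)
mean((Hitters$Salary[-train]-pred10)^2)   # should match val.errors[10]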
We will do 10-fold cross-validation.
set.seed(11)
# Assign each observation a fold number from 1 to 10:
# rep(1:10, length=nrow(Hitters)) repeats 1-10 until it is as long as Hitters,
# then sample() shuffles those fold assignments
folds=sample(rep(1:10,length=nrow(Hitters)))
folds
## [1] 10 4 4 4 3 7 10 8 10 3 4 2 2 1 1 5 3 3 5 9 5 2 7 3 8
## [26] 7 4 6 7 7 7 10 2 6 6 1 8 8 10 1 8 3 7 10 1 6 10 3 8 7
## [51] 9 3 6 2 5 6 5 5 2 6 4 7 10 7 1 8 1 2 8 2 1 8 2 6 4
## [76] 3 1 1 5 1 2 10 5 8 8 3 9 8 3 6 1 9 5 9 9 8 8 6 8 10
## [101] 9 8 7 8 10 1 3 1 10 5 2 5 2 4 4 10 3 6 9 7 4 3 9 8 10
## [126] 7 5 9 10 4 7 4 1 9 4 5 6 8 10 6 7 1 10 6 4 6 7 1 9 4
## [151] 2 1 7 2 5 7 10 7 9 6 9 5 5 4 10 4 2 10 9 3 3 2 9 2 6
## [176] 9 4 3 9 9 9 4 6 1 7 8 5 8 10 6 9 8 9 1 5 3 3 1 7 1
## [201] 2 9 1 9 10 3 10 3 4 6 8 2 9 5 7 2 10 4 1 6 10 3 5 5 3
## [226] 5 7 3 6 5 4 1 3 7 1 5 2 9 4 6 8 3 2 4 5 10 8 5 2 2
## [251] 2 6 3 4 7 2 6 4 8 7 1 2 6
table(folds)
## folds
## 1 2 3 4 5 6 7 8 9 10
## 27 27 27 26 26 26 26 26 26 26
cv.errors=matrix(NA,10,19)
for(k in 1:10){
best.fit=regsubsets(Salary~.,data=Hitters[folds!=k,],nvmax=19,method="forward")
for(i in 1:19){
pred=predict(best.fit,Hitters[folds==k,],id=i)
cv.errors[k,i]=mean( (Hitters$Salary[folds==k]-pred)^2)
}
}
# Now we process our output matrix
rmse.cv=sqrt(apply(cv.errors,2,mean))
plot(rmse.cv,pch=19,type="b")
In the video it seemed to prefer models of size 11 or 12 – per our current graph, it seems to prefer a model of size 9.
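We can read the preferred size straight off the CV curve (a small sketch using rmse.cv computed above):
which.min(rmse.cv)            # model size with the lowest CV RMSE
rmse.cv[which.min(rmse.cv)]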
Next we use the package glmnet, which does not use the model formula language, so we have to set up an x (the predictor matrix) and a y (the response) ourselves.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
# Predictor matrix
x=model.matrix(Salary~.-1,data=Hitters)
# Response
y=Hitters$Salary
First, we will fit a ridge-regression model. We do this by calling glmnet with alpha=0 (see the help file, ?glmnet). There is also a cv.glmnet function which will do the cross-validation for us.
RIDGE REGRESSION puts a penalty on the sum of squares of the coefficients, controlled by the tuning parameter lambda. The criterion for ridge regression is the residual sum of squares (the usual criterion for linear regression) plus lambda times the sum of the squared coefficients: RSS + lambda * sum_j(beta_j^2).
RIDGE REGRESSION ELI5
From here
Let’s say you’re trying to predict future satisfaction using a number of inputs (age, income, gender etc.). Ridge regression penalizes large weights assigned to each input so that one input does not dominate all others in the prediction.
For example, a linear regression may find that income is the most important variable by far in determining future satisfaction (i.e. is assigned the largest weight in the linear regression). But maybe you have a small sample, or you think your sample is not representative, so you don’t want income to be essentially the only input used to predict future satisfaction. The other inputs may have an effect, but their weights in a linear regression are so small relative to that of income that income is essentially the only input that matters in your prediction. You want to take into account all your variables in a more “even” way. Ridge regression will force all coefficients to be in a similar range so that you get a wider range of predictions that are not dominated by income only.
fit.ridge=glmnet(x,y,alpha=0)
plot(fit.ridge, xvar="lambda", label=TRUE)
NOTE Unlike subset and forward stepwise regression, which controls the complexity of a model by restricting the number of variables, ridge regression keeps all the variables in and shrinks the coefficients towards zero.
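A hedged illustration of the NOTE above: comparing the first and last columns of the coefficient path from fit.ridge shows that all the variables stay in the model, but they are shrunk almost to zero at the largest lambda and are much larger at the smallest lambda (only the first six rows are shown here to keep the output short):
beta.path=coef(fit.ridge)              # rows = coefficients, columns = lambda values on the path
beta.path[1:6, c(1, ncol(beta.path))]  # largest lambda (heavy shrinkage) vs. smallest lambda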
cv.ridge=cv.glmnet(x,y,alpha=0)
plot(cv.ridge)
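cv.glmnet stores the lambda with the smallest cross-validated error and the largest lambda within one standard error of it; a small sketch for reading off the chosen values and the corresponding ridge coefficients:
cv.ridge$lambda.min              # lambda with the smallest CV error
cv.ridge$lambda.1se              # largest lambda within 1 SE of the minimum
coef(cv.ridge, s="lambda.min")   # ridge coefficients at that lambda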
The lasso is similar to ridge; the only difference is the penalty. For the lasso we minimize the RSS plus lambda times a penalty, and there’s a very subtle change: instead of the sum of squares of the coefficients, we penalize the sum of their absolute values, RSS + lambda * sum_j(|beta_j|).
NOTE Lasso is doing both shrinkage and variable selection
# Don't need alpha here because alpha defaults to alpha=1
fit.lasso=glmnet(x,y)
plot(fit.lasso,xvar="lambda", label=TRUE)
We can also plot against “dev” instead of “lambda” – this is the percentage of deviance explained, or the percentage of the sum of squares explained (which is the R-squared).
This plot changes the orientation.
plot(fit.lasso,xvar="dev", label=TRUE)
The crowded right-hand end of the plot is an indication that there may be overfitting at the end of the path.
TL;DR: There are different ways of plotting the coefficients and they give you different information about the coefficients and about the nature of the path.
cv.lasso=cv.glmnet(x,y)
plot(cv.lasso)
coef(cv.lasso)
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 144.37970485
## AtBat .
## Hits 1.36380384
## HmRun .
## Runs .
## RBI .
## Walks 1.49731098
## Years .
## CAtBat .
## CHits .
## CHmRun .
## CRuns 0.15275165
## CRBI 0.32833941
## CWalks .
## LeagueA .
## LeagueN .
## DivisionW .
## PutOuts 0.06625755
## Assists .
## Errors .
## NewLeagueN .
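Note that coef on a cv.glmnet object uses s="lambda.1se" by default, which is why the fit above is so sparse. As a hedged sketch, the (typically less sparse) coefficients at the error-minimizing lambda can be pulled out explicitly:
coef(cv.lasso, s="lambda.min")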
Suppose we want to use our earlier train/validation split to select the lambda for the lasso?
lasso.tr=glmnet(x[train,], y[train])
lasso.tr
##
## Call: glmnet(x = x[train, ], y = y[train])
##
## Df %Dev Lambda
## 1 0 0.00 262.100
## 2 1 5.92 238.800
## 3 1 10.83 217.600
## 4 1 14.91 198.300
## 5 2 19.72 180.600
## 6 3 23.94 164.600
## 7 3 27.45 150.000
## 8 3 30.37 136.700
## 9 3 32.79 124.500
## 10 3 34.80 113.500
## 11 4 36.50 103.400
## 12 5 38.77 94.190
## 13 6 40.90 85.820
## 14 6 42.73 78.200
## 15 6 44.25 71.250
## 16 6 45.51 64.920
## 17 6 46.55 59.150
## 18 6 47.42 53.900
## 19 6 48.14 49.110
## 20 6 48.74 44.750
## 21 6 49.24 40.770
## 22 6 49.65 37.150
## 23 6 49.99 33.850
## 24 7 50.28 30.840
## 25 7 50.51 28.100
## 26 8 50.71 25.610
## 27 8 50.94 23.330
## 28 8 51.12 21.260
## 29 8 51.28 19.370
## 30 8 51.41 17.650
## 31 8 51.52 16.080
## 32 8 51.60 14.650
## 33 8 51.68 13.350
## 34 9 51.75 12.170
## 35 9 51.99 11.080
## 36 10 52.23 10.100
## 37 10 52.44 9.202
## 38 11 52.64 8.385
## 39 11 52.82 7.640
## 40 11 52.97 6.961
## 41 11 53.09 6.343
## 42 11 53.19 5.779
## 43 12 53.28 5.266
## 44 14 53.53 4.798
## 45 14 53.83 4.372
## 46 15 54.06 3.984
## 47 16 54.45 3.630
## 48 16 54.79 3.307
## 49 16 55.06 3.013
## 50 16 55.29 2.746
## 51 17 55.48 2.502
## 52 17 55.65 2.280
## 53 17 55.78 2.077
## 54 17 55.89 1.892
## 55 18 56.00 1.724
## 56 18 56.16 1.571
## 57 18 56.30 1.432
## 58 19 56.42 1.304
## 59 19 56.53 1.189
## 60 19 56.63 1.083
## 61 19 56.71 0.987
## 62 19 56.77 0.899
## 63 19 56.83 0.819
## 64 19 56.88 0.746
## 65 19 56.92 0.680
## 66 19 56.95 0.620
## 67 19 56.98 0.565
## 68 19 57.00 0.514
## 69 19 57.02 0.469
## 70 19 57.04 0.427
## 71 19 57.05 0.389
## 72 19 57.06 0.355
## 73 19 57.07 0.323
## 74 19 57.08 0.294
## 75 19 57.08 0.268
## 76 19 57.09 0.244
## 77 19 57.09 0.223
## 78 19 57.10 0.203
## 79 19 57.10 0.185
## 80 19 57.11 0.168
## 81 19 57.11 0.154
## 82 19 57.11 0.140
## 83 19 57.11 0.127
pred=predict(lasso.tr,x[-train,])
# There are 83 observations in the validation set
# and 83 values of lambda on the path, so there are 83 columns
# in this prediction matrix
dim(pred)
## [1] 83 83
# y[-train] is recycled across the 83 columns of pred to give the errors;
# we square them, use apply to compute the mean of each column,
# and then take the square root to get the RMSE for each lambda
rmse=sqrt(apply((y[-train]-pred)^2,2,mean))
plot(log(lasso.tr$lambda),rmse,type="b",xlab="Log(lambda)")
lam.best=lasso.tr$lambda[order(rmse)[1]]
lam.best
## [1] 1.571184
coef(lasso.tr,s=lam.best)
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 284.22742427
## AtBat -1.14830040
## Hits 4.92901730
## HmRun .
## Runs -3.51936866
## RBI 0.38309009
## Walks 6.01828596
## Years -20.74174043
## CAtBat -0.01903175
## CHits 0.08077424
## CHmRun 0.53493799
## CRuns 0.77272746
## CRBI 0.49203970
## CWalks -0.47458673
## LeagueA -91.21313136
## LeagueN .
## DivisionW -161.10222986
## PutOuts 0.28639231
## Assists 0.29245560
## Errors -4.69237401
## NewLeagueN -56.95409378
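For comparison (a small sketch reusing objects from above), we can line up the lambda chosen by the train/validation split with the two lambdas suggested by the earlier 10-fold cross-validation in cv.lasso:
lam.best               # chosen by the train/validation split
cv.lasso$lambda.min    # 10-fold CV: smallest CV error
cv.lasso$lambda.1se    # 10-fold CV: within 1 SE of the minimum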
In the last two sessions we’ve looked at:
Model selection:
* Best subset
* Forward stepwise
* Ridge regression
* Lasso
Validation using:
* Cp
* k-fold cross-validation
NEXT – * Principal Component Regression * Partial Least Squares
RSS (Residual Sum of Squares): the residual sum of squares, RSS = sum_i (y_i - yhat_i)^2, measures the variation in the response that is not explained by the regression model. See p. 62 in ISL.
Ridge regression penalizes potential over-representation – for example, if the sample is small or perhaps not representative of the population as a whole, ridge regression will shrink the large weights associated with that over-representation.