library(ISLR)
summary(Hitters)
## AtBat Hits HmRun Runs
## Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
## 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
## Median :379.5 Median : 96 Median : 8.00 Median : 48.00
## Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
## Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
##
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
## 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
## Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
## Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
## 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
## Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
##
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
## 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
## Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
## Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
## 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
## Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
##
## CWalks League Division PutOuts Assists
## Min. : 0.00 A:175 E:157 Min. : 0.0 Min. : 0.0
## 1st Qu.: 67.25 N:147 W:165 1st Qu.: 109.2 1st Qu.: 7.0
## Median : 170.50 Median : 212.0 Median : 39.5
## Mean : 260.24 Mean : 288.9 Mean :106.9
## 3rd Qu.: 339.25 3rd Qu.: 325.0 3rd Qu.:166.0
## Max. :1566.00 Max. :1378.0 Max. :492.0
##
## Errors Salary NewLeague
## Min. : 0.00 Min. : 67.5 A:176
## 1st Qu.: 3.00 1st Qu.: 190.0 N:146
## Median : 6.00 Median : 425.0
## Mean : 8.04 Mean : 535.9
## 3rd Qu.:11.00 3rd Qu.: 750.0
## Max. :32.00 Max. :2460.0
## NA's :59
We’ll quickly deal with this by 1. Removing everything with “na” na.omit
2. Checking our work sum(is,na(Salary))
Hitters = na.omit(Hitters)
with(Hitters,sum(is.na(Salary)))
## [1] 0
DEF: Looks through all possible regression models of all different subset sizes and looks for the best of each size. PRODUCES: A sequence of models which is the best subset for each particular size
library(leaps)
regfit.full=regsubsets(Salary~.,data=Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
It gives best-subsets up to size 8 – what if we want all variables?
regfit.full=regsubsets(Salary~.,data=Hitters, nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp")
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10], pch=20, col="red")
plot(regfit.full, scale="Cp")
coef(regfit.full,10)
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
## CRBI CWalks DivisionW PutOuts Assists
## 0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
NOTES: * Base subset is agressive, looking at all possible subsets * Forward stepwise is a greedy algorithm. Each time it includes the next best variable But it produces a nested sequence. * Each new model includes all the variables that were before, plus a new one
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19, method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
plot(regfit.fwd,scale="Cp")
train=sample(seq(263),180,replace=FALSE)
In English: We’re going to create a sequence of numbers from 1-263. We are going to take a sample of 180 from that sequencedim(Hitters)
## [1] 263 20
set.seed(1)
train=sample(seq(263),180,replace=FALSE)
train
## [1] 167 129 187 85 79 213 37 105 217 110 229 165 34 106 126 89 172 207
## [19] 33 84 163 70 74 42 166 111 148 156 20 44 121 87 242 233 40 247
## [37] 25 119 198 122 39 179 240 134 24 160 14 130 45 146 22 206 193 115
## [55] 104 231 208 209 103 75 13 253 176 248 23 254 244 205 29 141 150 236
## [73] 108 48 245 215 149 31 102 145 73 232 83 118 90 190 107 64 196 60
## [91] 51 251 138 262 43 26 143 195 152 178 223 219 202 181 222 169 1 239
## [109] 78 211 246 28 116 257 61 113 86 71 225 99 173 234 49 256 174 194
## [127] 50 135 238 235 230 263 53 100 164 65 142 175 140 124 224 77 159 98
## [145] 66 19 17 228 204 186 35 144 46 180 109 210 16 161 9 137 92 162
## [163] 10 259 32 243 95 154 93 12 255 177 15 2 128 67 183 117 197 5
regfit.fwd=regsubsets(Salary~.,data=Hitters[train,],nvmax=19, method="forward")
Now we will work out the validation errors on the remainder of the data!
We will make predictions on the observations not used for training. (We know there are 19 models so we set up some vectors to record the errors)
To get the right elements of x test, we have to index the columns by the names that are on the coefficient vector.
val.errors=rep(NA,19)
x.test = model.matrix(Salary~.,data=Hitters[-train,])
for(i in 1:19){
# use coef function to extract the coefficients
coefi=coef(regfit.fwd,id=i)
# making our own prediction method
# subset of columns of x test that correspond to the variables that are in this current coefficient vector
# Then matrix mulitplied by the coefficient vector
pred=x.test[,names(coefi)]%*%coefi
# And now that we have our predictions, we can calculate the mean squared error
val.errors[i]=mean((Hitters$Salary[-train]-pred)^2)
}
plot(sqrt(val.errors),ylab="Root MSE",ylim=c(300,400), pch=19,type="b")
points(sqrt(regfit.fwd$rss[-1]/180),col="blue", pch=19, type="b")
# pch=19 simply means round, full dot (on the plot)
legend("topright", legend=c("Training", "Validation"), col=c("blue","black"), pch=19)
predict
method for regsubsets
we’ll write one!!regsubset
has a component called call
and part of call
is a formula and we can extract the forumla through this little “incantation” here. Using the formula, we make a model matrix from it. Much like we did before. Then we extract the coefficient and do the matrix multiplication.
Now we have a function for reuse in our next section!
predict.regsubsets=function(object, newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
mat[,names(coefi)]%*%coefi
}
We will do 10-fold cross-validation.
set.seed(11)
# Sample from the numbers 1-10, each is going to be assigned a fold number
# We create a vector, 1 to 10, of length the number of rows of hitters
folds=sample(rep(1:10,length=nrow(Hitters)))
folds
## [1] 10 4 4 4 3 7 10 8 10 3 4 2 2 1 1 5 3 3 5 9 5 2 7 3 8
## [26] 7 4 6 7 7 7 10 2 6 6 1 8 8 10 1 8 3 7 10 1 6 10 3 8 7
## [51] 9 3 6 2 5 6 5 5 2 6 4 7 10 7 1 8 1 2 8 2 1 8 2 6 4
## [76] 3 1 1 5 1 2 10 5 8 8 3 9 8 3 6 1 9 5 9 9 8 8 6 8 10
## [101] 9 8 7 8 10 1 3 1 10 5 2 5 2 4 4 10 3 6 9 7 4 3 9 8 10
## [126] 7 5 9 10 4 7 4 1 9 4 5 6 8 10 6 7 1 10 6 4 6 7 1 9 4
## [151] 2 1 7 2 5 7 10 7 9 6 9 5 5 4 10 4 2 10 9 3 3 2 9 2 6
## [176] 9 4 3 9 9 9 4 6 1 7 8 5 8 10 6 9 8 9 1 5 3 3 1 7 1
## [201] 2 9 1 9 10 3 10 3 4 6 8 2 9 5 7 2 10 4 1 6 10 3 5 5 3
## [226] 5 7 3 6 5 4 1 3 7 1 5 2 9 4 6 8 3 2 4 5 10 8 5 2 2
## [251] 2 6 3 4 7 2 6 4 8 7 1 2 6
table(folds)
## folds
## 1 2 3 4 5 6 7 8 9 10
## 27 27 27 26 26 26 26 26 26 26
cv.errors=matrix(NA,10,19)
for(k in 1:10){
best.fit=regsubsets(Salary~.,data=Hitters[folds!=k,],nvmax=19,method="forward")
for(i in 1:19){
pred=predict(best.fit,Hitters[folds==k,],id=i)
cv.errors[k,i]=mean( (Hitters$Salary[folds==k]-pred)^2)
}
}
# Now we process our output matrix
rmse.cv=sqrt(apply(cv.errors,2,mean))
plot(rmse.cv,pch=19,type="b")
In the video, it seemed to prefer models of 11 or 12 – per our current graph, it seems to prefer models of 9.