# MODEL SELECTION

library(ISLR)
summary(Hitters)
##      AtBat            Hits         HmRun            Runs       
##  Min.   : 16.0   Min.   :  1   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:255.2   1st Qu.: 64   1st Qu.: 4.00   1st Qu.: 30.25  
##  Median :379.5   Median : 96   Median : 8.00   Median : 48.00  
##  Mean   :380.9   Mean   :101   Mean   :10.77   Mean   : 50.91  
##  3rd Qu.:512.0   3rd Qu.:137   3rd Qu.:16.00   3rd Qu.: 69.00  
##  Max.   :687.0   Max.   :238   Max.   :40.00   Max.   :130.00  
##                                                                
##       RBI             Walks            Years            CAtBat       
##  Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
##  1st Qu.: 28.00   1st Qu.: 22.00   1st Qu.: 4.000   1st Qu.:  816.8  
##  Median : 44.00   Median : 35.00   Median : 6.000   Median : 1928.0  
##  Mean   : 48.03   Mean   : 38.74   Mean   : 7.444   Mean   : 2648.7  
##  3rd Qu.: 64.75   3rd Qu.: 53.00   3rd Qu.:11.000   3rd Qu.: 3924.2  
##  Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
##                                                                      
##      CHits            CHmRun           CRuns             CRBI        
##  Min.   :   4.0   Min.   :  0.00   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.: 209.0   1st Qu.: 14.00   1st Qu.: 100.2   1st Qu.:  88.75  
##  Median : 508.0   Median : 37.50   Median : 247.0   Median : 220.50  
##  Mean   : 717.6   Mean   : 69.49   Mean   : 358.8   Mean   : 330.12  
##  3rd Qu.:1059.2   3rd Qu.: 90.00   3rd Qu.: 526.2   3rd Qu.: 426.25  
##  Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.00  
##                                                                      
##      CWalks        League  Division    PutOuts          Assists     
##  Min.   :   0.00   A:175   E:157    Min.   :   0.0   Min.   :  0.0  
##  1st Qu.:  67.25   N:147   W:165    1st Qu.: 109.2   1st Qu.:  7.0  
##  Median : 170.50                    Median : 212.0   Median : 39.5  
##  Mean   : 260.24                    Mean   : 288.9   Mean   :106.9  
##  3rd Qu.: 339.25                    3rd Qu.: 325.0   3rd Qu.:166.0  
##  Max.   :1566.00                    Max.   :1378.0   Max.   :492.0  
##                                                                     
##      Errors          Salary       NewLeague
##  Min.   : 0.00   Min.   :  67.5   A:176    
##  1st Qu.: 3.00   1st Qu.: 190.0   N:146    
##  Median : 6.00   Median : 425.0            
##  Mean   : 8.04   Mean   : 535.9            
##  3rd Qu.:11.00   3rd Qu.: 750.0            
##  Max.   :32.00   Max.   :2460.0            
##                  NA's   :59

Oh no! Missing values!!

We’ll quickly deal with this by: 1. removing every row with a missing value using na.omit, and 2. checking our work with sum(is.na(Salary)).

Hitters = na.omit(Hitters)
with(Hitters,sum(is.na(Salary)))
## [1] 0

Best Subset Regression

DEF: Best subset regression looks through all possible regression models of every subset size and finds the best model of each size. PRODUCES: A sequence of models, one for each size, each of which is the best subset of that size.

library(leaps)
regfit.full=regsubsets(Salary~.,data=Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
##          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "

By default it only gives the best subsets up to size 8 – what if we want to go all the way up to all 19 variables?

regfit.full=regsubsets(Salary~.,data=Hitters, nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp")
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10], pch=20, col="red")

plot(regfit.full, scale="Cp")

coef(regfit.full,10)
##  (Intercept)        AtBat         Hits        Walks       CAtBat        CRuns 
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798    1.4082490 
##         CRBI       CWalks    DivisionW      PutOuts      Assists 
##    0.7743122   -0.8308264 -112.3800575    0.2973726    0.2831680
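
reg.summary also contains BIC and adjusted R-squared, so we can compare the criteria side by side. A minimal sketch (not in the original notes; note that which.max is used for adjusted R-squared since bigger is better):

par(mfrow=c(1,3))
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp", type="b")
points(which.min(reg.summary$cp), min(reg.summary$cp), pch=20, col="red")
plot(reg.summary$bic, xlab="Number of Variables", ylab="BIC", type="b")
points(which.min(reg.summary$bic), min(reg.summary$bic), pch=20, col="red")
plot(reg.summary$adjr2, xlab="Number of Variables", ylab="Adjusted R-squared", type="b")
points(which.max(reg.summary$adjr2), max(reg.summary$adjr2), pch=20, col="red")
par(mfrow=c(1,1))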

Forward Stepwise Selection

NOTES: * Best subset is aggressive, looking at all possible subsets. * Forward stepwise is a greedy algorithm: at each step it adds the single variable that improves the fit the most, so it produces a nested sequence of models. * Each new model includes all the variables that were in the previous one, plus one new variable.

regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19, method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
##           CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"
plot(regfit.fwd,scale="Cp")

Model selection using a Validation Set

  • Make a training set and a validation set so we can use the validation error to choose a good subset model
  • NOTE: Slightly different from the book
  • train=sample(seq(263),180,replace=FALSE) In English: we create the sequence of numbers 1 to 263 (one per row of Hitters) and take a random sample of 180 of them, without replacement, to index the training observations
dim(Hitters)
## [1] 263  20
set.seed(1)
train=sample(seq(263),180,replace=FALSE)
train
##   [1] 167 129 187  85  79 213  37 105 217 110 229 165  34 106 126  89 172 207
##  [19]  33  84 163  70  74  42 166 111 148 156  20  44 121  87 242 233  40 247
##  [37]  25 119 198 122  39 179 240 134  24 160  14 130  45 146  22 206 193 115
##  [55] 104 231 208 209 103  75  13 253 176 248  23 254 244 205  29 141 150 236
##  [73] 108  48 245 215 149  31 102 145  73 232  83 118  90 190 107  64 196  60
##  [91]  51 251 138 262  43  26 143 195 152 178 223 219 202 181 222 169   1 239
## [109]  78 211 246  28 116 257  61 113  86  71 225  99 173 234  49 256 174 194
## [127]  50 135 238 235 230 263  53 100 164  65 142 175 140 124 224  77 159  98
## [145]  66  19  17 228 204 186  35 144  46 180 109 210  16 161   9 137  92 162
## [163]  10 259  32 243  95 154  93  12 255 177  15   2 128  67 183 117 197   5
regfit.fwd=regsubsets(Salary~.,data=Hitters[train,],nvmax=19, method="forward")

Now we will work out the validation errors on the remainder of the data!

  • We will make predictions on the observations not used for training. (We know there are 19 models, so we set up a vector of length 19 to record the errors.)

  • To get the right elements of x.test, we have to index its columns by the names that are on the coefficient vector.

val.errors=rep(NA,19)
x.test = model.matrix(Salary~.,data=Hitters[-train,])
for(i in 1:19){
  # use coef function to extract the coefficients
  coefi=coef(regfit.fwd,id=i)
  
  # making our own predictions by hand:
  # take the subset of columns of x.test that correspond to the variables in the current coefficient vector,
  # then matrix-multiply by the coefficient vector
  pred=x.test[,names(coefi)]%*%coefi
  
  # And now that we have our predictions, we can calculate the mean squared error
  val.errors[i]=mean((Hitters$Salary[-train]-pred)^2)
}

plot(sqrt(val.errors),ylab="Root MSE",ylim=c(300,400), pch=19,type="b")
points(sqrt(regfit.fwd$rss[-1]/180),col="blue", pch=19, type="b")

# pch=19 simply means round, full dot (on the plot)
legend("topright", legend=c("Training", "Validation"), col=c("blue","black"), pch=19)

Because it’s tedious not having a predict method for regsubsets, we’ll write one!!

  • A regsubsets object has a component called call, and part of that call is the formula, which we can extract through the little “incantation” object$call[[2]]. Using the formula, we make a model matrix from the new data, much like we did before. Then we extract the coefficients and do the matrix multiplication.

  • Now we have a function for reuse in our next section!

predict.regsubsets=function(object, newdata,id,...){
  # extract the formula from the call stored in the regsubsets object
  form=as.formula(object$call[[2]])
  # build the model matrix from the new data, just like we did before
  mat=model.matrix(form,newdata)
  # coefficients of the best model of the requested size
  coefi=coef(object,id=id)
  # keep only the matching columns and matrix-multiply by the coefficients
  mat[,names(coefi)]%*%coefi
}
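
A quick sanity check (illustrative, not in the original notes): regfit.fwd is a regsubsets object, so predict() now dispatches to our new method, and the result should match the validation error we computed by hand above.

pred10=predict(regfit.fwd,Hitters[-train,],id=10)
mean((Hitters$Salary[-train]-pred10)^2)  # should equal val.errors[10]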

Model Selection by Cross-Validation

We will do 10-fold cross-validation.

set.seed(11)
# Assign each row of Hitters a fold number from 1 to 10:
# rep(1:10, length=nrow(Hitters)) makes a vector of fold labels as long as the data,
# and sample() shuffles it randomly
folds=sample(rep(1:10,length=nrow(Hitters)))
folds
##   [1] 10  4  4  4  3  7 10  8 10  3  4  2  2  1  1  5  3  3  5  9  5  2  7  3  8
##  [26]  7  4  6  7  7  7 10  2  6  6  1  8  8 10  1  8  3  7 10  1  6 10  3  8  7
##  [51]  9  3  6  2  5  6  5  5  2  6  4  7 10  7  1  8  1  2  8  2  1  8  2  6  4
##  [76]  3  1  1  5  1  2 10  5  8  8  3  9  8  3  6  1  9  5  9  9  8  8  6  8 10
## [101]  9  8  7  8 10  1  3  1 10  5  2  5  2  4  4 10  3  6  9  7  4  3  9  8 10
## [126]  7  5  9 10  4  7  4  1  9  4  5  6  8 10  6  7  1 10  6  4  6  7  1  9  4
## [151]  2  1  7  2  5  7 10  7  9  6  9  5  5  4 10  4  2 10  9  3  3  2  9  2  6
## [176]  9  4  3  9  9  9  4  6  1  7  8  5  8 10  6  9  8  9  1  5  3  3  1  7  1
## [201]  2  9  1  9 10  3 10  3  4  6  8  2  9  5  7  2 10  4  1  6 10  3  5  5  3
## [226]  5  7  3  6  5  4  1  3  7  1  5  2  9  4  6  8  3  2  4  5 10  8  5  2  2
## [251]  2  6  3  4  7  2  6  4  8  7  1  2  6
table(folds)
## folds
##  1  2  3  4  5  6  7  8  9 10 
## 27 27 27 26 26 26 26 26 26 26
cv.errors=matrix(NA,10,19)
for(k in 1:10){
  best.fit=regsubsets(Salary~.,data=Hitters[folds!=k,],nvmax=19,method="forward")
  for(i in 1:19){
    # predict() here dispatches to our predict.regsubsets method from above
    pred=predict(best.fit,Hitters[folds==k,],id=i)
    cv.errors[k,i]=mean( (Hitters$Salary[folds==k]-pred)^2)
    
  }
}
# Now we process our output matrix
rmse.cv=sqrt(apply(cv.errors,2,mean))
plot(rmse.cv,pch=19,type="b")

In the video, cross-validation seemed to prefer models of size 11 or 12 – per our current graph, it seems to prefer a model of size 9.
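
A possible next step (not in the original notes): pick the size with the smallest cross-validated RMSE, refit forward stepwise on all of the data, and look at that model's coefficients.

best.size=which.min(rmse.cv)
best.size
coef(regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward"), id=best.size)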

Ridge Regression and the Lasso

We’ll use the package glmnet, which does not use the model formula language, so we have to set up an x (the predictor matrix) and a y (the response) ourselves.

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
# Predictor matrix
x=model.matrix(Salary~.-1,data=Hitters)
# Response
y=Hitters$Salary
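
It's worth a quick look at what model.matrix produced (an illustrative check, not in the original notes): the factors League, Division and NewLeague are expanded into dummy columns, and the -1 removes the intercept column.

dim(x)
colnames(x)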

First, we will fit a ridge-regression model. We do this by calling glmnet with alpha=0 (see the help file). There is also a cv.glmnet function which will do the cross-validation for us.

RIDGE REGRESSION puts a penalty on the sum of squares of the coefficients, controlled by the parameter lambda. The criterion for ridge regression is the residual sum of squares (the usual criterion for linear regression) plus lambda times the sum of the squared coefficients: minimize RSS + lambda * sum(beta_j^2).

RIDGE REGRESSION ELI5

From here

Let’s say you’re trying to predict future satisfaction using a number of inputs (age, income, gender etc.). Ridge regression penalizes large weights assigned to each input so that one input does not dominate all others in the prediction.

For example, a linear regression may find that income is the most important variable by far in determining future satisfaction (i.e. is assigned the largest weight in the linear regression). But maybe you have a small sample, or you think your sample is not representative, so you don’t want income to be essentially the only input used to predict future satisfaction. The other inputs may have an effect, but their weights in a linear regression are so small relative to that of income that income is essentially the only input that matters in your prediction. You want to take into account all your variables in a more “even” way. Ridge regression will force all coefficients to be in a similar range so that you get a wider range of predictions that are not dominated by income only.

fit.ridge=glmnet(x,y,alpha=0)
plot(fit.ridge, xvar="lambda", label=TRUE)

NOTE Unlike best subset and forward stepwise regression, which control the complexity of a model by restricting the number of variables, ridge regression keeps all the variables in and shrinks the coefficients towards zero.

cv.ridge=cv.glmnet(x,y,alpha=0)
plot(cv.ridge)
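
A possible follow-up (illustrative, not in the original notes): the two vertical lines in the cv.glmnet plot mark lambda.min and lambda.1se, and we can pull out either value and the coefficients at it.

cv.ridge$lambda.min
cv.ridge$lambda.1se
# ridge keeps all the variables; the coefficients are just shrunk
coef(cv.ridge, s="lambda.min")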

Lasso

The lasso is similar to ridge; the only difference is the penalty. For the lasso we still minimize the RSS plus a lambda penalty, but with a subtle change: instead of the sum of squares of the coefficients, we penalize the sum of the absolute values of the coefficients, i.e. minimize RSS + lambda * sum(|beta_j|).

NOTE Lasso is doing both shrinkage and variable selection

# Don't need to set alpha here because it defaults to alpha=1, which is the lasso
fit.lasso=glmnet(x,y)
plot(fit.lasso,xvar="lambda", label=TRUE)

We can also plot “dev” instead of “lambda” – this is the percentage of deviance explained or, equivalently for regression, the percentage of the sum of squares explained (i.e. the R-squared).

This plot changes the orientation of the path.

plot(fit.lasso,xvar="dev", label=TRUE)

The way the coefficients grow rapidly at the right-hand end of this plot, while little additional deviance is explained, is an indication that maybe at the end of the path there is overfitting.

TL;DR: There are different ways of plotting the coefficients and they give you different information about the coefficients and about the nature of the path.

cv.lasso=cv.glmnet(x,y)
plot(cv.lasso)

THIS PICKS THE COEFFICIENTS FOR US (by default, coef on a cv.glmnet object uses the lambda.1se value of lambda, which is why the model below is quite sparse)

coef(cv.lasso)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) 144.37970485
## AtBat         .         
## Hits          1.36380384
## HmRun         .         
## Runs          .         
## RBI           .         
## Walks         1.49731098
## Years         .         
## CAtBat        .         
## CHits         .         
## CHmRun        .         
## CRuns         0.15275165
## CRBI          0.32833941
## CWalks        .         
## LeagueA       .         
## LeagueN       .         
## DivisionW     .         
## PutOuts       0.06625755
## Assists       .         
## Errors        .         
## NewLeagueN    .
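
For comparison (illustrative, not in the original notes), the coefficients at the error-minimizing lambda typically keep more variables than the lambda.1se default used above:

coef(cv.lasso, s="lambda.min")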

Suppose we want to use our earlier training/validation split to select the lambda for the lasso?

lasso.tr=glmnet(x[train,], y[train])
lasso.tr
## 
## Call:  glmnet(x = x[train, ], y = y[train]) 
## 
##    Df  %Dev  Lambda
## 1   0  0.00 262.100
## 2   1  5.92 238.800
## 3   1 10.83 217.600
## 4   1 14.91 198.300
## 5   2 19.72 180.600
## 6   3 23.94 164.600
## 7   3 27.45 150.000
## 8   3 30.37 136.700
## 9   3 32.79 124.500
## 10  3 34.80 113.500
## 11  4 36.50 103.400
## 12  5 38.77  94.190
## 13  6 40.90  85.820
## 14  6 42.73  78.200
## 15  6 44.25  71.250
## 16  6 45.51  64.920
## 17  6 46.55  59.150
## 18  6 47.42  53.900
## 19  6 48.14  49.110
## 20  6 48.74  44.750
## 21  6 49.24  40.770
## 22  6 49.65  37.150
## 23  6 49.99  33.850
## 24  7 50.28  30.840
## 25  7 50.51  28.100
## 26  8 50.71  25.610
## 27  8 50.94  23.330
## 28  8 51.12  21.260
## 29  8 51.28  19.370
## 30  8 51.41  17.650
## 31  8 51.52  16.080
## 32  8 51.60  14.650
## 33  8 51.68  13.350
## 34  9 51.75  12.170
## 35  9 51.99  11.080
## 36 10 52.23  10.100
## 37 10 52.44   9.202
## 38 11 52.64   8.385
## 39 11 52.82   7.640
## 40 11 52.97   6.961
## 41 11 53.09   6.343
## 42 11 53.19   5.779
## 43 12 53.28   5.266
## 44 14 53.53   4.798
## 45 14 53.83   4.372
## 46 15 54.06   3.984
## 47 16 54.45   3.630
## 48 16 54.79   3.307
## 49 16 55.06   3.013
## 50 16 55.29   2.746
## 51 17 55.48   2.502
## 52 17 55.65   2.280
## 53 17 55.78   2.077
## 54 17 55.89   1.892
## 55 18 56.00   1.724
## 56 18 56.16   1.571
## 57 18 56.30   1.432
## 58 19 56.42   1.304
## 59 19 56.53   1.189
## 60 19 56.63   1.083
## 61 19 56.71   0.987
## 62 19 56.77   0.899
## 63 19 56.83   0.819
## 64 19 56.88   0.746
## 65 19 56.92   0.680
## 66 19 56.95   0.620
## 67 19 56.98   0.565
## 68 19 57.00   0.514
## 69 19 57.02   0.469
## 70 19 57.04   0.427
## 71 19 57.05   0.389
## 72 19 57.06   0.355
## 73 19 57.07   0.323
## 74 19 57.08   0.294
## 75 19 57.08   0.268
## 76 19 57.09   0.244
## 77 19 57.09   0.223
## 78 19 57.10   0.203
## 79 19 57.10   0.185
## 80 19 57.11   0.168
## 81 19 57.11   0.154
## 82 19 57.11   0.140
## 83 19 57.11   0.127
pred=predict(lasso.tr,x[-train,])
# There are 83 observations in the validation set.
# The fitted path has 83 values of lambda, so there are 83 columns
# in this prediction matrix (one column of predictions per lambda)
dim(pred)
## [1] 83 83
# y[-train] is recycled column-wise across all 83 columns of pred, giving the errors.
# We square those errors, use apply to compute the mean of each column,
# and then take the square root to get the RMSE for each lambda

rmse=sqrt(apply((y[-train]-pred)^2,2,mean))
plot(log(lasso.tr$lambda),rmse,type="b",xlab="Log(lambda)")

lam.best=lasso.tr$lambda[order(rmse)[1]]
lam.best
## [1] 1.571184
coef(lasso.tr,s=lam.best)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  284.22742427
## AtBat         -1.14830040
## Hits           4.92901730
## HmRun          .         
## Runs          -3.51936866
## RBI            0.38309009
## Walks          6.01828596
## Years        -20.74174043
## CAtBat        -0.01903175
## CHits          0.08077424
## CHmRun         0.53493799
## CRuns          0.77272746
## CRBI           0.49203970
## CWalks        -0.47458673
## LeagueA      -91.21313136
## LeagueN        .         
## DivisionW   -161.10222986
## PutOuts        0.28639231
## Assists        0.29245560
## Errors        -4.69237401
## NewLeagueN   -56.95409378
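
As a rough cross-check (illustrative, not in the original notes; cv.lasso was fit on all the data while lam.best came from the train/validation split, so they are not strictly comparable), we can put the chosen lambdas side by side on the log scale.

log(lam.best)
log(cv.lasso$lambda.min)
log(cv.lasso$lambda.1se)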

IN SUMMARY:

In the last two sessions we’ve looked at:

  • Model Selection

      • Best subset

      • Forward stepwise

      • Ridge regression

      • Lasso

  • Validation using

      • Cp

      • a validation set

      • k-fold cross-validation

NEXT – * Principal Component Regression * Partial Least Squares

GLOSSARY:

RSS (Residual Sum of Squares) The residual sum of squares is the sum of the squared residuals; it measures the variation in the response that is not explained by the regression model. Pg. 62 in ISL
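
For concreteness (an illustrative calculation, not from the notes), the RSS of the full linear model on Hitters can be computed directly from the residuals:

fit.lm=lm(Salary~.,data=Hitters)
sum(residuals(fit.lm)^2)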

Ridge Regression penalizes potential over-representation – for example, if the sample size is small or perhaps not representative of the population as a whole, ridge regression shrinks the large weights that would otherwise be driven by that over-representation.