Hands-On Machine Learning

CH2: End-to-End Machine Learning Project

1. Look at the Big Picture

  • Frame the problem
  • Select a performance measure
  • Check assumptions

2. Get the Data

In [65]:
import os
import tarfile
import urllib
In [66]:
DOWNLOAD_ROOT =  "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
In [67]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
In [68]:
fetch_housing_data()
In [69]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
In [70]:
# housing = load_housing_data()
housing = pd.read_csv('../IST_718_Big_Data/WK2/coaches9_clean.csv')

-- 2: Take a Quick Look at the Data Structure

In [71]:
housing.head()
Out[71]:
Unnamed: 0 School Conference Coach TotalPay Bonus BonusPaid Buyout
0 0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 0.0 0.0
1 1 Akron MAC Terry Bowden 412500.0 225000.0 50000.0 688500.0
2 2 Alabama SEC Nick Saban 8307000.0 1100000.0 500000.0 33600000.0
3 3 Alabama at Birmingham C-USA Bill Clark 900000.0 950000.0 165471.0 3847500.0
4 4 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 145000.0 2160417.0
In [72]:
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 8 columns):
Unnamed: 0    129 non-null int64
School        129 non-null object
Conference    129 non-null object
Coach         129 non-null object
TotalPay      129 non-null float64
Bonus         129 non-null float64
BonusPaid     129 non-null float64
Buyout        129 non-null float64
dtypes: float64(4), int64(1), object(3)
memory usage: 8.2+ KB
In [73]:
housing['Conference'].value_counts()
Out[73]:
ACC         14
Big Ten     14
SEC         14
C-USA       14
Pac-12      12
Mt. West    12
MAC         12
AAC         11
Sun Belt    10
Big 12      10
Ind.         6
Name: Conference, dtype: int64
In [74]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

-- 2. Create a Test Set

In [75]:
import numpy as np
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices] 
In [76]:
train_set, test_set = split_train_test(housing, 0.2)
In [77]:
len(train_set)
Out[77]:
104
In [78]:
len(test_set)
Out[78]:
25
In [79]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
In [80]:
# # Insure that there is a proportional number of each cagetory in the test set

# housing["bonus_cat"] = pd.cut(housing["Bonus"], 
#                                bins=[0.,1.5,3.0,4.5,6., np.inf],
#                                labels=[1,2,3,4,5])
# housing["bonus_cat"].hist()
In [81]:
# from sklearn.model_selection import StratifiedShuffleSplit
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(housing, housing["bonus_cat"]):
#     strat_train_set = housing.loc[train_index]
#     strat_test_set = housing.loc[test_index]
In [82]:
# strat_test_set["income_cat"].value_counts() / len(strat_test_set)
In [83]:
# Remove the `income cat` attribute so data is back to original
# for set_ in (strat_train_set, strat_test_set):
#     set_.drop("income_cat", axis=1, inplace=True)

3. Discover and Visualize the Data to Gain Insights

-- 3. Visualizing Geographical Data

In [84]:
# housing.plot(kind="scatter", x="longitude", y="latitude")
In [85]:
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
In [86]:
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, 
#             s=housing["population"]/100, label="population", figsize=(10,7), 
#             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
# plt.legend()

-- 3. Looking For Correlations

NOTES:

  • A clustering algo could be useful for detecting the main cluster and for adding new features that measure the proximity to the cluster events
  • Correlation Coefficient ranges from -1 to 1
    • When it is close to 1, strong positive linear relationshiop (EX: Median house value increases as median income increases)
    • When it is close to -1, strong negative linear relationship
  • Correlation Coefficient only measures linear relationships
In [87]:
corr_matrix = housing.corr()
In [88]:
corr_matrix
Out[88]:
Unnamed: 0 TotalPay Bonus BonusPaid Buyout
Unnamed: 0 1.000000 0.109425 -0.134731 -0.108440 0.022433
TotalPay 0.109425 1.000000 0.390325 0.393773 0.782862
Bonus -0.134731 0.390325 1.000000 0.413335 0.447226
BonusPaid -0.108440 0.393773 0.413335 1.000000 0.423513
Buyout 0.022433 0.782862 0.447226 0.423513 1.000000
In [89]:
corr_matrix["TotalPay"].sort_values(ascending=False)
Out[89]:
TotalPay      1.000000
Buyout        0.782862
BonusPaid     0.393773
Bonus         0.390325
Unnamed: 0    0.109425
Name: TotalPay, dtype: float64
In [90]:
from pandas.plotting import scatter_matrix
attributes = ["TotalPay", "Bonus", "BonusPaid", "Buyout"]
scatter_matrix(housing[attributes], figsize=(12,8))
Out[90]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1ef85250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f089f10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f084790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f2a2d50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1edf3590>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1ed76d90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1edc05d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1ee53dd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1ee2f2d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f2bc310>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f329650>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f35ce50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1f39e690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f3d1e90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f4136d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f446ed0>]],
      dtype=object)

NOTE: This plots every numerical atribute against every ther numerical attribute, plus a nistogram of each numerical attribute

In [91]:
# housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

-- 3. Experimenting with Attribute Combinations

In [92]:
# housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
# housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
# housing["population_per_household"] = housing["population"]/housing["households"]
In [93]:
# corr_matrix = housing.corr()
In [94]:
# corr_matrix["median_house_value"].sort_values(ascending=False)

4. Prepare the Data for Machine Learning Algorithms

NOTES:

  • write functions for this
In [95]:
# housing = strat_train_set.drop("TotalPay", axis=1)
# housing_labels = strat_train_set["TotalPay"].copy()

housing = train_set.drop("TotalPay", axis=1)
housing_labels = train_set["TotalPay"].copy()
In [96]:
housing.head()
Out[96]:
Unnamed: 0 School Conference Coach Bonus BonusPaid Buyout
70 70 Nebraska Big Ten Scott Frost 950000.0 0.0 25416667.0
78 78 Northern Illinois MAC Rod Carey 205000.0 42500.0 1822918.0
47 47 Kansas State Big 12 Bill Snyder 580000.0 50000.0 3000000.0
0 0 Air Force Mt. West Troy Calhoun 247000.0 0.0 0.0
12 12 Baylor Big 12 Matt Rhule 0.0 0.0 0.0
In [97]:
housing_labels
Out[97]:
70     5000000.0
78      633460.0
47     3500000.0
0       885000.0
12           0.0
         ...    
106    4840717.0
14     2514859.0
92     2200000.0
51      700000.0
102    2401206.0
Name: TotalPay, Length: 103, dtype: float64

-- 4. Data Cleaning

What do we do with missing features? Either:

  1. Get rid of the districts
  2. Get rid of the whole attribute (bedrooms)
  3. Replace the value with some value (zero, mean, median)
In [99]:
# housing.dropna(subset=["total_bedrooms"]) # option 1
# housing.drop("total_bedrooms", axis=1) # option 2
# median = housing["total_bedrooms"].median() # option 3
# housing["total_bedrooms"].fillna(median, inplace=True)
In [102]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# NOTE: median can only be computed with numerical attributes so we have to drop "ocean proximity"
housing_num = housing.drop(["School","Conference", "Coach"], axis=1)
In [103]:
imputer.fit(housing_num)
Out[103]:
SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)
In [104]:
imputer.statistics_
Out[104]:
array([6.500000e+01, 5.800000e+05, 1.000000e+04, 2.671875e+06])
In [105]:
housing_num.median().values
Out[105]:
array([6.500000e+01, 5.800000e+05, 1.000000e+04, 2.671875e+06])
In [106]:
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

-- 4. Handling Text and Categorical Attributes

In [107]:
housing_cat = housing[["Conference"]]
housing_cat.head()
Out[107]:
Conference
70 Big Ten
78 MAC
47 Big 12
0 Mt. West
12 Big 12
In [108]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
Out[108]:
array([[ 3.],
       [ 6.],
       [ 2.],
       [ 7.],
       [ 2.],
       [ 3.],
       [ 3.],
       [10.],
       [ 9.],
       [10.]])
In [109]:
ordinal_encoder.categories_
Out[109]:
[array(['AAC', 'ACC', 'Big 12', 'Big Ten', 'C-USA', 'Ind.', 'MAC',
        'Mt. West', 'Pac-12', 'SEC', 'Sun Belt'], dtype=object)]
In [110]:
# PROBLEM: We don't want these to actaully have any relationship to one another 
# Like good, better, best
# SOLUTION: One-hot encoding
In [111]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
Out[111]:
<103x11 sparse matrix of type '<class 'numpy.float64'>'
	with 103 stored elements in Compressed Sparse Row format>
In [112]:
cat_encoder.categories_
Out[112]:
[array(['AAC', 'ACC', 'Big 12', 'Big Ten', 'C-USA', 'Ind.', 'MAC',
        'Mt. West', 'Pac-12', 'SEC', 'Sun Belt'], dtype=object)]

-- 4. Custom Transformers

In [113]:
# from sklearn.base import BaseEstimator, TransformerMixin
# rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6
In [115]:
# class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
#     def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs ???
#         self.add_bedrooms_per_room = add_bedrooms_per_room
#     def fit(self, X, y=None):
#         return self # nothing else to do ??
#     def transform(self, X, y=None):
#         rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
#         population_per_household = X[:, population_ix] / X[:, households_ix]
#         if self.add_bedrooms_per_room:
#             bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
#             return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
#         else:
#             return np.c_[X, rooms_per_household, population_per_household]
        
# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)
        

-- 5. Feature Scaling

In [116]:
# MinMaxScaler 
# Standard Scaler

-- 6. Transformation Pipelines

In [122]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
#     ('attribs_addr', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
In [123]:
housing.head()
Out[123]:
Unnamed: 0 School Conference Coach Bonus BonusPaid Buyout
70 70 Nebraska Big Ten Scott Frost 950000.0 0.0 25416667.0
78 78 Northern Illinois MAC Rod Carey 205000.0 42500.0 1822918.0
47 47 Kansas State Big 12 Bill Snyder 580000.0 50000.0 3000000.0
0 0 Air Force Mt. West Troy Calhoun 247000.0 0.0 0.0
12 12 Baylor Big 12 Matt Rhule 0.0 0.0 0.0
In [125]:
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ['School','Conference','Coach']

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
# housing_prepared.head()
In [126]:
# train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

LINEAR REGRESSION

In [127]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
Out[127]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [128]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
In [130]:
some_data_prepared = full_pipeline.transform(some_data)
print("PREDICTIONS:", lin_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
PREDICTIONS: [ 4.99999995e+06  6.33459826e+05  3.49999968e+06  8.85000248e+05
 -1.72633900e-01]
LABELS: [5000000.0, 633460.0, 3500000.0, 885000.0, 0.0]
In [131]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
Out[131]:
0.17789554083065956

DECISION TREE

In [132]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
Out[132]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
In [133]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
Out[133]:
0.0
In [134]:
print("PREDICTIONS:", tree_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
PREDICTIONS: [5000000.  633460. 3500000.  885000.       0.]
LABELS: [5000000.0, 633460.0, 3500000.0, 885000.0, 0.0]
In [135]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
In [136]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Stardard Deviation:", scores.std())

display_scores(tree_rmse_scores)
Scores: [1921668.72468537 1091087.99785723 1209420.42511647 1143037.75223529
  668214.0704852   846650.20714573 1227627.38852736 1293675.71808321
 1709706.3777542  1791818.37090546]
Mean: 1290290.7032795516
Stardard Deviation: 384930.80454183096

RANDOM FOREST

In [137]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[137]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
In [138]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
Out[138]:
405088.1029853629
In [139]:
scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)
Scores: [1245041.49316675 1132197.17771841  601294.72814302  772713.97332458
 1049396.54429136  602795.8423112   928462.08597401  965397.66858737
 1101029.86713052 1792083.30766503]
Mean: 1019041.2688312249
Stardard Deviation: 329946.92239829834
In [140]:
print("PREDICTIONS:", forest_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
PREDICTIONS: [5010000.   584250.  2844071.7  970596.5  746668.2]
LABELS: [5000000.0, 633460.0, 3500000.0, 885000.0, 0.0]
In [ ]: