Hands-On Machine Learning¶

CH2: End-to-End Machine Learning Project¶

1. Look at the Big Picture¶

Frame the problem
Select a performance measure
Check assumptions

2. Get the Data¶

import os
import tarfile
import urllib

DOWNLOAD_ROOT =  "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()

import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# housing = load_housing_data()
housing = pd.read_csv('../IST_718_Big_Data/WK2/coaches9_clean.csv')

-- 2: Take a Quick Look at the Data Structure¶

housing.head()

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 8 columns):
Unnamed: 0    129 non-null int64
School        129 non-null object
Conference    129 non-null object
Coach         129 non-null object
TotalPay      129 non-null float64
Bonus         129 non-null float64
BonusPaid     129 non-null float64
Buyout        129 non-null float64
dtypes: float64(4), int64(1), object(3)
memory usage: 8.2+ KB

housing['Conference'].value_counts()

ACC         14
Big Ten     14
SEC         14
C-USA       14
Pac-12      12
Mt. West    12
MAC         12
AAC         11
Sun Belt    10
Big 12      10
Ind.         6
Name: Conference, dtype: int64

%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

-- 2. Create a Test Set¶

import numpy as np
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2)

len(train_set)

104

len(test_set)

25

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# # Insure that there is a proportional number of each cagetory in the test set

# housing["bonus_cat"] = pd.cut(housing["Bonus"], 
#                                bins=[0.,1.5,3.0,4.5,6., np.inf],
#                                labels=[1,2,3,4,5])
# housing["bonus_cat"].hist()

# from sklearn.model_selection import StratifiedShuffleSplit
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(housing, housing["bonus_cat"]):
#     strat_train_set = housing.loc[train_index]
#     strat_test_set = housing.loc[test_index]

# strat_test_set["income_cat"].value_counts() / len(strat_test_set)

# Remove the `income cat` attribute so data is back to original
# for set_ in (strat_train_set, strat_test_set):
#     set_.drop("income_cat", axis=1, inplace=True)

3. Discover and Visualize the Data to Gain Insights¶

-- 3. Visualizing Geographical Data¶

# housing.plot(kind="scatter", x="longitude", y="latitude")

# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, 
#             s=housing["population"]/100, label="population", figsize=(10,7), 
#             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
# plt.legend()

-- 3. Looking For Correlations¶

NOTES:

A clustering algo could be useful for detecting the main cluster and for adding new features that measure the proximity to the cluster events
Correlation Coefficient ranges from -1 to 1
- When it is close to 1, strong positive linear relationshiop (EX: Median house value increases as median income increases)
- When it is close to -1, strong negative linear relationship
Correlation Coefficient only measures linear relationships

corr_matrix = housing.corr()

corr_matrix

corr_matrix["TotalPay"].sort_values(ascending=False)

TotalPay      1.000000
Buyout        0.782862
BonusPaid     0.393773
Bonus         0.390325
Unnamed: 0    0.109425
Name: TotalPay, dtype: float64

from pandas.plotting import scatter_matrix
attributes = ["TotalPay", "Bonus", "BonusPaid", "Buyout"]
scatter_matrix(housing[attributes], figsize=(12,8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1ef85250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f089f10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f084790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f2a2d50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1edf3590>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1ed76d90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1edc05d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1ee53dd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1ee2f2d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f2bc310>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f329650>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f35ce50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1f39e690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f3d1e90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f4136d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f446ed0>]],
      dtype=object)

NOTE: This plots every numerical atribute against every ther numerical attribute, plus a nistogram of each numerical attribute

# housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

-- 3. Experimenting with Attribute Combinations¶

# housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
# housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
# housing["population_per_household"] = housing["population"]/housing["households"]

# corr_matrix = housing.corr()

# corr_matrix["median_house_value"].sort_values(ascending=False)

4. Prepare the Data for Machine Learning Algorithms¶

NOTES:

write functions for this

# housing = strat_train_set.drop("TotalPay", axis=1)
# housing_labels = strat_train_set["TotalPay"].copy()

housing = train_set.drop("TotalPay", axis=1)
housing_labels = train_set["TotalPay"].copy()

housing.head()

housing_labels

70     5000000.0
78      633460.0
47     3500000.0
0       885000.0
12           0.0
         ...    
106    4840717.0
14     2514859.0
92     2200000.0
51      700000.0
102    2401206.0
Name: TotalPay, Length: 103, dtype: float64

-- 4. Data Cleaning¶

What do we do with missing features? Either:

Get rid of the districts
Get rid of the whole attribute (bedrooms)
Replace the value with some value (zero, mean, median)

# housing.dropna(subset=["total_bedrooms"]) # option 1
# housing.drop("total_bedrooms", axis=1) # option 2
# median = housing["total_bedrooms"].median() # option 3
# housing["total_bedrooms"].fillna(median, inplace=True)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# NOTE: median can only be computed with numerical attributes so we have to drop "ocean proximity"
housing_num = housing.drop(["School","Conference", "Coach"], axis=1)

imputer.fit(housing_num)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

imputer.statistics_

array([6.500000e+01, 5.800000e+05, 1.000000e+04, 2.671875e+06])

housing_num.median().values

array([6.500000e+01, 5.800000e+05, 1.000000e+04, 2.671875e+06])

X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

-- 4. Handling Text and Categorical Attributes¶

housing_cat = housing[["Conference"]]
housing_cat.head()

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[ 3.],
       [ 6.],
       [ 2.],
       [ 7.],
       [ 2.],
       [ 3.],
       [ 3.],
       [10.],
       [ 9.],
       [10.]])

ordinal_encoder.categories_

[array(['AAC', 'ACC', 'Big 12', 'Big Ten', 'C-USA', 'Ind.', 'MAC',
        'Mt. West', 'Pac-12', 'SEC', 'Sun Belt'], dtype=object)]

# PROBLEM: We don't want these to actaully have any relationship to one another 
# Like good, better, best
# SOLUTION: One-hot encoding

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

<103x11 sparse matrix of type '<class 'numpy.float64'>'
	with 103 stored elements in Compressed Sparse Row format>

cat_encoder.categories_

[array(['AAC', 'ACC', 'Big 12', 'Big Ten', 'C-USA', 'Ind.', 'MAC',
        'Mt. West', 'Pac-12', 'SEC', 'Sun Belt'], dtype=object)]

-- 4. Custom Transformers¶

# from sklearn.base import BaseEstimator, TransformerMixin
# rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

# class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
#     def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs ???
#         self.add_bedrooms_per_room = add_bedrooms_per_room
#     def fit(self, X, y=None):
#         return self # nothing else to do ??
#     def transform(self, X, y=None):
#         rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
#         population_per_household = X[:, population_ix] / X[:, households_ix]
#         if self.add_bedrooms_per_room:
#             bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
#             return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
#         else:
#             return np.c_[X, rooms_per_household, population_per_household]
        
# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)

-- 5. Feature Scaling¶

# MinMaxScaler 
# Standard Scaler

-- 6. Transformation Pipelines¶

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
#     ('attribs_addr', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing.head()

from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ['School','Conference','Coach']

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
# housing_prepared.head()

# train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

LINEAR REGRESSION¶

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

some_data_prepared = full_pipeline.transform(some_data)
print("PREDICTIONS:", lin_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))

PREDICTIONS: [ 4.99999995e+06  6.33459826e+05  3.49999968e+06  8.85000248e+05
 -1.72633900e-01]
LABELS: [5000000.0, 633460.0, 3500000.0, 885000.0, 0.0]

from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.17789554083065956

DECISION TREE¶

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

print("PREDICTIONS:", tree_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))

PREDICTIONS: [5000000.  633460. 3500000.  885000.       0.]
LABELS: [5000000.0, 633460.0, 3500000.0, 885000.0, 0.0]

from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Stardard Deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [1921668.72468537 1091087.99785723 1209420.42511647 1143037.75223529
  668214.0704852   846650.20714573 1227627.38852736 1293675.71808321
 1709706.3777542  1791818.37090546]
Mean: 1290290.7032795516
Stardard Deviation: 384930.80454183096

RANDOM FOREST¶

from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

405088.1029853629

scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)

Scores: [1245041.49316675 1132197.17771841  601294.72814302  772713.97332458
 1049396.54429136  602795.8423112   928462.08597401  965397.66858737
 1101029.86713052 1792083.30766503]
Mean: 1019041.2688312249
Stardard Deviation: 329946.92239829834

print("PREDICTIONS:", forest_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))

PREDICTIONS: [5010000.   584250.  2844071.7  970596.5  746668.2]
LABELS: [5000000.0, 633460.0, 3500000.0, 885000.0, 0.0]

	Unnamed: 0	School	Conference	Coach	TotalPay	Bonus	BonusPaid	Buyout
0	0	Air Force	Mt. West	Troy Calhoun	885000.0	247000.0	0.0	0.0
1	1	Akron	MAC	Terry Bowden	412500.0	225000.0	50000.0	688500.0
2	2	Alabama	SEC	Nick Saban	8307000.0	1100000.0	500000.0	33600000.0
3	3	Alabama at Birmingham	C-USA	Bill Clark	900000.0	950000.0	165471.0	3847500.0
4	4	Appalachian State	Sun Belt	Scott Satterfield	712500.0	295000.0	145000.0	2160417.0

	Unnamed: 0	TotalPay	Bonus	BonusPaid	Buyout
Unnamed: 0	1.000000	0.109425	-0.134731	-0.108440	0.022433
TotalPay	0.109425	1.000000	0.390325	0.393773	0.782862
Bonus	-0.134731	0.390325	1.000000	0.413335	0.447226
BonusPaid	-0.108440	0.393773	0.413335	1.000000	0.423513
Buyout	0.022433	0.782862	0.447226	0.423513	1.000000

	Unnamed: 0	School	Conference	Coach	Bonus	BonusPaid	Buyout
70	70	Nebraska	Big Ten	Scott Frost	950000.0	0.0	25416667.0
78	78	Northern Illinois	MAC	Rod Carey	205000.0	42500.0	1822918.0
47	47	Kansas State	Big 12	Bill Snyder	580000.0	50000.0	3000000.0
0	0	Air Force	Mt. West	Troy Calhoun	247000.0	0.0	0.0
12	12	Baylor	Big 12	Matt Rhule	0.0	0.0	0.0