Hands-On Machine Learning¶

CH2: End-to-End Machine Learning Project¶

1. Look at the Big Picture¶

Frame the problem
Select a performance measure
Check assumptions

2. Get the Data¶

import os
import tarfile
import urllib
import urllib.request

# Base URL of the raw files in the handson-ml2 GitHub repository.
DOWNLOAD_ROOT =  "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Relative local directory used as the default location of the housing data.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing archive (DOWNLOAD_ROOT + relative path).
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the downloaded archive and its
            extracted contents; it is created if it does not exist yet.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # Requires the `urllib.request` submodule to be imported explicitly;
    # a bare `import urllib` does not guarantee it is loaded.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()

import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

housing = load_housing_data()

-- 2: Take a Quick Look at the Data Structure¶

housing.head()

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

housing['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Plot a 50-bin histogram of the DataFrame's columns on a single 20x15 figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()

-- 2. Create a Test Set¶

import numpy as np
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training set and a test set.

    The rows are shuffled with ``np.random.permutation`` (no seed is set
    here, so the split differs between runs unless the caller seeds NumPy).

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of rows assigned to the test set; the test set
            size is ``int(len(data) * test_ratio)``.

    Returns:
        A ``(train, test)`` tuple of DataFrames selected by position.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    cut = int(n_rows * test_ratio)
    # The first `cut` shuffled positions form the test set, the rest train.
    return data.iloc[order[cut:]], data.iloc[order[:cut]]

train_set, test_set = split_train_test(housing, 0.2)

len(train_set)

16512

len(test_set)

4128

from sklearn.model_selection import train_test_split
# Rebinds train_set/test_set (previously produced by split_train_test);
# random_state pins the shuffle seed so the split is the same on every run.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Ensure that each income category is proportionally represented in the test set

# Bucket median_income into five categories labelled 1-5; the last bin is
# open-ended (upper edge np.inf) so every income above 6.0 falls in category 5.
housing["income_cat"] = pd.cut(housing["median_income"], 
                               bins=[0.,1.5,3.0,4.5,6., np.inf],
                               labels=[1,2,3,4,5])
# Quick visual check of how many rows fall in each category.
housing["income_cat"].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x1a20c27410>

from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split; random_state makes it reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so rows are selected with .iloc;
# label-based .loc only gives the same rows while the frame still has its
# default 0..n-1 index. The .copy() makes each set an independent frame so
# later in-place edits do not act on a slice of `housing`.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index].copy()
    strat_test_set = housing.iloc[test_index].copy()

strat_test_set["income_cat"].value_counts() / len(strat_test_set)

3    0.350533
2    0.318798
4    0.176357
5    0.114583
1    0.039729
Name: income_cat, dtype: float64

# Remove the `income_cat` attribute so the data is back to its original state.
# Only the two stratified sets are modified here, in place.
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

3. Discover and Visualize the Data to Gain Insights¶

-- 3. Visualizing Geographical Data¶

housing.plot(kind="scatter", x="longitude", y="latitude")

<matplotlib.axes._subplots.AxesSubplot at 0x1a21318150>

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x1a21475950>

# Marker size (s) is each row's population divided by 100; marker color (c)
# encodes median_house_value through the "jet" colormap, with a colorbar.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, 
            s=housing["population"]/100, label="population", figsize=(10,7), 
            c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()

<matplotlib.legend.Legend at 0x1a21475b50>

-- 3. Looking For Correlations¶

NOTES:

A clustering algorithm could be useful for detecting the main clusters and for adding new features that measure the proximity to the cluster centers
Correlation Coefficient ranges from -1 to 1
- When it is close to 1, strong positive linear relationship (EX: Median house value increases as median income increases)
- When it is close to -1, strong negative linear relationship
Correlation Coefficient only measures linear relationships

# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns (e.g. the string column ocean_proximity) silently and
# raises instead. select_dtypes works on both older and newer pandas.
corr_matrix = housing.select_dtypes(include="number").corr()

corr_matrix

corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

from pandas.plotting import scatter_matrix
# Limit the matrix to the target (median_house_value) and the three
# attributes that ranked highest in its correlation listing.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a21a6a450>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a21c76ed0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f4b4710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a21e69f10>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a21b63750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a21b93f50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a20ca5f50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a20cd9390>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a20c72410>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a206ab4d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a20ad6850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a20e7a0d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1f61bfd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a20d247d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a20bca4d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1f768bd0>]],
      dtype=object)

NOTE: This plots every numerical attribute against every other numerical attribute, plus a histogram of each numerical attribute

housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x1a1f636490>

-- 3. Experimenting with Attribute Combinations¶

# Derive ratio features from the raw totals; each is an element-wise
# division of two existing columns, added to `housing` as a new column.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

# Recompute the correlations now that the ratio features exist. Numeric
# columns are selected explicitly because pandas >= 2.0 raises on
# non-numeric columns (e.g. ocean_proximity) instead of dropping them.
corr_matrix = housing.select_dtypes(include="number").corr()

corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.688075
rooms_per_household         0.151948
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
population_per_household   -0.023737
population                 -0.024650
longitude                  -0.045967
latitude                   -0.144160
bedrooms_per_room          -0.255880
Name: median_house_value, dtype: float64

4. Prepare the Data for Machine Learning Algorithms¶

NOTES:

write functions for this

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
0	-122.23	37.88	41.0	880.0	129.0	322.0	126.0	8.3252	452600.0	NEAR BAY
1	-122.22	37.86	21.0	7099.0	1106.0	2401.0	1138.0	8.3014	358500.0	NEAR BAY
2	-122.24	37.85	52.0	1467.0	190.0	496.0	177.0	7.2574	352100.0	NEAR BAY
3	-122.25	37.85	52.0	1274.0	235.0	558.0	219.0	5.6431	341300.0	NEAR BAY
4	-122.25	37.85	52.0	1627.0	280.0	565.0	259.0	3.8462	342200.0	NEAR BAY

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
longitude	1.000000	-0.924664	-0.108197	0.044568	0.069608	0.099773	0.055310	-0.015176	-0.045967
latitude	-0.924664	1.000000	0.011173	-0.036100	-0.066983	-0.108785	-0.071035	-0.079809	-0.144160
housing_median_age	-0.108197	0.011173	1.000000	-0.361262	-0.320451	-0.296244	-0.302916	-0.119034	0.105623
total_rooms	0.044568	-0.036100	-0.361262	1.000000	0.930380	0.857126	0.918484	0.198050	0.134153
total_bedrooms	0.069608	-0.066983	-0.320451	0.930380	1.000000	0.877747	0.979728	-0.007723	0.049686
population	0.099773	-0.108785	-0.296244	0.857126	0.877747	1.000000	0.907222	0.004834	-0.024650
households	0.055310	-0.071035	-0.302916	0.918484	0.979728	0.907222	1.000000	0.013033	0.065843
median_income	-0.015176	-0.079809	-0.119034	0.198050	-0.007723	0.004834	0.013033	1.000000	0.688075
median_house_value	-0.045967	-0.144160	0.105623	0.134153	0.049686	-0.024650	0.065843	0.688075	1.000000