In [1]:
import os
import tarfile
import urllib
In [2]:
# Base URL that HOUSING_URL is built from.
DOWNLOAD_ROOT =  "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory (relative to the working directory) used as the default
# download/extract location and as the default place to read housing.csv from.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing archive.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
In [5]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted files; it is created if it does not already exist.

    Errors raised by the download or by opening/extracting the archive
    are propagated to the caller unchanged.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule has been loaded, so import it explicitly before use.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only point this function at a source you trust.
        housing_tgz.extractall(path=housing_path)
In [6]:
# Download and extract the archive using the default HOUSING_URL / HOUSING_PATH.
fetch_housing_data()
In [9]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
In [10]:
# Load housing.csv from the default HOUSING_PATH into a DataFrame.
housing = load_housing_data()
In [11]:
# Preview the first rows to see the available columns and sample values.
housing.head()
Out[11]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [12]:
# Summarize column dtypes and non-null counts; a count below the total
# number of entries means that column has missing values.
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [13]:
# ocean_proximity is the only non-numeric column; count the rows per category.
housing['ocean_proximity'].value_counts()
Out[13]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
In [15]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw one 50-bin histogram per column that DataFrame.hist can plot,
# on a 20x15 inch figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()
In [19]:
import numpy as np
def split_train_test(data, test_ratio, random_state=None):
    """Randomly partition ``data`` into a training set and a test set.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (this notebook passes a pandas DataFrame).
        test_ratio: Fraction of rows to place in the test set, between 0
            and 1 inclusive. The test-set size is
            ``int(len(data) * test_ratio)``, so it is truncated, not rounded.
        random_state: Optional seed. When given, the shuffle uses a private
            ``np.random.RandomState(random_state)`` so the same seed always
            yields the same split. When ``None`` (the default), the shuffle
            draws from NumPy's global random state, as it always has.

    Returns:
        A ``(train, test)`` tuple of ``data.iloc[...]`` selections that
        together contain every row of ``data`` exactly once.

    Raises:
        ValueError: If ``test_ratio`` is not within ``[0, 1]``.
    """
    # Without this check a negative ratio slices from the end of the shuffled
    # indices and a ratio above 1 leaves the training set empty, both silently.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be between 0 and 1, got {!r}".format(test_ratio)
        )

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.RandomState(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
In [21]:
# Seed NumPy's global generator first: split_train_test shuffles with
# np.random, so an unseeded call gives a different split on every run.
np.random.seed(42)
# Hold out 20% of the rows as the test set.
train_set, test_set = split_train_test(housing, 0.2)
In [22]:
# Number of rows that ended up in the training split.
len(train_set)
Out[22]:
16512
In [23]:
# Number of rows that ended up in the test split.
len(test_set)
Out[23]:
4128
In [ ]:
 
In [24]:
# Scatter every row by its coordinates (longitude on x, latitude on y).
housing.plot(kind="scatter", x="longitude", y="latitude")
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a515190>
In [27]:
# Same plot with mostly transparent markers (alpha=0.1), so areas where
# many points overlap appear darker than sparse ones.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b477fd0>
In [28]:
# Geographic scatter with two extra dimensions encoded:
#   - marker size (s) is the row's population divided by 100
#   - marker color (c) is median_house_value, mapped through the "jet" colormap
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, 
            s=housing["population"]/100, label="population", figsize=(10,7), 
            c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
# Show the legend entry created by label="population".
plt.legend()
Out[28]:
<matplotlib.legend.Legend at 0x1196fd8d0>

Looking For Correlations

NOTES:

  • A clustering algorithm could be useful for detecting the main cluster and for adding new features that measure the proximity to the cluster centers
  • Correlation Coefficient ranges from -1 to 1
    • When it is close to 1, strong positive linear relationship (e.g., median house value tends to increase as median income increases)
    • When it is close to -1, strong negative linear relationship
  • Correlation Coefficient only measures linear relationships
In [31]:
# Correlate the numeric columns only. ocean_proximity holds strings, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping
# them; select_dtypes keeps the result the same across pandas versions.
corr_matrix = housing.select_dtypes(include="number").corr()
In [33]:
# Display the full pairwise correlation table.
corr_matrix
Out[33]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
longitude 1.000000 -0.924664 -0.108197 0.044568 0.069608 0.099773 0.055310 -0.015176 -0.045967
latitude -0.924664 1.000000 0.011173 -0.036100 -0.066983 -0.108785 -0.071035 -0.079809 -0.144160
housing_median_age -0.108197 0.011173 1.000000 -0.361262 -0.320451 -0.296244 -0.302916 -0.119034 0.105623
total_rooms 0.044568 -0.036100 -0.361262 1.000000 0.930380 0.857126 0.918484 0.198050 0.134153
total_bedrooms 0.069608 -0.066983 -0.320451 0.930380 1.000000 0.877747 0.979728 -0.007723 0.049686
population 0.099773 -0.108785 -0.296244 0.857126 0.877747 1.000000 0.907222 0.004834 -0.024650
households 0.055310 -0.071035 -0.302916 0.918484 0.979728 0.907222 1.000000 0.013033 0.065843
median_income -0.015176 -0.079809 -0.119034 0.198050 -0.007723 0.004834 0.013033 1.000000 0.688075
median_house_value -0.045967 -0.144160 0.105623 0.134153 0.049686 -0.024650 0.065843 0.688075 1.000000
In [34]:
# Rank every column by its correlation with median_house_value,
# strongest positive correlation first.
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[34]:
median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64
In [35]:
from pandas.plotting import scatter_matrix
# Restrict the grid to four columns: median_house_value plus the three
# columns that rank just below it in the sorted correlation output above.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))
Out[35]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11a28a750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a377350>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a4b45d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a07ced0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x119e3bb10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a063610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a3fb8d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119e89fd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x119fdf050>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119017d90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a2bdd50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a487590>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11a27d9d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a3f85d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a200dd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119fc7610>]],
      dtype=object)

NOTE: This plots every numerical attribute against every other numerical attribute, plus a histogram of each numerical attribute

In [36]:
# Plot median_income (the column most correlated with the target in the
# ranking above) against median_house_value; alpha=0.1 keeps dense
# regions distinguishable from sparse ones.
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a129610>

Experimenting with Attribute Combinations

In [38]:
# Add three ratio columns derived from the raw totals. These assignments
# add columns to `housing` in place, so every later cell sees them.
# Rooms per household.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
# Share of rooms that are bedrooms (NaN wherever total_bedrooms is missing).
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
# People per household.
housing["population_per_household"] = housing["population"]/housing["households"]
In [39]:
# Recompute the correlations so the newly added ratio columns are included.
# Only numeric columns are correlated: ocean_proximity holds strings, and
# pandas >= 2.0 raises on non-numeric columns instead of dropping them.
corr_matrix = housing.select_dtypes(include="number").corr()
In [40]:
# Re-rank the columns, now including the derived ratios, by their
# correlation with median_house_value.
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[40]:
median_house_value          1.000000
median_income               0.688075
rooms_per_household         0.151948
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
population_per_household   -0.023737
population                 -0.024650
longitude                  -0.045967
latitude                   -0.144160
bedrooms_per_room          -0.255880
Name: median_house_value, dtype: float64

Prepare the Data for Machine Learning Algorithms

NOTES:

  • write functions for this
In [ ]: