import os
import tarfile
import urllib.request
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
fetch_housing_data()
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
# housing = load_housing_data()
housing = pd.read_csv('../IST_718_Big_Data/WK2/coaches9_clean.csv')  # NOTE: reusing the book's 'housing' name for the coaches salary data
housing.head()
housing.info()
housing['Conference'].value_counts()
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()
import numpy as np
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(housing, 0.2)
len(train_set)
len(test_set)
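NOTE: The hand-rolled split above reshuffles on every run, so over many runs the model effectively sees the whole dataset. A minimal fix (sketch) is to seed NumPy before splitting; train_test_split below solves the same problem with random_state.
# np.random.seed(42)  # same permutation, and therefore the same split, on every run
# train_set, test_set = split_train_test(housing, 0.2)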
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# # Ensure that each category is proportionally represented in the test set
# housing["bonus_cat"] = pd.cut(housing["Bonus"],
#                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
#                               labels=[1, 2, 3, 4, 5])
# housing["bonus_cat"].hist()
# from sklearn.model_selection import StratifiedShuffleSplit
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(housing, housing["bonus_cat"]):
#     strat_train_set = housing.loc[train_index]
#     strat_test_set = housing.loc[test_index]
# strat_test_set["bonus_cat"].value_counts() / len(strat_test_set)
# Remove the `bonus_cat` attribute so the data is back to its original form
# for set_ in (strat_train_set, strat_test_set):
#     set_.drop("bonus_cat", axis=1, inplace=True)
# housing.plot(kind="scatter", x="longitude", y="latitude")
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
#              s=housing["population"]/100, label="population", figsize=(10,7),
#              c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
# plt.legend()
NOTES: look at how each numerical attribute correlates with the target (TotalPay).
corr_matrix = housing.corr()
corr_matrix
corr_matrix["TotalPay"].sort_values(ascending=False)
from pandas.plotting import scatter_matrix
attributes = ["TotalPay", "Bonus", "BonusPaid", "Buyout"]
scatter_matrix(housing[attributes], figsize=(12,8))
NOTE: This plots every numerical attribute against every other numerical attribute, plus a histogram of each numerical attribute.
# housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
# housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
# housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
# housing["population_per_household"] = housing["population"]/housing["households"]
# corr_matrix = housing.corr()
# corr_matrix["median_house_value"].sort_values(ascending=False)
NOTES: separate the predictors from the labels (TotalPay) before transforming the training set.
# housing = strat_train_set.drop("TotalPay", axis=1)
# housing_labels = strat_train_set["TotalPay"].copy()
housing = train_set.drop("TotalPay", axis=1)
housing_labels = train_set["TotalPay"].copy()
housing.head()
housing_labels
What do we do with missing features? Three options:
# housing.dropna(subset=["total_bedrooms"]) # option 1
# housing.drop("total_bedrooms", axis=1) # option 2
# median = housing["total_bedrooms"].median() # option 3
# housing["total_bedrooms"].fillna(median, inplace=True)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# NOTE: the median can only be computed on numerical attributes, so drop the text columns (School, Conference, Coach)
housing_num = housing.drop(["School","Conference", "Coach"], axis=1)
imputer.fit(housing_num)
imputer.statistics_
housing_num.median().values
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
housing_cat = housing[["Conference"]]
housing_cat.head()
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
ordinal_encoder.categories_
# PROBLEM: We don't want the encoded categories to actually imply an ordering,
# like good, better, best
# SOLUTION: one-hot encoding
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
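NOTE: housing_cat_1hot is a SciPy sparse matrix (OneHotEncoder's default output), which is why it prints as a summary rather than values. To see the 0/1 array:
housing_cat_1hot.toarray()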
cat_encoder.categories_
# from sklearn.base import BaseEstimator, TransformerMixin
# rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
# class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
#     def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs needed
#         self.add_bedrooms_per_room = add_bedrooms_per_room
#     def fit(self, X, y=None):
#         return self  # nothing else to do
#     def transform(self, X, y=None):
#         rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
#         population_per_household = X[:, population_ix] / X[:, households_ix]
#         if self.add_bedrooms_per_room:
#             bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
#             return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
#         else:
#             return np.c_[X, rooms_per_household, population_per_household]
# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)
# Feature scaling options: MinMaxScaler (rescales each feature to a 0-1 range)
# and StandardScaler (zero mean, unit variance per feature)
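A quick sketch of the two scalers applied to the imputed numerical columns (illustration only; the pipeline below uses StandardScaler):
from sklearn.preprocessing import MinMaxScaler, StandardScaler
minmax_scaled = MinMaxScaler().fit_transform(housing_tr)  # each column rescaled to [0, 1]
std_scaled = StandardScaler().fit_transform(housing_tr)   # each column to zero mean, unit variance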
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing.head()
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ['School','Conference','Coach']
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
# housing_prepared.head()  # won't work: ColumnTransformer returns a NumPy array or SciPy sparse matrix, not a DataFrame
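To inspect the prepared matrix anyway, one option (sketch; the output is often sparse here because of the many one-hot coach columns) is to densify it and wrap it in a DataFrame:
import scipy.sparse as sp
dense = housing_prepared.toarray() if sp.issparse(housing_prepared) else housing_prepared
pd.DataFrame(dense).head()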
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("PREDICTIONS:", lin_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse  # NOTE: likely 0.0: the unconstrained tree has memorized the training set; cross-validation below gives an honest estimate
print("PREDICTIONS:", tree_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())
display_scores(tree_rmse_scores)
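For a fair comparison, the same 10-fold cross-validation can be run on the linear model (sketch, reusing display_scores from above):
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)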
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)
print("PREDICTIONS:", forest_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
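The natural last step is a one-time evaluation on the held-out test set (sketch, assuming forest_reg is the chosen model). One caveat: full_pipeline's OneHotEncoder will raise on categories, e.g. coach names, that appear only in the test set, so building the pipeline with OneHotEncoder(handle_unknown="ignore") may be necessary.
X_test = test_set.drop("TotalPay", axis=1)
y_test = test_set["TotalPay"].copy()
X_test_prepared = full_pipeline.transform(X_test)  # transform only; never fit on test data
final_predictions = forest_reg.predict(X_test_prepared)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
final_rmse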