# IST 718 LAB 3 | EXPLORE, MODEL, INTERPRET

## OUTPUT: Models

## E: EXPLORE

import pandas as pd

# Read the coaches CSV into the working DataFrame used by the cells below.
# Alternate input file that was tried at some point: "v2_coaches9.csv"
data = pd.read_csv(filepath_or_buffer="v2_coaches2019.csv")
# Preview the first five rows (head() defaults to n=5).
data.head(n=5)

%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()

from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the columns of `data`, to eyeball relationships
# between them.
scatter_matrix(frame=data, figsize=(12, 8))
plt.show()

# Remove total_pay in place before modelling. NOTE(review): presumably
# dropped because it nearly duplicates school_pay, the column predicted
# later -- confirm the intent.
data.drop(columns=['total_pay'], inplace=True)

from pandas.plotting import scatter_matrix

# Redraw the pairwise scatter plots without the dropped column.
scatter_matrix(frame=data, figsize=(12, 8))
plt.show()

## M: MODEL

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(data, random_state=42, test_size=0.2)

# From here on `data` holds only the training features and `data_labels`
# the matching school_pay targets (copied so later edits cannot touch
# train_set).
data_labels = train_set['school_pay'].copy()
data = train_set.drop(columns='school_pay')

from sklearn.compose import ColumnTransformer

# Split the feature columns into the three text/categorical ones and
# everything else (treated as numeric).
cat_attribs = ['school', 'conf', 'coach']
data_num = data.drop(columns=cat_attribs)
num_attribs = data_num.columns.tolist()

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric columns: fill missing values with the column median, then
# standardize to zero mean / unit variance.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical columns are one-hot encoded. handle_unknown='ignore' makes
# labels that were not present at fit time (e.g. schools or coaches that
# only occur in test_set) encode as all zeros instead of raising in
# transform(); the output of fit_transform() below is unaffected.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# Fit the preprocessing on the training features and produce the model matrix.
data_prepared = full_pipeline.fit_transform(data)

### LINEAR REGRESSION

from sklearn.linear_model import LinearRegression

# Baseline model: linear regression on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=data_prepared, y=data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def display_scores(scores):
    """Print a set of cross-validation scores with their mean and spread.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (e.g. a NumPy
            array of per-fold scores).
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    # Label was previously misspelled as "Stardard Deviation".
    print("Standard Deviation:", scores.std())

# 10-fold cross-validation of the linear model. The scorer returns negated
# MSE, so flip the sign before taking the square root to get per-fold RMSE.
scores = cross_val_score(
    estimator=lin_reg,
    X=data_prepared,
    y=data_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_reg_rmse_scores = np.sqrt(-scores)
display_scores(lin_reg_rmse_scores)

Scores: [574898.67237434 499160.20701545 320812.55580393 567213.49167286
 605310.16603356 330855.92593581 515748.17222354 616645.77015702
 704247.3349575  447829.67008972]
Mean: 518272.19662637485
Stardard Deviation: 117000.20724156257

### DECISION TREE

from sklearn.tree import DecisionTreeRegressor

# Fixed seed so the fitted tree (and the cross-validation scores derived
# from it) are repeatable between runs, matching the seeded train/test split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(data_prepared, data_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

# 10-fold cross-validation of the decision tree; negated MSE is converted
# to per-fold RMSE before reporting.
scores = cross_val_score(
    estimator=tree_reg,
    X=data_prepared,
    y=data_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_reg_rmse_scores = np.sqrt(-scores)
display_scores(tree_reg_rmse_scores)

Scores: [214472.26270165 155690.02489854 244732.23677856 107472.93453322
 283678.92862883  97701.88951295 153285.51859194 222059.58934912
 251077.05758352 126329.83886042]
Mean: 185650.02814387478
Stardard Deviation: 62328.10555798719

### RANDOM FOREST

from sklearn.ensemble import RandomForestRegressor

# n_estimators is set explicitly because its default changes between
# scikit-learn releases (10 -> 100), which otherwise triggers a
# FutureWarning and makes results depend on the installed version.
# NOTE: 100 is the newer default; the previously recorded run used 10 trees.
# random_state pins the bootstrap sampling so scores are repeatable.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(data_prepared, data_labels)

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

# 10-fold cross-validation of the random forest; negated MSE is converted
# to per-fold RMSE before reporting.
scores = cross_val_score(
    estimator=forest_reg,
    X=data_prepared,
    y=data_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_reg_rmse_scores = np.sqrt(-scores)
display_scores(forest_reg_rmse_scores)

Scores: [628719.3413954  192256.29509133  84447.71092478 115905.1805359
 175902.32302786  70212.85679102  90060.78797636 191726.12740988
 239775.33143116  66031.9689106 ]
Mean: 185503.79234942925
Stardard Deviation: 158522.27183938777

	rank	school	conf	coach	school_pay	total_pay	max_bonus	bonus_paid	asst_pay	buyout
0	1	Clemson	ACC	Dabo Swinney	9255000.0	9315600.0	1125000.0	1075000.0	7410000.0	50000000.0
1	2	Alabama	SEC	Nick Saban	8707000.0	8857000.0	1100000.0	875000.0	7541277.0	34100000.0
2	3	Michigan	Big Ten	Jim Harbaugh	7504000.0	7504000.0	1325000.0	350000.0	6005000.0	11687500.0
3	4	Texas A&M	SEC	Jimbo Fisher	7500000.0	7500000.0	1500000.0	250000.0	7145215.0	60625000.0
4	5	Georgia	SEC	Kirby Smart	6703600.0	6871600.0	1150000.0	275000.0	6212935.0	24239584.0