IST 718 LAB 3 | EXPLORE, MODEL, INTERPRET

OUTPUT: Models

E: EXPLORE

In [56]:
import pandas as pd
# data = pd.read_csv("v2_coaches2019.csv")
data = pd.read_csv("v2_coaches9.csv")
data.head()
Out[56]:
                  school      conf              coach  school_pay  total_pay  max_bonus  bonus_paid  asst_pay      buyout
0              Air Force  Mt. West       Troy Calhoun    885000.0   885000.0   247000.0         0.0       0.0         0.0
1                  Akron       MAC       Terry Bowden    411000.0   412500.0   225000.0     50000.0       0.0    688500.0
2                Alabama       SEC         Nick Saban   8307000.0  8307000.0  1100000.0    500000.0       0.0  33600000.0
3  Alabama at Birmingham     C-USA         Bill Clark    900000.0   900000.0   950000.0    165471.0       0.0   3847500.0
4      Appalachian State  Sun Belt  Scott Satterfield    712500.0   712500.0   295000.0    145000.0       0.0   2160417.0
In [57]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins=50, figsize=(20,15))
plt.show()
In [58]:
from pandas.plotting import scatter_matrix
scatter_matrix(data, figsize=(12,8))
plt.show()
(scatter_matrix emits repeated UserWarnings about "identical left == right == 0.0"; they arise because at least one column, likely asst_pay, takes a single value, leaving those panels with degenerate axis limits.)
In [59]:
# total_pay is nearly identical to school_pay (see head() above), so drop the redundant column
data.drop(['total_pay'], axis=1, inplace=True)
In [60]:
from pandas.plotting import scatter_matrix
scatter_matrix(data, figsize=(12,8))
plt.show()
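
A numeric complement to the scatter plots (a sketch, not part of the original run; with this version of pandas, corr() uses the numeric columns only):

# pairwise correlations with the target, strongest first
print(data.corr()['school_pay'].sort_values(ascending=False))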

M: MODEL

In [61]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

data = train_set.drop('school_pay', axis=1)      # features only
data_labels = train_set['school_pay'].copy()     # target variable
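
A quick sanity check on the split (a sketch, not part of the original run): with test_size=0.2, roughly 80% of rows should land in train_set, and the feature frame should no longer contain the target column.

print(len(train_set), 'train rows /', len(test_set), 'test rows')
print('school_pay' in data.columns)   # expected: False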
In [62]:
from sklearn.compose import ColumnTransformer

data_num = data.drop(['school', 'conf', 'coach'], axis=1)
num_attribs = list(data_num)
cat_attribs = ['school','conf','coach']
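
By this point total_pay and school_pay have both been removed, so the numeric attribute list should hold only the remaining dollar columns (a sketch; the expected list is inferred from the table above):

print(num_attribs)   # expected: ['max_bonus', 'bonus_paid', 'asst_pay', 'buyout']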
In [63]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
#     ('attribs_addr', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
In [64]:
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])
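
One caveat worth noting (not raised in the original run): because school, conf, and coach are one-hot encoded, a category that appears only in the held-out test set would make transform() raise an error later. A minimal defensive variant, assuming the same num_pipeline and attribute lists defined above:

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    # handle_unknown='ignore' maps unseen categories (e.g. a coach not in
    # the training set) to all-zero dummy columns instead of erroring
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])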
In [67]:
data_prepared = full_pipeline.fit_transform(data)

LINEAR REGRESSION

In [73]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
Out[73]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [74]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Stardard Deviation:", scores.std())
In [75]:
scores = cross_val_score(lin_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
lin_reg_rmse_scores = np.sqrt(-scores)
display_scores(lin_reg_rmse_scores)
Scores: [1000541.93638924 1087129.88544642  577004.76896174  704475.53300203
  790898.30783315  330726.44118222  648550.56477007  704778.79695918
  846850.45215746 1192609.51924728]
Mean: 788356.6205948773
Standard Deviation: 242651.70009816633
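
To gauge over- or underfitting, compare the cross-validated RMSE above against the RMSE on the training set itself (a sketch reusing the objects already defined; mean_squared_error was imported earlier):

lin_preds = lin_reg.predict(data_prepared)
lin_rmse = np.sqrt(mean_squared_error(data_labels, lin_preds))
print(lin_rmse)   # a value far below the ~788k CV mean would suggest overfitting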

DECISION TREE

In [76]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(data_prepared, data_labels)
Out[76]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
In [77]:
scores = cross_val_score(tree_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)
display_scores(tree_reg_rmse_scores)
Scores: [1966907.17208171  978656.77219637 1118237.21419429 1094127.61604536
 1114106.5739837  1240087.18596283 1168226.55874967 1481352.96779525
 1016554.07011728 1903687.92354703]
Mean: 1308194.4054673477
Standard Deviation: 339993.5390961565
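
The tree's mean CV RMSE (~1.31M) is substantially worse than linear regression's (~0.79M), the classic signature of overfitting. The training-set check makes that concrete (a sketch; an unconstrained tree typically memorizes the training data):

tree_preds = tree_reg.predict(data_prepared)
tree_rmse = np.sqrt(mean_squared_error(data_labels, tree_preds))
print(tree_rmse)   # usually at or near 0.0 for an unpruned tree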

RANDOM FOREST

In [78]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(data_prepared, data_labels)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[78]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
In [79]:
scores = cross_val_score(forest_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
forest_reg_rmse_scores = np.sqrt(-scores)
display_scores(forest_reg_rmse_scores)
Scores: [1191252.55081039 1114834.49465545  722867.32496421  832952.16695923
 1108885.16416161  772305.1286189   949988.04336817  926006.29415125
 1165139.26983104 1649137.25577549]
Mean: 1043336.7693295751
Standard Deviation: 256128.59947813684
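
The forest lands between the single tree and the linear model, and it has not been tuned at all. A minimal GridSearchCV sketch for that next step (the parameter grid is hypothetical; random_state is pinned for reproducibility, and listing n_estimators explicitly also sidesteps the FutureWarning above):

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [10, 30, 100],
              'max_features': ['sqrt', 0.5, 1.0]}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42),
                           param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(data_prepared, data_labels)
print(grid_search.best_params_)
print(np.sqrt(-grid_search.best_score_))   # best cross-validated RMSE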