IST 718 LAB 3 | EXPLORE, MODEL, INTERPRET

OUTPUT: Models

E: EXPLORE

In [2]:
import pandas as pd
# data = pd.read_csv("v2_coaches2019.csv")
# data = pd.read_csv("v2_coaches9.csv")
data = pd.read_csv("v2_coaches_modify.csv")
data.head()
Out[2]:
school conf coach ncaafbrev16 median_conf_sal school_pay total_pay max_bonus bonus_paid pay_plus_bonus ... grad_rate_rank combo true_rank w l ratio offensive_score defensive_score score points_per_game
0 Air Force Mt. West Troy Calhoun $59,577,780 879288.0 885000.0 885000.0 247000.0 NaN 885000.0 ... 97.0 65.0 67.0 10.0 3.0 3.333333 4.96 -0.87 4.09 78.16
1 Akron MAC Terry Bowden $35,331,217 492413.0 411000.0 412500.0 225000.0 50000.0 462500.0 ... 5.0 11.0 129.0 5.0 7.0 0.714286 -0.40 -6.12 -6.52 72.64
2 Alabama SEC Nick Saban $174,307,419 3929800.0 8307000.0 8307000.0 1100000.0 500000.0 8807000.0 ... 84.0 111.0 5.0 14.0 1.0 14.000000 8.01 17.62 25.62 90.38
3 Appalachian State Sun Belt Scott Satterfield $35,058,621 675000.0 712500.0 712500.0 295000.0 145000.0 857500.0 ... 13.0 20.0 122.0 10.0 3.0 3.333333 -0.86 9.68 8.83 70.76
4 Arizona Pac-12 Kevin Sumlin $90,976,758 2752233.0 1600000.0 2000000.0 2025000.0 NaN 2000000.0 ... 60.0 66.0 62.0 3.0 9.0 0.333333 1.01 -5.64 -4.63 74.42

5 rows × 23 columns

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()
In [4]:
from pandas.plotting import scatter_matrix
scatter_matrix(data, figsize=(12,8))
plt.show()
In [5]:
data.columns
Out[5]:
Index(['school', 'conf', 'coach', 'ncaafbrev16', 'median_conf_sal',
       'school_pay', 'total_pay', 'max_bonus', 'bonus_paid', 'pay_plus_bonus',
       'stad_size', 'grad_rate', 'seat_rank', 'grad_rate_rank', 'combo',
       'true_rank', 'w', 'l', 'ratio', 'offensive_score', 'defensive_score',
       'score', 'points_per_game'],
      dtype='object')
In [6]:
from pandas.plotting import scatter_matrix
columns = ['conf','school_pay', 'grad_rate', 'stad_size', 'ratio']
df = pd.DataFrame(data, columns = columns)
scatter_matrix(df, figsize=(12,8))
plt.show()
In [7]:
g = sns.pairplot(df, hue="conf", palette="Set2", diag_kind="kde", height=2.5)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/statsmodels/nonparametric/kde.py:447: RuntimeWarning: invalid value encountered in greater
  X = X[np.logical_and(X > clip[0], X < clip[1])] # won't work for two columns.
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/statsmodels/nonparametric/kde.py:447: RuntimeWarning: invalid value encountered in less
  X = X[np.logical_and(X > clip[0], X < clip[1])] # won't work for two columns.
In [8]:
with sns.plotting_context("talk"):
    g = sns.FacetGrid(data, col="conf", col_wrap = 4, hue="w", palette = "Blues")
    g = g.map(plt.scatter, "stad_size", "school_pay")
In [9]:
data.drop(['total_pay'], axis=1, inplace=True)
# data.drop(['rank'], axis=1, inplace=True)
In [10]:
from pandas.plotting import scatter_matrix
scatter_matrix(data, figsize=(12,8))
plt.show()
In [24]:
with sns.plotting_context("talk"):
    g = sns.FacetGrid(data, col="conf", col_wrap = 4, hue="ratio", palette = "Blues")
    g = g.map(plt.scatter, "max_bonus", "school_pay", "ratio")
In [25]:
data['ratio_big'] = data['ratio'] * 100


with sns.plotting_context("talk"):
    g = sns.FacetGrid(data, col="conf", col_wrap = 4, hue="ratio_big", palette = "Blues")
    g = g.map(plt.scatter, "max_bonus", "school_pay", "ratio_big")

M: MODEL

In [26]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

data = train_set.drop('school_pay', axis=1)
data_labels = train_set['school_pay'].copy()
In [27]:
from sklearn.compose import ColumnTransformer

data_num = data.drop(['school', 'conf', 'coach'],axis =1) 
num_attribs = list(data_num)
cat_attribs = ['school','conf','coach']
In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
#     ('attribs_addr', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
In [29]:
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])
In [30]:
data_prepared = full_pipeline.fit_transform(data)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/sklearn/impute/_base.py in _validate_input(self, X)
    198             X = check_array(X, accept_sparse='csc', dtype=dtype,
--> 199                             force_all_finite=force_all_finite, copy=self.copy)
    200         except ValueError as ve:

~/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    495                 warnings.simplefilter('error', ComplexWarning)
--> 496                 array = np.asarray(array, dtype=dtype, order=order)
    497             except ComplexWarning:

~/anaconda3/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     84     """
---> 85     return array(a, dtype, copy=False, order=order)
     86 

ValueError: could not convert string to float: ' $95,251,461 '

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
<ipython-input-30-5c873493145d> in <module>
----> 1 data_prepared = full_pipeline.fit_transform(data)

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    474         self._validate_remainder(X)
    475 
--> 476         result = self._fit_transform(X, y, _fit_transform_one)
    477 
    478         if not result:

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
    418                     message=self._log_message(name, idx, len(transformers)))
    419                 for idx, (name, trans, column, weight) in enumerate(
--> 420                         self._iter(fitted=fitted, replace_strings=True), 1))
    421         except ValueError as e:
    422             if "Expected 2D array, got 1D array instead" in str(e):

~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
    919             # remaining jobs.
    920             self._iterating = False
--> 921             if self.dispatch_one_batch(iterator):
    922                 self._iterating = self._original_iterator is not None
    923 

~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385         """
    386         last_step = self._final_estimator
--> 387         Xt, fit_params = self._fit(X, y, **fit_params)
    388         with _print_elapsed_time('Pipeline',
    389                                  self._log_message(len(self.steps) - 1)):

~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    315                 message_clsname='Pipeline',
    316                 message=self._log_message(step_idx),
--> 317                 **fit_params_steps[name])
    318             # Replace the transformer of the step with the fitted
    319             # transformer. This is necessary when loading the transformer

~/anaconda3/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    551         if y is None:
    552             # fit method of arity 1 (unsupervised transformation)
--> 553             return self.fit(X, **fit_params).transform(X)
    554         else:
    555             # fit method of arity 2 (supervised transformation)

~/anaconda3/lib/python3.7/site-packages/sklearn/impute/_base.py in fit(self, X, y)
    230         self : SimpleImputer
    231         """
--> 232         X = self._validate_input(X)
    233 
    234         # default fill_value is 0 for numerical input and "missing_value"

~/anaconda3/lib/python3.7/site-packages/sklearn/impute/_base.py in _validate_input(self, X)
    202                 raise ValueError("Cannot use {0} strategy with non-numeric "
    203                                  "data. Received datatype :{1}."
--> 204                                  "".format(self.strategy, X.dtype.kind))
    205             else:
    206                 raise ve

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'dtype'

LINEAR REGRESSION

In [107]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
Out[107]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [108]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Stardard Deviation:", scores.std())
In [109]:
scores = cross_val_score(lin_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
lin_reg_rmse_scores = np.sqrt(-scores)
display_scores(lin_reg_rmse_scores)
Scores: [ 863391.43623717 1329276.35896028  940871.66390862 1258097.36176848
  823246.34433711  871677.13226375  501034.63415128  905209.97976486
 1056012.22089929 1167760.28656792]
Mean: 971657.7418858772
Stardard Deviation: 229369.45582631155

DECISION TREE

In [110]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(data_prepared, data_labels)
Out[110]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
In [111]:
scores = cross_val_score(tree_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)
display_scores(tree_reg_rmse_scores)
Scores: [1469853.78732636 1985939.9550611  1607669.72348535 2011720.40863393
 1215687.98128237  982787.80966926  584613.1934606   805628.22551125
 1407292.11675043 1569107.42329036]
Mean: 1364030.0624470995
Stardard Deviation: 448041.9923430583

RANDOM FOREST

In [112]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(data_prepared, data_labels)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[112]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
In [113]:
scores = cross_val_score(forest_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
forest_reg_rmse_scores = np.sqrt(-scores)
display_scores(forest_reg_rmse_scores)
Scores: [1119857.44839398 1287429.28953944 1390971.85418215 1796434.34057579
  850164.3655399   679468.08656183  553168.3106628  1030410.36380279
 1093633.48200284 1352072.98180393]
Mean: 1115361.0523065452
Stardard Deviation: 347787.8257380748
In [ ]:
 
In [ ]: