IST 718 LAB 3 | EXPLORE, MODEL, INTERPRET

OUTPUT: Models

E: EXPLORE

In [45]:
import pandas as pd

# Load the 2019 college-football coach compensation data and peek at it.
data = pd.read_csv("v2_coaches2019.csv")
data.head()
Out[45]:
rank school conf coach school_pay total_pay max_bonus bonus_paid asst_pay buyout
0 1 Clemson ACC Dabo Swinney 9255000.0 9315600.0 1125000.0 1075000.0 7410000.0 50000000.0
1 2 Alabama SEC Nick Saban 8707000.0 8857000.0 1100000.0 875000.0 7541277.0 34100000.0
2 3 Michigan Big Ten Jim Harbaugh 7504000.0 7504000.0 1325000.0 350000.0 6005000.0 11687500.0
3 4 Texas A&M SEC Jimbo Fisher 7500000.0 7500000.0 1500000.0 250000.0 7145215.0 60625000.0
4 5 Georgia SEC Kirby Smart 6703600.0 6871600.0 1150000.0 275000.0 6212935.0 24239584.0
In [46]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()
In [47]:
from pandas.plotting import scatter_matrix

# Pairwise relationships between the numeric pay variables.
scatter_matrix(data, figsize=(12, 8))
plt.show()
In [48]:
# Drop total_pay: it is nearly identical to school_pay (see the scatter matrix
# above), so keeping both would leak the target into the features.
# Reassign instead of inplace=True — inplace mutation is a hidden-state
# anti-pattern in notebooks and has no performance benefit.
data = data.drop(columns=['total_pay'])
In [49]:
# Re-inspect pairwise relationships now that total_pay is gone.
# (scatter_matrix was already imported in the earlier cell — no need to re-import.)
scatter_matrix(data, figsize=(12, 8))
plt.show()

M: MODEL

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for final evaluation; fixed seed for reproducibility.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Separate the predictors from the target (school_pay) on the training split.
data = train_set.drop(columns='school_pay')
data_labels = train_set['school_pay'].copy()
In [51]:
from sklearn.compose import ColumnTransformer

# Partition the feature names into categorical vs. numeric for the pipeline.
cat_attribs = ['school', 'conf', 'coach']
data_num = data.drop(columns=cat_attribs)
num_attribs = list(data_num.columns)
In [52]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric preprocessing: fill missing values with the column median
# (robust to the salary outliers), then standardize to zero mean / unit
# variance so the linear model's coefficients are comparable.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
In [53]:
# Combine numeric and categorical preprocessing into one transformer.
# handle_unknown='ignore' is essential here: 'coach' (and mostly 'school')
# is unique per row, so the held-out test set is guaranteed to contain
# categories never seen during fit — the default OneHotEncoder would raise
# on transform; with 'ignore' unseen categories encode as all-zeros.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])
In [54]:
from sklearn.linear_model import LinearRegression

# Fit the preprocessing on the training features and encode them.
data_prepared = full_pipeline.fit_transform(data)

# Baseline model: ordinary least squares on the prepared features.
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
Out[54]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [55]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def display_scores(scores):
    """Print a set of CV scores together with their mean and spread.

    Parameters
    ----------
    scores : np.ndarray
        Per-fold scores (here, RMSE values).
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    # Fixed typo: was "Stardard Deviation".
    print("Standard Deviation:", scores.std())

# 10-fold cross-validation of the linear model. sklearn returns *negated*
# MSE (higher is better by convention), so flip the sign before sqrt.
scores = cross_val_score(lin_reg, data_prepared, data_labels,
                         scoring="neg_mean_squared_error", cv=10)
# Renamed from forest_rmse_scores — copy-paste residue; this is linear regression.
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)
Scores: [574898.67237434 499160.20701545 320812.55580393 567213.49167286
 605310.16603356 330855.92593581 515748.17222354 616645.77015702
 704247.3349575  447829.67008972]
Mean: 518272.19662637485
Stardard Deviation: 117000.20724156257
In [ ]:
 
In [ ]: