IST 718 | LAB 3

O: Obtain

In [129]:
import pandas as pd
# data = pd.read_csv("Coaches9.csv")
data = pd.read_csv("coaches2019.csv")
In [130]:
data.head()
Out[130]:
    | RK | SCHOOL    | CONF    | COACH        | SCHOOL PAY | TOTAL PAY | MAX BONUS | BONUSES PAID (2018-19) | ASST PAY TOTAL | SCHOOL BUYOUT AS OF 12/1/19
 0  |  1 | Clemson   | ACC     | Dabo Swinney |    9255000 |   9315600 |   1125000 |                1075000 |        7410000 |                    50000000
 1  |  2 | Alabama   | SEC     | Nick Saban   |    8707000 |   8857000 |   1100000 |                 875000 |        7541277 |                    34100000
 2  |  3 | Michigan  | Big Ten | Jim Harbaugh |    7504000 |   7504000 |   1325000 |                 350000 |        6005000 |                    11687500
 3  |  4 | Texas A&M | SEC     | Jimbo Fisher |    7500000 |   7500000 |   1500000 |                 250000 |        7145215 |                    60625000
 4  |  5 | Georgia   | SEC     | Kirby Smart  |    6703600 |   6871600 |   1150000 |                 275000 |        6212935 |                    24239584
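The money columns arrive as text (the scrub step below strips '$' and ',' and maps '--' to a number), which is why they show up as object dtype in the next cell. An alternative, shown only as a sketch assuming the column names above match the file, is to clean them at load time with read_csv converters; money_to_float is a hypothetical helper, not part of the lab:

import pandas as pd

def money_to_float(value):
    # hypothetical helper: strip '$' and ',' and treat '--' (not reported) as NaN
    value = str(value).replace('$', '').replace(',', '').strip()
    return float(value) if value not in ('', '--') else float('nan')

money_cols = ['SCHOOL PAY', 'TOTAL PAY', 'MAX BONUS', 'BONUSES PAID (2018-19)',
              'ASST PAY TOTAL', 'SCHOOL BUYOUT AS OF 12/1/19']
data_alt = pd.read_csv("coaches2019.csv",
                       converters={col: money_to_float for col in money_cols})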

S: Scrub

Check and change types

In [131]:
data.dtypes
Out[131]:
RK                              int64
SCHOOL                         object
CONF                           object
COACH                          object
SCHOOL PAY                     object
TOTAL PAY                      object
MAX BONUS                      object
BONUSES PAID (2018-19)         object
ASST PAY TOTAL                  int64
SCHOOL BUYOUT AS OF 12/1/19    object
dtype: object
Change strings to categories
In [132]:
# attrs = "School Conference Coach".split() # Coaches9
attrs = "SCHOOL CONF COACH".split() # Coaches9
data[attrs] = data[attrs].astype('category')
In [133]:
data.dtypes
Out[133]:
RK                                int64
SCHOOL                         category
CONF                           category
COACH                          category
SCHOOL PAY                       object
TOTAL PAY                        object
MAX BONUS                        object
BONUSES PAID (2018-19)           object
ASST PAY TOTAL                    int64
SCHOOL BUYOUT AS OF 12/1/19      object
dtype: object
Change money strings to numbers
In [134]:
list(data[data.columns[4:]].columns)
Out[134]:
['SCHOOL PAY',
 'TOTAL PAY',
 'MAX BONUS',
 'BONUSES PAID (2018-19)',
 'ASST PAY TOTAL',
 'SCHOOL BUYOUT AS OF 12/1/19']
In [135]:
# attrs = "SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout".split() # Coaches9
attrs = list(data[data.columns[4:]].columns)
# strip '$' and ',' and map '--' (no figure reported) to 0, then cast to float
data[attrs] = data[attrs].replace({r'\$': '', ',': '', '--': 0}, regex=True).astype('float')
In [136]:
data.dtypes
Out[136]:
RK                                int64
SCHOOL                         category
CONF                           category
COACH                          category
SCHOOL PAY                      float64
TOTAL PAY                       float64
MAX BONUS                       float64
BONUSES PAID (2018-19)          float64
ASST PAY TOTAL                  float64
SCHOOL BUYOUT AS OF 12/1/19     float64
dtype: object

Check for missing values

In [137]:
data.isnull().sum()
Out[137]:
RK                             0
SCHOOL                         0
CONF                           0
COACH                          0
SCHOOL PAY                     0
TOTAL PAY                      0
MAX BONUS                      0
BONUSES PAID (2018-19)         0
ASST PAY TOTAL                 0
SCHOOL BUYOUT AS OF 12/1/19    0
dtype: int64
In [138]:
len(data)
Out[138]:
130
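No nulls appear only because '--' was mapped to 0 during the money conversion above. If unreported pay is better treated as missing than as $0, a sketch of the alternative (the resulting NaNs would then be filled by the SimpleImputer later in the pipeline):

import numpy as np

raw = pd.read_csv("coaches2019.csv")
money_cols = list(raw.columns[4:])
raw[money_cols] = (raw[money_cols]
                   .replace({r'\$': '', ',': ''}, regex=True)  # strip currency formatting
                   .replace('--', np.nan)                      # '--' = not reported
                   .astype('float'))
raw[money_cols].isnull().sum()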

Drop unnecessary / unhelpful values

In [139]:
# data.drop('AssistantPay', axis=1, inplace=True) # Coaches9
# nothing needs to be dropped from the coaches2019 file at this step
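A handful of schools in this file (Army, Liberty, Brigham Young, Air Force, and Southern Methodist, visible at the bottom of the frame later) report no pay figures at all, which after the '--' replacement shows up as all-zero rows. The lab keeps them, but a sketch of how they could be counted or filtered if judged unhelpful:

money_cols = list(data.columns[4:])
reported = data[(data[money_cols] != 0).any(axis=1)]  # rows with at least one non-zero figure
len(data) - len(reported)                             # schools with no reported pay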

E: Explore

In [140]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()
In [141]:
from pandas.plotting import scatter_matrix
scatter_matrix(data, figsize=(12,8))
plt.show()
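Beyond histograms and the scatter matrix, a simple group-by is a useful exploratory cut here, since conference membership is an obvious driver of head-coach pay. A sketch:

# average school pay by conference, highest first
data.groupby('CONF')['SCHOOL PAY'].mean().sort_values(ascending=False)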

Drop overly correlated variables
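In the rows shown earlier, TOTAL PAY is only slightly larger than SCHOOL PAY, and RK is just the rank by pay, so both are nearly redundant with SCHOOL PAY. A quick confirmation before dropping them (sketch):

# pairwise correlations among the candidate drops and the pay column they track
data[['RK', 'SCHOOL PAY', 'TOTAL PAY']].corr()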

In [142]:
# data.drop('SchoolPay', axis=1, inplace=True) # Coaches9
data.drop(['RK','TOTAL PAY'], axis=1, inplace=True) 
scatter_matrix(data, figsize=(12,8))
plt.show()
In [143]:
# data.to_csv('coaches9_clean.csv') #Coaches9
data.to_csv('coaches2019_clean.csv')
In [147]:
# data = train_set.drop('TotalPay', axis=1) #Coaches9
# data_labels = train_set['TotalPay'].copy() #Coaches9

# data = train_set.drop('SCHOOL PAY', axis=1)
# data_labels = train_set['SCHOOL PAY'].copy()

data.columns = ['school', 'conf', 'coach', 'school_pay', 'max_bonus', 'bonus_paid', 'asst_pay', 'buyout']
data
Out[147]:
    | school             | conf    | coach         | school_pay | max_bonus | bonus_paid | asst_pay  | buyout
0   | Clemson            | ACC     | Dabo Swinney  |  9255000.0 | 1125000.0 |  1075000.0 | 7410000.0 | 50000000.0
1   | Alabama            | SEC     | Nick Saban    |  8707000.0 | 1100000.0 |   875000.0 | 7541277.0 | 34100000.0
2   | Michigan           | Big Ten | Jim Harbaugh  |  7504000.0 | 1325000.0 |   350000.0 | 6005000.0 | 11687500.0
3   | Texas A&M          | SEC     | Jimbo Fisher  |  7500000.0 | 1500000.0 |   250000.0 | 7145215.0 | 60625000.0
4   | Georgia            | SEC     | Kirby Smart   |  6703600.0 | 1150000.0 |   275000.0 | 6212935.0 | 24239584.0
... | ...                | ...     | ...           |        ... |       ... |        ... |       ... |        ...
125 | Army               | INDEP   | Jeff Monken   |        0.0 |       0.0 |        0.0 |       0.0 |        0.0
126 | Liberty            | INDEP   | Hugh Freeze   |        0.0 |       0.0 |        0.0 |       0.0 |        0.0
127 | Brigham Young      | INDEP   | Kalani Sitake |        0.0 |       0.0 |        0.0 |       0.0 |        0.0
128 | Air Force          | MWC     | Troy Calhoun  |        0.0 |       0.0 |        0.0 |       0.0 |        0.0
129 | Southern Methodist | AAC     | Sonny Dykes   |        0.0 |       0.0 |        0.0 |       0.0 |        0.0

130 rows × 8 columns
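The positional rename above depends on the column order being exactly as listed. An equivalent, order-independent alternative (a sketch mapping the original uppercase names to the same lowercase targets):

data = data.rename(columns={
    'SCHOOL': 'school', 'CONF': 'conf', 'COACH': 'coach',
    'SCHOOL PAY': 'school_pay', 'MAX BONUS': 'max_bonus',
    'BONUSES PAID (2018-19)': 'bonus_paid', 'ASST PAY TOTAL': 'asst_pay',
    'SCHOOL BUYOUT AS OF 12/1/19': 'buyout'})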

M: Model

In [149]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

data = train_set.drop('school_pay', axis=1)
data_labels = train_set['school_pay'].copy()
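With only 130 rows, a quick sanity check on the split is worthwhile: roughly 104 schools should land in training and 26 in the hold-out set, and ideally every conference appears in both. A sketch:

print(len(train_set), len(test_set))    # expect roughly 104 / 26
print(test_set['conf'].value_counts())  # conference coverage of the hold-out set
# passing stratify=<the conference column> to train_test_split would guarantee
# proportional coverage, assuming every conference has at least two schools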
In [151]:
data.head()
Out[151]:
   | school            | conf    | coach        | max_bonus | bonus_paid | asst_pay  | buyout
70 | Indiana           | Big Ten | Tom Allen    | 1566668.0 |   125000.0 | 3693922.0 |  1500000.0
78 | UAB               | CUSA    | Bill Clark   | 1060000.0 |   228105.0 | 1270250.0 |  7012500.0
47 | Mississippi State | SEC     | Joe Moorhead | 1350000.0 |    75000.0 | 3680000.0 |        0.0
0  | Clemson           | ACC     | Dabo Swinney | 1125000.0 |  1075000.0 | 7410000.0 | 50000000.0
12 | Oklahoma State    | Big 12  | Mike Gundy   |  900000.0 |    62500.0 | 4210000.0 | 16632813.0
In [152]:
from sklearn.compose import ColumnTransformer

# data_num = data.drop(['School', 'Conference', 'Coach'],axis =1) # Coaches9
data_num = data.drop(['school', 'conf', 'coach'],axis =1) 
num_attribs = list(data_num)
In [153]:
# cat_attribs = ['School','Conference','Coach'] # Coaches9

cat_attribs = ['school','conf','coach']
In [154]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
#     ('attribs_addr', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
In [155]:
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])
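One caveat: one-hot encoding school, conference, and especially coach means any category that appears only in the test set will make transform() raise an error with OneHotEncoder's defaults. A sketch of a more forgiving variant (handle_unknown='ignore' emits all zeros for unseen categories):

full_pipeline_safe = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])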
In [156]:
# data_prepared = full_pipeline.fit_transform(data)
In [157]:
# data_prepared
In [158]:
data_prepared = full_pipeline.fit_transform(data)
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
Out[158]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [159]:
data_labels
Out[159]:
70     1805000.0
78     1450000.0
47     3050000.0
0      9255000.0
12     5125000.0
         ...    
71     1800000.0
106     600000.0
14     5000000.0
92      800000.0
102     635000.0
Name: school_pay, Length: 104, dtype: float64
In [160]:
some_data = data.iloc[:10]
some_labels = data_labels.iloc[:10]
In [161]:
some_data_prepared = full_pipeline.transform(some_data)
print("PREDICTIONS:", lin_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
PREDICTIONS: [1805000.06726596 1449999.95045756 3049999.91574903 9255000.01706716
 5124999.98643905 3218935.04943988 2774999.98391966 4150000.07544885
 1600000.0416255   499999.9936865 ]
LABELS: [1805000.0, 1450000.0, 3050000.0, 9255000.0, 5125000.0, 3218935.0, 2775000.0, 4150000.0, 1600000.0, 500000.0]
In [164]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Stardard Deviation:", scores.std())
    
scores = cross_val_score(lin_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)
Scores: [ 863391.43623717 1329276.35896028  940871.66390862 1258097.36176848
  823246.34433711  871677.13226375  501034.63415128  905209.97976486
 1056012.22089929 1167760.28656792]
Mean: 971657.7418858772
Standard Deviation: 229369.45582631155
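For comparison, the same cross-validation can be run on a tree-based model. A sketch (not part of the lab output) scoring a random forest on the identical prepared training data:

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_reg, data_prepared, data_labels,
                                scoring="neg_mean_squared_error", cv=10)
display_scores(np.sqrt(-forest_scores))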
In [166]:
scores = cross_val_score(lin_reg, data_prepared, data_labels, scoring="r2", cv=10)
In [167]:
scores
Out[167]:
array([0.86730289, 0.43275873, 0.75055157, 0.59065345, 0.8923716 ,
       0.37485483, 0.77776374, 0.86534655, 0.77612481, 0.51148849])
In [168]:
from sklearn.metrics import r2_score
y_true = list(some_labels)
y_pred = lin_reg.predict(some_data_prepared)
r2_score(y_true, y_pred)
Out[168]:
0.9999999999999996
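The near-perfect R² above is computed on ten rows the model was trained on, and with one-hot columns for every school and coach the training set is essentially memorized, so it says little about generalization. The honest check is the 20% hold-out set; a sketch, reusing the handle_unknown='ignore' pipeline variant sketched earlier so coaches unseen in training don't break transform():

X_test = test_set.drop('school_pay', axis=1)
y_test = test_set['school_pay'].copy()

lin_reg_safe = LinearRegression()
lin_reg_safe.fit(full_pipeline_safe.fit_transform(data), data_labels)  # fit on training features only

test_preds = lin_reg_safe.predict(full_pipeline_safe.transform(X_test))
np.sqrt(mean_squared_error(y_test, test_preds))  # hold-out RMSE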