01-23-20
import pandas as pd
data = pd.read_csv("Coaches9.csv")
data.head()
attrs = list(data[data.columns[3:]].columns)
data[attrs] = data[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
data.dtypes
data.isnull().sum()
Four options:
For V1, we are going to
SchoolPay
BonusPaid
Bonus
& Buyout
with the attribute mean*For V2, we are going to use #4 and a more advanced version of #3* taking within conference mean
data.dropna(subset=['SchoolPay'], inplace=True)
data.drop('BonusPaid', axis=1, inplace=True)
median_bonus = data['Bonus'].median()
median_buyout = data['Buyout'].median()
data["Bonus"].fillna(median_bonus, inplace = True)
data["Buyout"].fillna(median_buyout, inplace = True)
data.isnull().sum()
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
scatter_matrix(data, figsize=(12,8))
plt.show()
From these two visualizations, it's clear we need to:
SchoolPay
or TotalPay
AssistantPay
data.drop(['TotalPay','AssistantPay'], axis=1, inplace=True)
scatter_matrix(data, figsize=(12,8))
plt.show()
data.head()
We've lost a lot of data at this point and we need to add more. To do so we:
This sub OSM occured in an attached auxiliary file
data['school_key'] = data.apply(lambda x: ''.join(x['School'].lower().split(' '))[:8], axis=1)
data
winloss = pd.read_csv('winloss_withkey.csv')
test = data.copy()
data_wl = test.merge(winloss, left_on="school_key", right_on="team_key")
data_wl
data_wl['team_coach_key'] = data_wl.apply(lambda x: (''.join(x['Coach'].split())+x['School'][:4]).lower(), axis=1)
gsr = pd.read_csv('c_modify_small.csv')
test = data_wl.copy()
data_wl_gsr = test.merge(gsr, left_on="team_coach_key", right_on="team_coach_key")
data_wl_gsr.columns
data_wl_gsr.isna().sum()
data_wl_gsr.seat_rank.value_counts()
data_for_m = data_wl_gsr.drop(['school_key','Team','team_key_y',
'team_key_x', 'team_coach_key', 'coach', 'team_coach', 'school'], axis=1)
data_for_m.columns
columns = ['school','conf','coach','school_pay', 'bonus', 'buyout', 'rank', 'W','L', 'T', 'ratio', 'stad_size', 'gsr', 'med_conf_sal', 'seat_rank',
'combo_rank', 'true_rank', 'gs_rank' ]
data_for_m.columns = columns
data_for_m['med_conf_sal'] = data_for_m['med_conf_sal'].replace({'\$':'', ',': '', '--':0}, regex=True).astype('float')
data_for_m['stad_size'] = data_for_m['stad_size'].replace({'\$':'', ',': '', '--':0}, regex=True).astype('float')
scatter_matrix(data_for_m, figsize=(12,8))
plt.show()
data_for_m.drop('T', axis=1, inplace=True)
data_for_m.to_csv('data_for_m_v3.csv', index=False)
data = data_for_m.copy()
data.head()
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
data = train_set.drop('school_pay', axis=1)
data_labels = train_set['school_pay'].copy()
from sklearn.compose import ColumnTransformer
data_num = data.drop(['school', 'conf', 'coach'],axis =1)
num_attribs = list(data_num)
# cat_attribs = ['school','conf','coach']
cat_attribs = ['conf']
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
# ('attribs_addr', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
full_pipeline = ColumnTransformer([
('num', num_pipeline, num_attribs),
('cat', OneHotEncoder(), cat_attribs)
])
data_prepared = full_pipeline.fit_transform(data)
# data.dtypes
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
def display_scores(scores):
print("Scores:", scores)
print("Mean:", scores.mean())
print("Stardard Deviation:", scores.std())
With only win-loss Scores: [ 627138.42590247 528461.92031606 740615.40460434 772653.8213664 421934.01610918 189714.87363032 913332.69201488 1110949.79108439 559555.59486329 1070420.75902697] Mean: 693477.7298918304 Stardard Deviation: 273911.52769763477
scores = cross_val_score(lin_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
lin_reg_rmse_scores = np.sqrt(-scores)
display_scores(lin_reg_rmse_scores)
data_whole = data_for_m.copy()
data_whole_y = data_whole['school_pay']
data_whole_X = data_whole.drop(['school_pay', 'school','coach'], axis=1)
# data_whole_prepared = full_pipeline.fit_transform(data_whole_X)
# lin_reg.fit(data_whole_prepared, data_whole_y)
# lin_reg.fit()
# data_whole_X
# lin_reg.fit(data_whole_X, data_whole_y)
# post_prediction = data_for_m.copy()
# post_prediction['prediction'] = lin_reg.predict(data_whole_prepared)
# post_prediction['prediction_clear'] = post_prediction.apply(lambda x: '%.1f' % x['prediction'], axis=1)
# post_prediction
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(data_prepared, data_labels)
With only win-loss Scores: [1061315.48217972 388101.64569291 838352.48061063 1310105.90729247 761111.21968192 813957.7190166 1191476.11762561 1072436.80091342 1048970.92139129 1632316.56249617] Mean: 1011814.4856900737 Stardard Deviation: 321697.2361820451
scores = cross_val_score(tree_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)
display_scores(tree_reg_rmse_scores)
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(data_prepared, data_labels)
With only win-loss Scores: [ 673351.8422406 606464.01825049 425498.26999178 1090543.6350158 690811.54849218 850426.329261 839837.59854235 1300692.87382977 816062.72166777 1095827.29603885] Mean: 838951.6133330595 Stardard Deviation: 248497.0665433372
scores = cross_val_score(forest_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
forest_reg_rmse_scores = np.sqrt(-scores)
display_scores(forest_reg_rmse_scores)
import statsmodels.formula.api as smf
data = data_for_m.copy()
my_model = str('school_pay ~ bonus + rank + buyout')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
my_model = str('school_pay ~ conf + bonus + buyout')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
1.489e+06 + 8.803e+05
# + -0.3289(bonus) + 0.1225(buyout)
my_model = str('school_pay ~ conf + buyout')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
1.243e+06 + 7.582e+05
# + 0.1213(buyout)
my_model = str('school_pay ~ conf')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
1.767e+06 + 1.838e+06
my_model = str('school_pay ~ conf + ratio')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
6.742e+05 + 1.765e+06 + (2.061e+06*0.417)
my_model = str('school_pay ~ conf + rank')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
2.467e+06 + 1.72e+06 + (-1.154e+04*81)
my_model = str('school_pay ~ conf + stad_size + grad_rate')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1] * syracuse['stad_size'])
# train_model_fit.params[-1]
syracuse = data[data['school'] == 'Syracuse']
syracuse
data = data_for_m.copy()
data[data['school'] == 'Syracuse']
len(data)