1.24.20
import pandas as pd
data = pd.read_csv("Coaches9.csv")
attrs = list(data[data.columns[3:]].columns)
data[attrs] = data[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
data.isnull().sum()
todrop = data[data['SchoolPay'].isnull()]
todrop.School.values
data.dropna(subset=['SchoolPay'], inplace=True)
data.drop('BonusPaid', axis=1, inplace=True)
median_bonus = data['Bonus'].median()
median_buyout = data['Buyout'].median()
data["Bonus"].fillna(median_bonus, inplace = True)
data["Buyout"].fillna(median_buyout, inplace = True)
data.drop(['TotalPay','AssistantPay'], axis=1, inplace=True)
data.head()
data.to_csv('dataset_1_coaches9.csv', index=False)
syr = data[data['School'] == 'Syracuse']
data_nosyr = data[data['School'] != 'Syracuse']
data.columns = "school conference coach school_pay bonus buyout".split()
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
import statsmodels.formula.api as smf
my_model = str('school_pay ~ conference')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
train_model_fit.params[0] + train_model_fit.params[1]
syr
train_model_fit.params[0] + train_model_fit.params[3]
conf_means = pd.DataFrame(data.groupby('conference')['school_pay'].mean())
conf_means_dict = conf_means.T.to_dict('records')
data['conf_mean_pay'] = data.apply(lambda x: conf_means_dict[0][(x['conference'])], axis=1)
data
data.to_csv('dataset_2_coaches9.csv', index=False)
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conf_mean_pay')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*3.409629e+06)
train_model_fit.params[0] + (train_model_fit.params[1]*3.409629e+06)
data
data['school_key'] = data.apply(lambda x: ''.join(x['school'].lower().split(' '))[:8], axis=1)
winloss = pd.read_csv('winloss_withkey.csv')
test = data.copy()
data_wl = test.merge(winloss, left_on="school_key", right_on="team_key")
data_wl.to_csv('dataset_3_winloss.csv', index=False)
data = data_wl.copy()
attr = 'Rank + conf_mean_pay '
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conference +' + attr)
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
syr = data[data['school'] == 'Syracuse']
# train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*3.409629e+06)
syr
# train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr])
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-2]*syr['Rank']) + (train_model_fit.params[-1]*syr['conf_mean_pay'])
df2 = pd.read_csv('../LAB3/kmodify_2018.csv')
df2.columns
syr = df2[df2['school'] == 'Syracuse']
curious_about = ['enrollment', 'stad_size', 'gsr', 'seat_rank', 'gsr_rank', 'combo_rank',
'true_rank', 'rank_attend', 'total_attend', 'avg_attend', 'stad_size2',
'rank_2018', 'school_pay_2018',
'max_bonus_2018', 'bonuses_paid_2018', 'asst_pay_total_2018',
'buyout_2018', 'W_2018', 'L_2018', 'C_2018', 'N_2018', 'P_2018',
'AVG_PTS_FOR_2018', 'AVG_PTS_AGT_2018', 'AVG_MGN_2018',
'AVE_RUN_YPG_FOR_2018', 'AVG_PAS_YPG_FOR_2018', 'AVG_TOT_YPG_FOR_2018',
'AVG_RUN_YPG_AGT_2018', 'AVG_PAS_YPG_AGT_2018', 'AVG_TOT_YPG_AGT_2018',
'AVG_NET_YPG_2018', 'NET_TO_2018', 'AVG_PWR_RTG_2018',
'AVG_OPP_PWR_RTG_2018', 'NET_PWR_RTG_CHG_2018', 'LINE_POWR_2018',
'GAME_PLAY_POWR_2018', 'COMP_POWR_2018', 'X_FCTR_2018',
'vegas_year_2018']
syr = df2[df2['school'] == 'Syracuse']
df2_nosyr = df2[df2['school'] != 'Syracuse']
df2_nosyr_labels = df2_nosyr['school_pay_2018']
df2_nosyr_nolabels = df2_nosyr.drop('school_pay_2018', axis=1)
# df2_labels = df2.drop('school_pay', axis=1)
for attr in curious_about:
# train_set, test_set = train_test_split(df2, test_size=0.2, random_state=42)
my_model = str('school_pay_2018 ~ conf_y + '+ attr)
train_model_fit = smf.ols(my_model, data = df2_nosyr).fit()
# print(train_model_fit.summary())
print('MODEL: predicting school_pay with conf and', attr, train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr]))
train_set, test_set = train_test_split(df2, test_size=0.2, random_state=42)
my_model = str('school_pay_2018 ~ conf_y + rank_2018')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
# print('MODEL: predicting school_pay with conf and', attr, train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr]))
print(train_model_fit.params[0] + (train_model_fit.params[-1]*syr['rank_2018']))
2.806250e+06
syr = df2[df2['school'] == 'Syracuse']
syr
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr])
syr['school_pay_2018']
attrs = [
'enrollment',
'stad_size',
'gsr',
'seat_rank',
'gsr_rank',
'combo_rank',
'true_rank',
'rank_attend',
'total_attend',
'avg_attend',
'stad_size2',
'pct_capacity',
'rank_2019',
'conf_2019',
'coach_2019',
'max_bonus_2019',
'bonuses_paid_2019',
'asst_pay_total_2019']
# attrs = [
# 'enrollment']
# for attr in attrs:
# train_set, test_set = train_test_split(df2, test_size=0.2, random_state=42)
# my_model = str('school_pay_2019 ~ conf_y + ' + attr)
# train_model_fit = smf.ols(my_model, data = train_set).fit()
# print(attr, "--",train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr]))
df2.dtypes
# kcoach2016[attrs] = kcoach2016[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
from pandas.plotting import scatter_matrix
attributes = ['enrollment', 'stad_size']
scatter_matrix(df2[attributes])
attributes = ['enrollment', 'gsr']
scatter_matrix(df2[attributes])
attributes = ['enrollment', 'total_attend']
scatter_matrix(df2[attributes])
attributes = ['enrollment', 'avg_attend']
scatter_matrix(df2[attributes])
df3 = pd.read_csv('v2_coaches_modify.csv')
df3
# attrs = [' NCAAFBREV16 ', 'MedianConfSal',
# ' SchoolPay ', ' TotalPay ', ' Bonus ', ' BonusPaid ',
# ' PayPlusBonus2016 ', ' StadSize ', ' Graduation Rate (GSR) ',
# ' Seat Rank ', ' GSRank ', ' Combo Rank ', ' TrueRank ', 'W', 'L',
# ' Ratio ', 'OffenceScore', 'Defense Score', 'Score', 'PointsPerGame']
# for attr in attrs:
# if df3[attr].dtypes != 'float64':
# df3[attr] = df3[attr].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
df3.columns
for i,col in enumerate(df3.columns):
if df3[col].dtype == 'float64':
median = df3[col].median()
df3[col].fillna(median, inplace = True)
df3.isnull().sum()
df3.drop('ncaafbrev16', axis=1, inplace=True)
df3.drop('total_pay', axis=1, inplace=True)
df3.columns
syr = df3[df3['school'] == 'Syracuse']
syr
curious_about = ['median_conf_sal', 'max_bonus',
'bonus_paid', 'pay_plus_bonus', 'stad_size', 'grad_rate', 'seat_rank',
'grad_rate_rank', 'combo', 'true_rank', 'w', 'l', 'ratio',
'offensive_score', 'defensive_score', 'score', 'points_per_game']
for attr in curious_about:
train_set, test_set = train_test_split(df3, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conf + '+ attr)
train_model_fit = smf.ols(my_model, data = train_set).fit()
# print(train_model_fit.summary())
print('MODEL: predicting school_pay with conf and', attr, (train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr])))
train_set, test_set = train_test_split(df3, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conf + score')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
train_model_fit.params[0] + train_model_fit.params[3] + (train_model_fit.params[-1]*syr['score'])
syr['school_pay']
2.431316e+06
3.587410e+06
train_model_fit.params[0] + (train_model_fit.params[-1]*syr['score'])
1.443604e+06