# 1.24.20
import pandas as pd
# Load the raw coaches table.
data = pd.read_csv("Coaches9.csv")

# Every column from the fourth onward is parsed as a number below.
attrs = list(data.columns[3:])

# Strip "$" and "," from the values, turn cells matching "--" into missing
# values, then cast to float. The "$" pattern is a raw string so that "\$" is
# not interpreted as an (invalid) Python string escape.
data[attrs] = (
    data[attrs]
    .replace({r'\$': '', ',': '', '--': None}, regex=True)
    .astype('float')
)

# Bare expressions like this one only display a value in a notebook/REPL;
# they have no effect when the file is run as a script.
data.isnull().sum()

# Rows without a SchoolPay value: listed for inspection, then removed.
todrop = data[data['SchoolPay'].isnull()]
todrop.School.values
data = data.dropna(subset=['SchoolPay'])
data = data.drop(columns='BonusPaid')

# Fill missing Bonus/Buyout with the column medians. Filling through the
# DataFrame (rather than `data["Bonus"].fillna(..., inplace=True)`) makes sure
# the frame itself is updated; the chained in-place form operates on a
# temporary Series and is not guaranteed to write back.
median_bonus = data['Bonus'].median()
median_buyout = data['Buyout'].median()
data = data.fillna({"Bonus": median_bonus, "Buyout": median_buyout})

data = data.drop(columns=['TotalPay', 'AssistantPay'])
data.head()
data.to_csv('dataset_1_coaches9.csv', index=False)
# Keep the Syracuse row(s) aside (selected before the rename below) and build
# a frame with Syracuse removed.
syr = data[data['School'] == 'Syracuse']
# NOTE(review): data_nosyr is not referenced again in the visible script; the
# models below are fit on splits of `data`, which still contains Syracuse.
data_nosyr = data[data['School'] != 'Syracuse']

# Positional rename: requires exactly six columns, in this order.
data.columns = ['school', 'conference', 'coach', 'school_pay', 'bonus', 'buyout']

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

import statsmodels.formula.api as smf
# OLS of pay on conference (formula interface), fit on the training split.
my_model = 'school_pay ~ conference'
train_model_fit = smf.ols(my_model, data=train_set).fit()
print(train_model_fit.summary())

# Sum of the first fitted parameter and one other parameter, selected by
# position. `.iloc` is used because `params` is indexed by term name, and
# integer keys on a label-indexed Series are a deprecated positional fallback.
# NOTE(review): which conference sits at positions 1 and 3 depends on the
# levels present in train_set -- confirm against the printed summary.
train_model_fit.params.iloc[0] + train_model_fit.params.iloc[1]
syr
train_model_fit.params.iloc[0] + train_model_fit.params.iloc[3]
# Mean school_pay per conference, as a one-column frame indexed by conference.
# NOTE(review): the means are computed from every row of `data`, including rows
# that end up in the test split created afterwards -- confirm this is intended.
conf_means = data.groupby('conference')['school_pay'].mean().to_frame()

# The same means as a single {conference: mean} dict wrapped in a list.
conf_means_dict = conf_means.T.to_dict('records')

# Attach each row's conference mean with a vectorised lookup instead of a
# row-by-row apply; a conference missing from the mapping yields NaN rather
# than raising KeyError.
data['conf_mean_pay'] = data['conference'].map(conf_means_dict[0])
data
data.to_csv('dataset_2_coaches9.csv', index=False)
# Re-split now that `data` carries conf_mean_pay, then regress pay on it.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = 'school_pay ~ conf_mean_pay'
train_model_fit = smf.ols(my_model, data=train_set).fit()
print(train_model_fit.summary())

# conf_mean_pay value plugged into the fitted line below.
# NOTE(review): presumably the conference mean for Syracuse's conference --
# confirm, and consider reading it from `data` instead of hard-coding it.
conf_mean_pay_value = 3.409629e+06

# Parameters are selected with `.iloc` because `params` is indexed by term
# name; integer keys on such a Series are a deprecated positional fallback.
# NOTE(review): with a single predictor, positions 1 and -1 should be the same
# coefficient, so this first expression adds the slope once unscaled on top of
# slope * value. The second expression is the plain intercept + slope * value.
train_model_fit.params.iloc[0] + train_model_fit.params.iloc[1] + (train_model_fit.params.iloc[-1] * conf_mean_pay_value)
train_model_fit.params.iloc[0] + (train_model_fit.params.iloc[1] * conf_mean_pay_value)
data
# Join key: lower-cased school name with spaces removed, cut to 8 characters.
# Vectorised string methods replace the row-wise apply and leave missing
# school names as NaN instead of raising.
data['school_key'] = (
    data['school'].str.lower().str.replace(' ', '', regex=False).str[:8]
)

winloss = pd.read_csv('winloss_withkey.csv')
test = data.copy()
# Default (inner) merge: rows whose school_key has no matching team_key in
# winloss are dropped from the result.
data_wl = test.merge(winloss, left_on="school_key", right_on="team_key")
data_wl.to_csv('dataset_3_winloss.csv', index=False)
data = data_wl.copy()
# Extra predictors appended to the conference model.
# NOTE(review): 'Rank' is presumably a column brought in by the winloss merge
# -- confirm.
attr = 'Rank + conf_mean_pay '
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = 'school_pay ~ conference +' + attr
train_model_fit = smf.ols(my_model, data=train_set).fit()
print(train_model_fit.summary())

# Syracuse row(s) from the merged frame (renamed columns, so 'school').
syr = data[data['school'] == 'Syracuse']
syr

# Hand-computed estimate for Syracuse: first parameter + parameter at
# position 1 + the last two parameters times Syracuse's Rank and
# conf_mean_pay. `.iloc` is used because `params` is indexed by term name;
# integer keys on such a Series are a deprecated positional fallback.
# NOTE(review): this assumes position 1 is the dummy for Syracuse's conference
# and that the last two positions are Rank and conf_mean_pay -- confirm
# against the printed summary; train_model_fit.predict(syr) would avoid
# depending on parameter order.
(
    train_model_fit.params.iloc[0]
    + train_model_fit.params.iloc[1]
    + (train_model_fit.params.iloc[-2] * syr['Rank'])
    + (train_model_fit.params.iloc[-1] * syr['conf_mean_pay'])
)