IST 718 | LAB 3 (V3)

1.24.20

In [85]:
import pandas as pd
data = pd.read_csv("Coaches9.csv")
attrs = list(data[data.columns[3:]].columns)
data[attrs] = data[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
data.isnull().sum()
Out[85]:
School           0
Conference       0
Coach            0
SchoolPay        4
TotalPay         4
Bonus           22
BonusPaid       41
AssistantPay     0
Buyout          22
dtype: int64
In [86]:
todrop = data[data['SchoolPay'].isnull()]
todrop.School.values
Out[86]:
array(['Baylor', 'Brigham Young', 'Rice', 'Southern Methodist'],
      dtype=object)
In [87]:
data.dropna(subset=['SchoolPay'], inplace=True)
In [88]:
data.drop('BonusPaid', axis=1, inplace=True)
In [89]:
median_bonus = data['Bonus'].median()
median_buyout = data['Buyout'].median()
data["Bonus"].fillna(median_bonus, inplace = True)
data["Buyout"].fillna(median_buyout, inplace = True)
In [90]:
data.drop(['TotalPay','AssistantPay'], axis=1, inplace=True) 
In [91]:
data.head()
Out[91]:
School Conference Coach SchoolPay Bonus Buyout
0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 4000000.0
1 Akron MAC Terry Bowden 411000.0 225000.0 688500.0
2 Alabama SEC Nick Saban 8307000.0 1100000.0 33600000.0
3 Alabama at Birmingham C-USA Bill Clark 900000.0 950000.0 3847500.0
4 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 2160417.0
In [92]:
data.to_csv('dataset_1_coaches9.csv', index=False)
In [93]:
syr = data[data['School'] == 'Syracuse']
data_nosyr = data[data['School'] != 'Syracuse']

data.columns = "school conference coach school_pay bonus buyout".split()
In [94]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
In [95]:
import statsmodels.formula.api as smf
my_model = str('school_pay ~ conference')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             school_pay   R-squared:                       0.682
Model:                            OLS   Adj. R-squared:                  0.646
Method:                 Least Squares   F-statistic:                     19.05
Date:                Fri, 24 Jan 2020   Prob (F-statistic):           3.47e-18
Time:                        17:44:22   Log-Likelihood:                -1527.3
No. Observations:                 100   AIC:                             3077.
Df Residuals:                      89   BIC:                             3105.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               1.718e+06   3.67e+05      4.676      0.000    9.88e+05    2.45e+06
conference[T.ACC]       1.715e+06   4.86e+05      3.529      0.001    7.49e+05    2.68e+06
conference[T.Big 12]    2.588e+06   5.35e+05      4.834      0.000    1.52e+06    3.65e+06
conference[T.Big Ten]   2.663e+06   4.95e+05      5.376      0.000    1.68e+06    3.65e+06
conference[T.C-USA]    -8.334e+05   5.06e+05     -1.646      0.103   -1.84e+06    1.73e+05
conference[T.Ind.]     -9.513e+05   7.35e+05     -1.295      0.199   -2.41e+06    5.08e+05
conference[T.MAC]      -1.136e+06   5.19e+05     -2.187      0.031   -2.17e+06   -1.04e+05
conference[T.Mt. West] -6.646e+05   5.06e+05     -1.312      0.193   -1.67e+06    3.42e+05
conference[T.Pac-12]    1.139e+06   4.86e+05      2.343      0.021    1.73e+05     2.1e+06
conference[T.SEC]       2.817e+06   5.35e+05      5.260      0.000    1.75e+06    3.88e+06
conference[T.Sun Belt] -1.065e+06   5.35e+05     -1.988      0.050   -2.13e+06    -679.708
==============================================================================
Omnibus:                       21.218   Durbin-Watson:                   1.752
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               39.205
Skew:                           0.849   Prob(JB):                     3.07e-09
Kurtosis:                       5.554   Cond. No.                         12.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [96]:
train_model_fit.params[0] + train_model_fit.params[1] 
Out[96]:
3432823.000000003
In [97]:
syr
Out[97]:
School Conference Coach SchoolPay Bonus Buyout
102 Syracuse ACC Dino Babers 2401206.0 775000.0 4000000.0
In [98]:
train_model_fit.params[0] + train_model_fit.params[3] 
Out[98]:
4380583.363636367
In [101]:
conf_means = pd.DataFrame(data.groupby('conference')['school_pay'].mean())
conf_means_dict = conf_means.T.to_dict('records')
data['conf_mean_pay'] = data.apply(lambda x: conf_means_dict[0][(x['conference'])], axis=1)
In [102]:
data
Out[102]:
school conference coach school_pay bonus buyout conf_mean_pay
0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 4000000.0 1.000642e+06
1 Akron MAC Terry Bowden 411000.0 225000.0 688500.0 5.798362e+05
2 Alabama SEC Nick Saban 8307000.0 1100000.0 33600000.0 4.642080e+06
3 Alabama at Birmingham C-USA Bill Clark 900000.0 950000.0 3847500.0 8.395185e+05
4 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 2160417.0 6.506500e+05
... ... ... ... ... ... ... ...
124 West Virginia Big 12 Dana Holgorsen 3605000.0 1000000.0 7150000.0 4.016756e+06
125 Western Kentucky C-USA Mike Sanford Jr. 800000.0 400000.0 1200000.0 8.395185e+05
126 Western Michigan MAC Tim Lester 800000.0 346500.0 800000.0 5.798362e+05
127 Wisconsin Big Ten Paul Chryst 3750000.0 775000.0 6000000.0 4.304014e+06
128 Wyoming Mt. West Craig Bohl 1412000.0 450000.0 8016667.0 1.000642e+06

125 rows × 7 columns

In [103]:
data.to_csv('dataset_2_coaches9.csv', index=False)
In [115]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conf_mean_pay')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             school_pay   R-squared:                       0.679
Model:                            OLS   Adj. R-squared:                  0.676
Method:                 Least Squares   F-statistic:                     207.3
Date:                Fri, 24 Jan 2020   Prob (F-statistic):           6.43e-26
Time:                        17:52:23   Log-Likelihood:                -1527.7
No. Observations:                 100   AIC:                             3059.
Df Residuals:                      98   BIC:                             3065.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      5414.2909   1.98e+05      0.027      0.978   -3.88e+05    3.99e+05
conf_mean_pay     1.0123      0.070     14.397      0.000       0.873       1.152
==============================================================================
Omnibus:                       17.425   Durbin-Watson:                   1.746
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               30.306
Skew:                           0.716   Prob(JB):                     2.63e-07
Kurtosis:                       5.285   Cond. No.                     5.29e+06
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [111]:
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*3.409629e+06)
Out[111]:
3432823.1443898478
In [116]:
train_model_fit.params[0] + (train_model_fit.params[1]*3.409629e+06)
Out[116]:
3457075.752712704
In [117]:
data
Out[117]:
school conference coach school_pay bonus buyout conf_mean_pay
0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 4000000.0 1.000642e+06
1 Akron MAC Terry Bowden 411000.0 225000.0 688500.0 5.798362e+05
2 Alabama SEC Nick Saban 8307000.0 1100000.0 33600000.0 4.642080e+06
3 Alabama at Birmingham C-USA Bill Clark 900000.0 950000.0 3847500.0 8.395185e+05
4 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 2160417.0 6.506500e+05
... ... ... ... ... ... ... ...
124 West Virginia Big 12 Dana Holgorsen 3605000.0 1000000.0 7150000.0 4.016756e+06
125 Western Kentucky C-USA Mike Sanford Jr. 800000.0 400000.0 1200000.0 8.395185e+05
126 Western Michigan MAC Tim Lester 800000.0 346500.0 800000.0 5.798362e+05
127 Wisconsin Big Ten Paul Chryst 3750000.0 775000.0 6000000.0 4.304014e+06
128 Wyoming Mt. West Craig Bohl 1412000.0 450000.0 8016667.0 1.000642e+06

125 rows × 7 columns

Adding Win Loss

In [124]:
data['school_key'] = data.apply(lambda x: ''.join(x['school'].lower().split(' '))[:8], axis=1)
winloss = pd.read_csv('winloss_withkey.csv')
In [125]:
test = data.copy()
In [126]:
data_wl = test.merge(winloss, left_on="school_key", right_on="team_key")
In [128]:
data_wl.to_csv('dataset_3_winloss.csv', index=False)
In [158]:
data = data_wl.copy()
attr = 'Rank + conf_mean_pay '
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conference +' + attr)
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             school_pay   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     21.88
Date:                Fri, 24 Jan 2020   Prob (F-statistic):           1.06e-20
Time:                        18:07:04   Log-Likelihood:                -1526.0
No. Observations:                 100   AIC:                             3076.
Df Residuals:                      88   BIC:                             3107.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               7.806e+05   5.97e+05      1.309      0.194   -4.05e+05    1.97e+06
conference[T.ACC]       2.686e+04   4.61e+05      0.058      0.954   -8.88e+05    9.42e+05
conference[T.Big 12]   -2.127e+05    4.2e+05     -0.506      0.614   -1.05e+06    6.22e+05
conference[T.Big Ten]   6.048e+04   3.74e+05      0.162      0.872   -6.83e+05    8.04e+05
conference[T.C-USA]    -2.473e+05   6.96e+05     -0.355      0.723   -1.63e+06    1.14e+06
conference[T.Ind.]      4.471e+04   7.32e+05      0.061      0.951   -1.41e+06     1.5e+06
conference[T.MAC]       1.624e+04    6.6e+05      0.025      0.980    -1.3e+06    1.33e+06
conference[T.Mt. West]  1.683e+05   6.12e+05      0.275      0.784   -1.05e+06    1.38e+06
conference[T.Pac-12]   -1.342e+05   4.85e+05     -0.277      0.783    -1.1e+06     8.3e+05
conference[T.SEC]      -3.392e+05   3.68e+05     -0.921      0.359   -1.07e+06    3.92e+05
conference[T.Sun Belt] -1.309e+05   5.97e+05     -0.219      0.827   -1.32e+06    1.06e+06
Rank                   -1.142e+04   2916.734     -3.915      0.000   -1.72e+04   -5623.815
conf_mean_pay              1.0220      0.090     11.370      0.000       0.843       1.201
==============================================================================
Omnibus:                       11.720   Durbin-Watson:                   1.808
Prob(Omnibus):                  0.003   Jarque-Bera (JB):               15.401
Skew:                           0.585   Prob(JB):                     0.000453
Kurtosis:                       4.525   Cond. No.                     2.95e+22
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.04e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [159]:
syr = data[data['school'] == 'Syracuse']
# train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*3.409629e+06)
In [160]:
syr
Out[160]:
school conference coach school_pay bonus buyout conf_mean_pay school_key Rank Team W L T Pct team_key
98 Syracuse ACC Dino Babers 2401206.0 775000.0 4000000.0 3.409629e+06 syracuse 81 Syracuse (ACC) 5 7 0 0.417 syracuse
In [162]:
# train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr])
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-2]*syr['Rank']) + (train_model_fit.params[-1]*syr['conf_mean_pay'])
Out[162]:
98    3.366984e+06
dtype: float64
In [ ]:
 
In [ ]: