IST 718 | LAB 3 (V3)

1.24.20

In [85]:
import pandas as pd
data = pd.read_csv("Coaches9.csv")
attrs = list(data[data.columns[3:]].columns)
data[attrs] = data[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
data.isnull().sum()
Out[85]:
School           0
Conference       0
Coach            0
SchoolPay        4
TotalPay         4
Bonus           22
BonusPaid       41
AssistantPay     0
Buyout          22
dtype: int64
In [86]:
todrop = data[data['SchoolPay'].isnull()]
todrop.School.values
Out[86]:
array(['Baylor', 'Brigham Young', 'Rice', 'Southern Methodist'],
      dtype=object)
In [87]:
data.dropna(subset=['SchoolPay'], inplace=True)
In [88]:
data.drop('BonusPaid', axis=1, inplace=True)
In [89]:
median_bonus = data['Bonus'].median()
median_buyout = data['Buyout'].median()
data["Bonus"].fillna(median_bonus, inplace = True)
data["Buyout"].fillna(median_buyout, inplace = True)
In [90]:
data.drop(['TotalPay','AssistantPay'], axis=1, inplace=True) 
In [91]:
data.head()
Out[91]:
School Conference Coach SchoolPay Bonus Buyout
0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 4000000.0
1 Akron MAC Terry Bowden 411000.0 225000.0 688500.0
2 Alabama SEC Nick Saban 8307000.0 1100000.0 33600000.0
3 Alabama at Birmingham C-USA Bill Clark 900000.0 950000.0 3847500.0
4 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 2160417.0
In [92]:
data.to_csv('dataset_1_coaches9.csv', index=False)
In [93]:
syr = data[data['School'] == 'Syracuse']
data_nosyr = data[data['School'] != 'Syracuse']

data.columns = "school conference coach school_pay bonus buyout".split()
In [94]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
In [95]:
import statsmodels.formula.api as smf
my_model = str('school_pay ~ conference')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             school_pay   R-squared:                       0.682
Model:                            OLS   Adj. R-squared:                  0.646
Method:                 Least Squares   F-statistic:                     19.05
Date:                Fri, 24 Jan 2020   Prob (F-statistic):           3.47e-18
Time:                        17:44:22   Log-Likelihood:                -1527.3
No. Observations:                 100   AIC:                             3077.
Df Residuals:                      89   BIC:                             3105.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               1.718e+06   3.67e+05      4.676      0.000    9.88e+05    2.45e+06
conference[T.ACC]       1.715e+06   4.86e+05      3.529      0.001    7.49e+05    2.68e+06
conference[T.Big 12]    2.588e+06   5.35e+05      4.834      0.000    1.52e+06    3.65e+06
conference[T.Big Ten]   2.663e+06   4.95e+05      5.376      0.000    1.68e+06    3.65e+06
conference[T.C-USA]    -8.334e+05   5.06e+05     -1.646      0.103   -1.84e+06    1.73e+05
conference[T.Ind.]     -9.513e+05   7.35e+05     -1.295      0.199   -2.41e+06    5.08e+05
conference[T.MAC]      -1.136e+06   5.19e+05     -2.187      0.031   -2.17e+06   -1.04e+05
conference[T.Mt. West] -6.646e+05   5.06e+05     -1.312      0.193   -1.67e+06    3.42e+05
conference[T.Pac-12]    1.139e+06   4.86e+05      2.343      0.021    1.73e+05     2.1e+06
conference[T.SEC]       2.817e+06   5.35e+05      5.260      0.000    1.75e+06    3.88e+06
conference[T.Sun Belt] -1.065e+06   5.35e+05     -1.988      0.050   -2.13e+06    -679.708
==============================================================================
Omnibus:                       21.218   Durbin-Watson:                   1.752
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               39.205
Skew:                           0.849   Prob(JB):                     3.07e-09
Kurtosis:                       5.554   Cond. No.                         12.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [96]:
train_model_fit.params[0] + train_model_fit.params[1] 
Out[96]:
3432823.000000003
In [97]:
syr
Out[97]:
School Conference Coach SchoolPay Bonus Buyout
102 Syracuse ACC Dino Babers 2401206.0 775000.0 4000000.0
In [98]:
train_model_fit.params[0] + train_model_fit.params[3] 
Out[98]:
4380583.363636367
In [101]:
conf_means = pd.DataFrame(data.groupby('conference')['school_pay'].mean())
conf_means_dict = conf_means.T.to_dict('records')
data['conf_mean_pay'] = data.apply(lambda x: conf_means_dict[0][(x['conference'])], axis=1)
In [102]:
data
Out[102]:
school conference coach school_pay bonus buyout conf_mean_pay
0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 4000000.0 1.000642e+06
1 Akron MAC Terry Bowden 411000.0 225000.0 688500.0 5.798362e+05
2 Alabama SEC Nick Saban 8307000.0 1100000.0 33600000.0 4.642080e+06
3 Alabama at Birmingham C-USA Bill Clark 900000.0 950000.0 3847500.0 8.395185e+05
4 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 2160417.0 6.506500e+05
... ... ... ... ... ... ... ...
124 West Virginia Big 12 Dana Holgorsen 3605000.0 1000000.0 7150000.0 4.016756e+06
125 Western Kentucky C-USA Mike Sanford Jr. 800000.0 400000.0 1200000.0 8.395185e+05
126 Western Michigan MAC Tim Lester 800000.0 346500.0 800000.0 5.798362e+05
127 Wisconsin Big Ten Paul Chryst 3750000.0 775000.0 6000000.0 4.304014e+06
128 Wyoming Mt. West Craig Bohl 1412000.0 450000.0 8016667.0 1.000642e+06

125 rows × 7 columns

In [103]:
data.to_csv('dataset_2_coaches9.csv', index=False)
In [115]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conf_mean_pay')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             school_pay   R-squared:                       0.679
Model:                            OLS   Adj. R-squared:                  0.676
Method:                 Least Squares   F-statistic:                     207.3
Date:                Fri, 24 Jan 2020   Prob (F-statistic):           6.43e-26
Time:                        17:52:23   Log-Likelihood:                -1527.7
No. Observations:                 100   AIC:                             3059.
Df Residuals:                      98   BIC:                             3065.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      5414.2909   1.98e+05      0.027      0.978   -3.88e+05    3.99e+05
conf_mean_pay     1.0123      0.070     14.397      0.000       0.873       1.152
==============================================================================
Omnibus:                       17.425   Durbin-Watson:                   1.746
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               30.306
Skew:                           0.716   Prob(JB):                     2.63e-07
Kurtosis:                       5.285   Cond. No.                     5.29e+06
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.29e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [111]:
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*3.409629e+06)
Out[111]:
3432823.1443898478
In [116]:
train_model_fit.params[0] + (train_model_fit.params[1]*3.409629e+06)
Out[116]:
3457075.752712704
In [117]:
data
Out[117]:
school conference coach school_pay bonus buyout conf_mean_pay
0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 4000000.0 1.000642e+06
1 Akron MAC Terry Bowden 411000.0 225000.0 688500.0 5.798362e+05
2 Alabama SEC Nick Saban 8307000.0 1100000.0 33600000.0 4.642080e+06
3 Alabama at Birmingham C-USA Bill Clark 900000.0 950000.0 3847500.0 8.395185e+05
4 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 2160417.0 6.506500e+05
... ... ... ... ... ... ... ...
124 West Virginia Big 12 Dana Holgorsen 3605000.0 1000000.0 7150000.0 4.016756e+06
125 Western Kentucky C-USA Mike Sanford Jr. 800000.0 400000.0 1200000.0 8.395185e+05
126 Western Michigan MAC Tim Lester 800000.0 346500.0 800000.0 5.798362e+05
127 Wisconsin Big Ten Paul Chryst 3750000.0 775000.0 6000000.0 4.304014e+06
128 Wyoming Mt. West Craig Bohl 1412000.0 450000.0 8016667.0 1.000642e+06

125 rows × 7 columns

Adding Win Loss

In [124]:
data['school_key'] = data.apply(lambda x: ''.join(x['school'].lower().split(' '))[:8], axis=1)
winloss = pd.read_csv('winloss_withkey.csv')
In [125]:
test = data.copy()
In [126]:
data_wl = test.merge(winloss, left_on="school_key", right_on="team_key")
In [128]:
data_wl.to_csv('dataset_3_winloss.csv', index=False)
In [158]:
data = data_wl.copy()
attr = 'Rank + conf_mean_pay '
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conference +' + attr)
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             school_pay   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     21.88
Date:                Fri, 24 Jan 2020   Prob (F-statistic):           1.06e-20
Time:                        18:07:04   Log-Likelihood:                -1526.0
No. Observations:                 100   AIC:                             3076.
Df Residuals:                      88   BIC:                             3107.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               7.806e+05   5.97e+05      1.309      0.194   -4.05e+05    1.97e+06
conference[T.ACC]       2.686e+04   4.61e+05      0.058      0.954   -8.88e+05    9.42e+05
conference[T.Big 12]   -2.127e+05    4.2e+05     -0.506      0.614   -1.05e+06    6.22e+05
conference[T.Big Ten]   6.048e+04   3.74e+05      0.162      0.872   -6.83e+05    8.04e+05
conference[T.C-USA]    -2.473e+05   6.96e+05     -0.355      0.723   -1.63e+06    1.14e+06
conference[T.Ind.]      4.471e+04   7.32e+05      0.061      0.951   -1.41e+06     1.5e+06
conference[T.MAC]       1.624e+04    6.6e+05      0.025      0.980    -1.3e+06    1.33e+06
conference[T.Mt. West]  1.683e+05   6.12e+05      0.275      0.784   -1.05e+06    1.38e+06
conference[T.Pac-12]   -1.342e+05   4.85e+05     -0.277      0.783    -1.1e+06     8.3e+05
conference[T.SEC]      -3.392e+05   3.68e+05     -0.921      0.359   -1.07e+06    3.92e+05
conference[T.Sun Belt] -1.309e+05   5.97e+05     -0.219      0.827   -1.32e+06    1.06e+06
Rank                   -1.142e+04   2916.734     -3.915      0.000   -1.72e+04   -5623.815
conf_mean_pay              1.0220      0.090     11.370      0.000       0.843       1.201
==============================================================================
Omnibus:                       11.720   Durbin-Watson:                   1.808
Prob(Omnibus):                  0.003   Jarque-Bera (JB):               15.401
Skew:                           0.585   Prob(JB):                     0.000453
Kurtosis:                       4.525   Cond. No.                     2.95e+22
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.04e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [159]:
syr = data[data['school'] == 'Syracuse']
# train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*3.409629e+06)
In [160]:
syr
Out[160]:
school conference coach school_pay bonus buyout conf_mean_pay school_key Rank Team W L T Pct team_key
98 Syracuse ACC Dino Babers 2401206.0 775000.0 4000000.0 3.409629e+06 syracuse 81 Syracuse (ACC) 5 7 0 0.417 syracuse
In [162]:
# train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr])
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-2]*syr['Rank']) + (train_model_fit.params[-1]*syr['conf_mean_pay'])
Out[162]:
98    3.366984e+06
dtype: float64

Adding Additional Data

In [269]:
df2 = pd.read_csv('../LAB3/kmodify_2018.csv')
df2.columns
syr = df2[df2['school'] == 'Syracuse']
In [270]:
curious_about = ['enrollment', 'stad_size', 'gsr', 'seat_rank', 'gsr_rank', 'combo_rank',
       'true_rank', 'rank_attend', 'total_attend', 'avg_attend', 'stad_size2',
       'rank_2018', 'school_pay_2018',
       'max_bonus_2018', 'bonuses_paid_2018', 'asst_pay_total_2018',
       'buyout_2018', 'W_2018', 'L_2018', 'C_2018', 'N_2018', 'P_2018',
       'AVG_PTS_FOR_2018', 'AVG_PTS_AGT_2018', 'AVG_MGN_2018',
       'AVE_RUN_YPG_FOR_2018', 'AVG_PAS_YPG_FOR_2018', 'AVG_TOT_YPG_FOR_2018',
       'AVG_RUN_YPG_AGT_2018', 'AVG_PAS_YPG_AGT_2018', 'AVG_TOT_YPG_AGT_2018',
       'AVG_NET_YPG_2018', 'NET_TO_2018', 'AVG_PWR_RTG_2018',
       'AVG_OPP_PWR_RTG_2018', 'NET_PWR_RTG_CHG_2018', 'LINE_POWR_2018',
       'GAME_PLAY_POWR_2018', 'COMP_POWR_2018', 'X_FCTR_2018',
       'vegas_year_2018']
In [279]:
syr = df2[df2['school'] == 'Syracuse']
df2_nosyr = df2[df2['school'] != 'Syracuse']
df2_nosyr_labels = df2_nosyr['school_pay_2018']
df2_nosyr_nolabels = df2_nosyr.drop('school_pay_2018', axis=1)
# df2_labels = df2.drop('school_pay', axis=1)
In [281]:
for attr in curious_about:
#     train_set, test_set = train_test_split(df2, test_size=0.2, random_state=42)
    my_model = str('school_pay_2018 ~ conf_y + '+ attr)
    train_model_fit = smf.ols(my_model, data = df2_nosyr).fit()
#     print(train_model_fit.summary())
    print('MODEL: predicting school_pay with conf and', attr, train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr]))
MODEL: predicting school_pay with conf and enrollment 81    3.426446e+06
Name: enrollment, dtype: float64
MODEL: predicting school_pay with conf and stad_size 81    3.097857e+06
Name: stad_size, dtype: float64
MODEL: predicting school_pay with conf and gsr 81    3.547032e+06
Name: gsr, dtype: float64
MODEL: predicting school_pay with conf and seat_rank 81    3.047003e+06
Name: seat_rank, dtype: float64
MODEL: predicting school_pay with conf and gsr_rank 81    3.570958e+06
Name: gsr_rank, dtype: float64
MODEL: predicting school_pay with conf and combo_rank 81    3.288697e+06
Name: combo_rank, dtype: float64
MODEL: predicting school_pay with conf and true_rank 81    3.362694e+06
Name: true_rank, dtype: float64
MODEL: predicting school_pay with conf and rank_attend 81    3.290928e+06
Name: rank_attend, dtype: float64
MODEL: predicting school_pay with conf and total_attend 81    3.210830e+06
Name: total_attend, dtype: float64
MODEL: predicting school_pay with conf and avg_attend 81    3.347712e+06
Name: avg_attend, dtype: float64
MODEL: predicting school_pay with conf and stad_size2 81    3.268970e+06
Name: stad_size2, dtype: float64
MODEL: predicting school_pay with conf and rank_2018 81    2.700787e+06
Name: rank_2018, dtype: float64
MODEL: predicting school_pay with conf and school_pay_2018 81    2401206.0
Name: school_pay_2018, dtype: float64
MODEL: predicting school_pay with conf and max_bonus_2018 81    3.491587e+06
Name: max_bonus_2018, dtype: float64
MODEL: predicting school_pay with conf and bonuses_paid_2018 81    3.225403e+06
Name: bonuses_paid_2018, dtype: float64
MODEL: predicting school_pay with conf and asst_pay_total_2018 81    3.479594e+06
Name: asst_pay_total_2018, dtype: float64
MODEL: predicting school_pay with conf and buyout_2018 81    2.675406e+06
Name: buyout_2018, dtype: float64
MODEL: predicting school_pay with conf and W_2018 81    4.047779e+06
Name: W_2018, dtype: float64
MODEL: predicting school_pay with conf and L_2018 81    4.103336e+06
Name: L_2018, dtype: float64
MODEL: predicting school_pay with conf and C_2018 81    3.479594e+06
Name: C_2018, dtype: float64
MODEL: predicting school_pay with conf and N_2018 81    3.716301e+06
Name: N_2018, dtype: float64
MODEL: predicting school_pay with conf and P_2018 81    3.315315e+06
Name: P_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_PTS_FOR_2018 81    3.479594e+06
Name: AVG_PTS_FOR_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_PTS_AGT_2018 81    4.196192e+06
Name: AVG_PTS_AGT_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_MGN_2018 81    3.572162e+06
Name: AVG_MGN_2018, dtype: float64
MODEL: predicting school_pay with conf and AVE_RUN_YPG_FOR_2018 81    4.050128e+06
Name: AVE_RUN_YPG_FOR_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_PAS_YPG_FOR_2018 81    3.496871e+06
Name: AVG_PAS_YPG_FOR_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_TOT_YPG_FOR_2018 81    3.690102e+06
Name: AVG_TOT_YPG_FOR_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_RUN_YPG_AGT_2018 81    3.858387e+06
Name: AVG_RUN_YPG_AGT_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_PAS_YPG_AGT_2018 81    3.622060e+06
Name: AVG_PAS_YPG_AGT_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_TOT_YPG_AGT_2018 81    3.269219e+06
Name: AVG_TOT_YPG_AGT_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_NET_YPG_2018 81    3.345199e+06
Name: AVG_NET_YPG_2018, dtype: float64
MODEL: predicting school_pay with conf and NET_TO_2018 81    3.659518e+06
Name: NET_TO_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_PWR_RTG_2018 81    3.731676e+06
Name: AVG_PWR_RTG_2018, dtype: float64
MODEL: predicting school_pay with conf and AVG_OPP_PWR_RTG_2018 81    3.351789e+06
Name: AVG_OPP_PWR_RTG_2018, dtype: float64
MODEL: predicting school_pay with conf and NET_PWR_RTG_CHG_2018 81    3.327357e+06
Name: NET_PWR_RTG_CHG_2018, dtype: float64
MODEL: predicting school_pay with conf and LINE_POWR_2018 81    3.175260e+06
Name: LINE_POWR_2018, dtype: float64
MODEL: predicting school_pay with conf and GAME_PLAY_POWR_2018 81    4.071811e+06
Name: GAME_PLAY_POWR_2018, dtype: float64
MODEL: predicting school_pay with conf and COMP_POWR_2018 81    3.839589e+06
Name: COMP_POWR_2018, dtype: float64
MODEL: predicting school_pay with conf and X_FCTR_2018 81    3.588508e+06
Name: X_FCTR_2018, dtype: float64
MODEL: predicting school_pay with conf and vegas_year_2018 81    3.479594e+06
Name: vegas_year_2018, dtype: float64
In [282]:
train_set, test_set = train_test_split(df2, test_size=0.2, random_state=42)
my_model = str('school_pay_2018 ~ conf_y + rank_2018')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
#     print('MODEL: predicting school_pay with conf and', attr, train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr]))
                            OLS Regression Results                            
==============================================================================
Dep. Variable:        school_pay_2018   R-squared:                       0.844
Model:                            OLS   Adj. R-squared:                  0.820
Method:                 Least Squares   F-statistic:                     35.32
Date:                Sat, 25 Jan 2020   Prob (F-statistic):           1.23e-24
Time:                        19:42:45   Log-Likelihood:                -1256.6
No. Observations:                  84   AIC:                             2537.
Df Residuals:                      72   BIC:                             2566.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
======================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept           5.647e+06   4.67e+05     12.082      0.000    4.72e+06    6.58e+06
conf_y[T.ACC]      -3.044e+05   4.27e+05     -0.713      0.478   -1.16e+06    5.46e+05
conf_y[T.Big 12]   -1.901e+05   4.64e+05     -0.409      0.683   -1.12e+06    7.35e+05
conf_y[T.Big Ten]     6.6e+04   4.38e+05      0.151      0.881   -8.07e+05    9.39e+05
conf_y[T.C-USA]    -5.705e+04   6.55e+05     -0.087      0.931   -1.36e+06    1.25e+06
conf_y[T.Ind.]      1.205e+06   5.84e+05      2.063      0.043    4.04e+04    2.37e+06
conf_y[T.MAC]       5.544e+05   4.36e+05      1.272      0.207   -3.15e+05    1.42e+06
conf_y[T.Mt. West]  7.829e+04   4.35e+05      0.180      0.858   -7.89e+05    9.45e+05
conf_y[T.Pac-12]   -3.614e+05   4.26e+05     -0.849      0.399   -1.21e+06    4.87e+05
conf_y[T.SEC]       3.125e+05   4.35e+05      0.719      0.475   -5.54e+05    1.18e+06
conf_y[T.Sun Belt]   3.14e+05    4.9e+05      0.641      0.524   -6.63e+05    1.29e+06
rank_2018          -5.165e+04   4986.881    -10.356      0.000   -6.16e+04   -4.17e+04
==============================================================================
Omnibus:                       42.242   Durbin-Watson:                   1.681
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              105.039
Skew:                           1.788   Prob(JB):                     1.55e-23
Kurtosis:                       7.151   Cond. No.                         808.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [284]:
print(train_model_fit.params[0] + (train_model_fit.params[-1]*syr['rank_2018']))
81    2.806250e+06
Name: rank_2018, dtype: float64
In [285]:
2.806250e+06
Out[285]:
2806250.0
In [206]:
syr = df2[df2['school'] == 'Syracuse']
syr
Out[206]:
school nickname city state enrollment conf_x conf_y coach stad_size gsr ... AVG_NET_YPG_2018 NET_TO_2018 AVG_PWR_RTG_2018 AVG_OPP_PWR_RTG_2018 NET_PWR_RTG_CHG_2018 LINE_POWR_2018 GAME_PLAY_POWR_2018 COMP_POWR_2018 X_FCTR_2018 vegas_year_2018
81 Syracuse Orange Syracuse New York 22900.0 ACC ACC Dino Babers 49250.0 80.0 ... 427.0 37.0 9.0 187.0 179.0 23.0 111.2 107.7 154.4 2018.0

1 rows × 50 columns

In [201]:
train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr])
Out[201]:
81    3.186998e+06
Name: gsr, dtype: float64
In [208]:
syr['school_pay_2018']
Out[208]:
81    2401206.0
Name: school_pay_2018, dtype: float64
In [188]:
attrs = [
 'enrollment',
 'stad_size',
 'gsr',
 'seat_rank',
 'gsr_rank',
 'combo_rank',
 'true_rank',
 'rank_attend',
 'total_attend',
 'avg_attend',
 'stad_size2',
 'pct_capacity',
 'rank_2019',
 'conf_2019',
 'coach_2019',
 'max_bonus_2019',
 'bonuses_paid_2019',
 'asst_pay_total_2019']

# attrs = [
#  'enrollment']
In [233]:
# for attr in attrs:
#     train_set, test_set = train_test_split(df2, test_size=0.2, random_state=42)
#     my_model = str('school_pay_2019 ~ conf_y + ' + attr)
#     train_model_fit = smf.ols(my_model, data = train_set).fit()
#     print(attr, "--",train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr]))
In [182]:
df2.dtypes
Out[182]:
school                  object
nickname                object
city                    object
state                   object
enrollment              object
conf_x                  object
conf_y                  object
coach                   object
stad_size               object
gsr                    float64
seat_rank              float64
gsr_rank               float64
combo_rank             float64
true_rank              float64
rank_attend            float64
conf_x.1                object
total_attend            object
avg_attend              object
stad_size2              object
pct_capacity           float64
rank_2019              float64
conf_2019               object
coach_2019              object
school_pay_2019        float64
max_bonus_2019         float64
bonuses_paid_2019      float64
asst_pay_total_2019    float64
dtype: object
In [194]:
# kcoach2016[attrs] = kcoach2016[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
from pandas.plotting import scatter_matrix
attributes = ['enrollment', 'stad_size']
scatter_matrix(df2[attributes])
Out[194]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a26b850d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2c3e9a50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a2c421d90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2c456a50>]],
      dtype=object)
In [195]:
attributes = ['enrollment', 'gsr']
scatter_matrix(df2[attributes])
Out[195]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a2c30c0d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2c525d50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a2c55e550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2c677d50>]],
      dtype=object)
In [196]:
attributes = ['enrollment', 'total_attend']
scatter_matrix(df2[attributes])
Out[196]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a2c4fffd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2c78e510>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a2c7c2c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2c8a6dd0>]],
      dtype=object)
In [197]:
attributes = ['enrollment', 'avg_attend']
scatter_matrix(df2[attributes])
Out[197]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a2c747950>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2ca04b50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a2c981e90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2ca9bb50>]],
      dtype=object)
In [219]:
df3 = pd.read_csv('v2_coaches_modify.csv')
In [220]:
df3
Out[220]:
school conf coach ncaafbrev16 median_conf_sal school_pay total_pay max_bonus bonus_paid pay_plus_bonus ... grad_rate_rank combo true_rank w l ratio offensive_score defensive_score score points_per_game
0 Air Force Mt. West Troy Calhoun $59,577,780 879288.0 885000.0 885000.0 247000.0 NaN 885000.0 ... 97.0 65.0 67.0 10.0 3.0 3.333333 4.96 -0.87 4.09 78.16
1 Akron MAC Terry Bowden $35,331,217 492413.0 411000.0 412500.0 225000.0 50000.0 462500.0 ... 5.0 11.0 129.0 5.0 7.0 0.714286 -0.40 -6.12 -6.52 72.64
2 Alabama SEC Nick Saban $174,307,419 3929800.0 8307000.0 8307000.0 1100000.0 500000.0 8807000.0 ... 84.0 111.0 5.0 14.0 1.0 14.000000 8.01 17.62 25.62 90.38
3 Appalachian State Sun Belt Scott Satterfield $35,058,621 675000.0 712500.0 712500.0 295000.0 145000.0 857500.0 ... 13.0 20.0 122.0 10.0 3.0 3.333333 -0.86 9.68 8.83 70.76
4 Arizona Pac-12 Kevin Sumlin $90,976,758 2752233.0 1600000.0 2000000.0 2025000.0 NaN 2000000.0 ... 60.0 66.0 62.0 3.0 9.0 0.333333 1.01 -5.64 -4.63 74.42
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
113 Washington State Pac-12 Mike Leach $64,294,520 2752233.0 3500000.0 3500000.0 725000.0 75000.0 3575000.0 ... 121.0 83.0 37.0 8.0 5.0 1.600000 5.70 3.33 9.03 84.22
114 West Virginia Big 12 Dana Holgorsen $110,565,870 3775000.0 3605000.0 3617500.0 1000000.0 90000.0 3707500.0 ... 20.0 66.0 62.0 10.0 3.0 3.333333 4.01 5.56 9.57 78.75
115 Western Michigan MAC Tim Lester $38,516,531 492413.0 800000.0 800000.0 346500.0 39250.0 839250.0 ... 60.0 39.0 100.0 13.0 1.0 13.000000 7.61 4.42 12.03 86.24
116 Wisconsin Big Ten Paul Chryst $143,420,668 3775000.0 3750000.0 3750000.0 NaN 290000.0 4040000.0 ... 40.0 84.0 32.0 11.0 3.0 3.666667 2.13 13.17 15.30 78.74
117 Wyoming Mt. West Craig Bohl $40,372,222 879288.0 1412000.0 1412000.0 450000.0 236000.0 1648000.0 ... 125.0 66.0 59.0 8.0 6.0 1.333333 9.04 -7.68 1.37 81.23

118 rows × 23 columns

In [221]:
# attrs = [' NCAAFBREV16 ', 'MedianConfSal',
#        ' SchoolPay ', ' TotalPay ', ' Bonus ', ' BonusPaid ',
#        ' PayPlusBonus2016 ', ' StadSize ', ' Graduation Rate (GSR) ',
#        ' Seat Rank ', ' GSRank ', ' Combo Rank ', ' TrueRank ', 'W', 'L',
#        ' Ratio ', 'OffenceScore', 'Defense Score', 'Score', 'PointsPerGame']
# for attr in attrs:
#     if df3[attr].dtypes != 'float64':
#         df3[attr] = df3[attr].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
In [213]:
df3.columns
Out[213]:
Index(['School', 'Conf', 'Coach', ' NCAAFBREV16 ', 'MedianConfSal',
       ' SchoolPay ', ' TotalPay ', ' Bonus ', ' BonusPaid ',
       ' PayPlusBonus2016 ', ' StadSize ', ' Graduation Rate (GSR) ',
       ' Seat Rank ', ' GSRank ', ' Combo Rank ', ' TrueRank ', 'W', 'L',
       ' Ratio ', 'OffenceScore', 'Defense Score', 'Score', 'PointsPerGame'],
      dtype='object')
In [222]:
for i,col in enumerate(df3.columns):
    if df3[col].dtype == 'float64':
        median = df3[col].median()
        df3[col].fillna(median, inplace = True)
In [223]:
df3.isnull().sum()
Out[223]:
school              0
conf                0
coach               0
ncaafbrev16        19
median_conf_sal     0
school_pay          0
total_pay           0
max_bonus           0
bonus_paid          0
pay_plus_bonus      0
stad_size           0
grad_rate           0
seat_rank           0
grad_rate_rank      0
combo               0
true_rank           0
w                   0
l                   0
ratio               0
offensive_score     0
defensive_score     0
score               0
points_per_game     0
dtype: int64
In [225]:
df3.drop('ncaafbrev16', axis=1, inplace=True)
In [227]:
df3.drop('total_pay', axis=1, inplace=True)
In [228]:
df3.columns
Out[228]:
Index(['school', 'conf', 'coach', 'median_conf_sal', 'school_pay', 'max_bonus',
       'bonus_paid', 'pay_plus_bonus', 'stad_size', 'grad_rate', 'seat_rank',
       'grad_rate_rank', 'combo', 'true_rank', 'w', 'l', 'ratio',
       'offensive_score', 'defensive_score', 'score', 'points_per_game'],
      dtype='object')
In [292]:
syr = df3[df3['school'] == 'Syracuse']
syr
Out[292]:
school conf coach median_conf_sal school_pay max_bonus bonus_paid pay_plus_bonus stad_size grad_rate ... grad_rate_rank combo true_rank w l ratio offensive_score defensive_score score points_per_game
93 Syracuse ACC Dino Babers 2458033.0 2401206.0 807500.0 95000.0 2401206.0 49250.0 80.0 ... 87.0 73.0 49.0 4.0 8.0 0.5 0.08 -4.59 -4.51 75.51

1 rows × 21 columns

In [241]:
curious_about = ['median_conf_sal', 'max_bonus',
       'bonus_paid', 'pay_plus_bonus', 'stad_size', 'grad_rate', 'seat_rank',
       'grad_rate_rank', 'combo', 'true_rank', 'w', 'l', 'ratio',
       'offensive_score', 'defensive_score', 'score', 'points_per_game']
In [243]:
for attr in curious_about:
    train_set, test_set = train_test_split(df3, test_size=0.2, random_state=42)
    my_model = str('school_pay ~ conf + '+ attr)
    train_model_fit = smf.ols(my_model, data = train_set).fit()
#     print(train_model_fit.summary())
    print('MODEL: predicting school_pay with conf and', attr, (train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr[attr])))
MODEL: predicting school_pay with conf and median_conf_sal 93    3.481378e+06
Name: median_conf_sal, dtype: float64
MODEL: predicting school_pay with conf and max_bonus 93    3.539723e+06
Name: max_bonus, dtype: float64
MODEL: predicting school_pay with conf and bonus_paid 93    3.214852e+06
Name: bonus_paid, dtype: float64
MODEL: predicting school_pay with conf and pay_plus_bonus 93    2.328311e+06
Name: pay_plus_bonus, dtype: float64
MODEL: predicting school_pay with conf and stad_size 93    3.065754e+06
Name: stad_size, dtype: float64
MODEL: predicting school_pay with conf and grad_rate 93    3.537406e+06
Name: grad_rate, dtype: float64
MODEL: predicting school_pay with conf and seat_rank 93    3.080756e+06
Name: seat_rank, dtype: float64
MODEL: predicting school_pay with conf and grad_rate_rank 93    3.538355e+06
Name: grad_rate_rank, dtype: float64
MODEL: predicting school_pay with conf and combo 93    3.366290e+06
Name: combo, dtype: float64
MODEL: predicting school_pay with conf and true_rank 93    3.435262e+06
Name: true_rank, dtype: float64
MODEL: predicting school_pay with conf and w 93    2.582178e+06
Name: w, dtype: float64
MODEL: predicting school_pay with conf and l 93    2.660149e+06
Name: l, dtype: float64
MODEL: predicting school_pay with conf and ratio 93    2.879332e+06
Name: ratio, dtype: float64
MODEL: predicting school_pay with conf and offensive_score 93    3.301635e+06
Name: offensive_score, dtype: float64
MODEL: predicting school_pay with conf and defensive_score 93    2.494810e+06
Name: defensive_score, dtype: float64
MODEL: predicting school_pay with conf and score 93    2.431316e+06
Name: score, dtype: float64
MODEL: predicting school_pay with conf and points_per_game 93    3106894.15
Name: points_per_game, dtype: float64
In [290]:
train_set, test_set = train_test_split(df3, test_size=0.2, random_state=42)
my_model = str('school_pay ~ conf + score')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             school_pay   R-squared:                       0.792
Model:                            OLS   Adj. R-squared:                  0.765
Method:                 Least Squares   F-statistic:                     28.46
Date:                Sat, 25 Jan 2020   Prob (F-statistic):           1.69e-23
Time:                        20:25:45   Log-Likelihood:                -1417.5
No. Observations:                  94   AIC:                             2859.
Df Residuals:                      82   BIC:                             2890.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept         1.852e+06   3.24e+05      5.710      0.000    1.21e+06     2.5e+06
conf[T.ACC]       9.877e+05   4.35e+05      2.271      0.026    1.22e+05    1.85e+06
conf[T.Big 12]    1.708e+06   4.77e+05      3.583      0.001     7.6e+05    2.66e+06
conf[T.Big Ten]   2.144e+06   4.51e+05      4.750      0.000    1.25e+06    3.04e+06
conf[T.C-USA]    -9.357e+04   9.86e+05     -0.095      0.925   -2.05e+06    1.87e+06
conf[T.Ind.]     -4.947e+05   5.23e+05     -0.946      0.347   -1.54e+06    5.46e+05
conf[T.MAC]      -1.024e+06   4.28e+05     -2.394      0.019   -1.87e+06   -1.73e+05
conf[T.Mt. West] -5.132e+05   4.29e+05     -1.196      0.235   -1.37e+06     3.4e+05
conf[T.Pac-12]    5.452e+05   4.43e+05      1.231      0.222   -3.36e+05    1.43e+06
conf[T.SEC]       1.988e+06   4.35e+05      4.566      0.000    1.12e+06    2.85e+06
conf[T.Sun Belt] -8.987e+05   4.37e+05     -2.055      0.043   -1.77e+06   -2.86e+04
score             9.063e+04   1.25e+04      7.277      0.000    6.59e+04    1.15e+05
==============================================================================
Omnibus:                        4.012   Durbin-Watson:                   2.241
Prob(Omnibus):                  0.135   Jarque-Bera (JB):                3.322
Skew:                           0.374   Prob(JB):                        0.190
Kurtosis:                       3.537   Cond. No.                         114.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [293]:
train_model_fit.params[0] + train_model_fit.params[3] + (train_model_fit.params[-1]*syr['score'])
Out[293]:
93    3.587410e+06
Name: score, dtype: float64
In [236]:
syr['school_pay']
Out[236]:
93    2401206.0
Name: school_pay, dtype: float64
In [237]:
2.431316e+06
Out[237]:
2431316.0
In [294]:
3.587410e+06
Out[294]:
3587410.0
In [295]:
train_model_fit.params[0] + (train_model_fit.params[-1]*syr['score'])
Out[295]:
93    1.443604e+06
Name: score, dtype: float64
In [296]:
1.443604e+06
Out[296]:
1443604.0
In [ ]: