IST 718 | WK 2 | ASYNC

In [165]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np  # arrays and math functions
from scipy.stats import uniform  # for training-and-test split
import statsmodels.api as sm  # statistical models (including regression)
import statsmodels.formula.api as smf  # R-like model specification
import matplotlib.pyplot as plt  # 2D plotting

dodgers = pd.read_csv("dodgers_2.csv")
In [166]:
dodgers.head()
Out[166]:
month day attend day_of_week opponent temp skies day_night cap shirt fireworks bobblehead division
0 APR 10 56000 Tuesday Pirates 67 Clear Day NO NO NO NO Divisional
1 APR 11 29729 Wednesday Pirates 58 Cloudy Night NO NO NO NO Cross
2 APR 12 28328 Thursday Pirates 57 Cloudy Night NO NO NO NO Cross
3 APR 13 31601 Friday Padres 54 Cloudy Night NO NO YES NO Cross
4 APR 14 46549 Saturday Padres 57 Cloudy Night NO NO NO NO Divisional
In [167]:
sns.boxplot(x="day_of_week", y="attend", data=dodgers, color="gray", 
            order=["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"])
plt.xticks(rotation="vertical")
Out[167]:
(array([0, 1, 2, 3, 4, 5, 6]), <a list of 7 Text xticklabel objects>)
In [168]:
dodgers['attend'].max()
Out[168]:
56000
In [169]:
dodgers['attend_norm'] = dodgers.apply(lambda x: x['attend']/56000, axis=1)
In [170]:
dodgers['attend_n2'] = (dodgers['attend']/56000)*100
In [171]:
set(dodgers['opponent'].values)
Out[171]:
{'Angels',
 'Astros',
 'Braves',
 'Brewers',
 'Cardinals',
 'Cubs',
 'Giants',
 'Marlins',
 'Mets',
 'Nationals',
 'Padres',
 'Phillies',
 'Pirates',
 'Reds',
 'Rockies',
 'Snakes',
 'White Sox'}
In [172]:
dodgers['opponent'].value_counts()
Out[172]:
Rockies      9
Giants       9
Snakes       9
Padres       9
Cardinals    7
Mets         4
Brewers      4
Marlins      3
Reds         3
Cubs         3
Braves       3
Pirates      3
Nationals    3
Angels       3
White Sox    3
Phillies     3
Astros       3
Name: opponent, dtype: int64
In [173]:
sns.regplot(x="temp", y="attend_n2", data=dodgers);
In [174]:
sns.scatterplot(x="temp", y="attend_n2", hue="day_of_week", size="skies", palette="Set2", data=dodgers)
Out[174]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c1d2b4110>
In [175]:
sns.scatterplot(x="temp", y="attend_n2", hue="bobblehead", size="skies", palette="Set2", data=dodgers)
Out[175]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c1d2e6cd0>
In [176]:
with sns.plotting_context("paper"):
    sns.scatterplot(x="temp", y="attend", hue="day_of_week", size="skies", palette="Set2", data=dodgers)
    plt.title('Attend by Temp')
In [177]:
sns.lmplot(x="temp", y="attend_n2", col="day_of_week", col_wrap=3, data=dodgers);
In [149]:
dodgers.dtypes
Out[149]:
month           object
day              int64
attend           int64
day_of_week     object
opponent        object
temp             int64
skies           object
day_night       object
cap             object
shirt           object
fireworks       object
bobblehead      object
attend_norm    float64
attend_n2      float64
dtype: object
In [178]:
attrs = "month day_of_week opponent skies day_night cap shirt fireworks bobblehead division"

for attr in attrs.split():
    dodgers[attr] = dodgers[attr].astype("category")
In [179]:
dodgers.dtypes
Out[179]:
month          category
day               int64
attend            int64
day_of_week    category
opponent       category
temp              int64
skies          category
day_night      category
cap            category
shirt          category
fireworks      category
bobblehead     category
division       category
attend_norm     float64
attend_n2       float64
dtype: object
In [180]:
dodgers['temp_bins'] = pd.cut(dodgers["temp"], [0, 60, 70, 80, 110], labels=
                              ["below_sixty", "sixties", "seventies", "eighty_up"])
In [181]:
dodgers.dtypes
Out[181]:
month          category
day               int64
attend            int64
day_of_week    category
opponent       category
temp              int64
skies          category
day_night      category
cap            category
shirt          category
fireworks      category
bobblehead     category
division       category
attend_norm     float64
attend_n2       float64
temp_bins      category
dtype: object
In [182]:
import numpy as np
from scipy.stats import uniform
# employ training-and-test regimen for model validation
np.random.seed(1234)
dodgers['runiform'] = uniform.rvs(loc = 0, scale = 1, size = len(dodgers))
dodgers_train = dodgers[dodgers['runiform'] >= 0.33]
dodgers_test = dodgers[dodgers['runiform'] < 0.33]


# # check training data frame
# print('\ndodgers_train data frame (rows, columns): ',dodgers_train.shape)
# print(dodgers_train.head())
# # check test data frame
# print('\ndodgers_test data frame (rows, columns): ',dodgers_test.shape)
# print(dodgers_test.head())

# # specify a simple model with bobblehead entered last
# my_model = str('attend ~ ordered_month + ordered_day_of_week + bobblehead')

# # fit the model to the training set
# train_model_fit = smf.ols(my_model, data = dodgers_train).fit()
# # summary of model fit to the training set
# print(train_model_fit.summary())
# # training set predictions from the model fit to the training set
# dodgers_train['predict_attend'] = train_model_fit.fittedvalues

# # test set predictions from the model fit to the training set
# dodgers_test['predict_attend'] = train_model_fit.predict(dodgers_test)

dodgers_test
Out[182]:
month day attend day_of_week opponent temp skies day_night cap shirt fireworks bobblehead division attend_norm attend_n2 temp_bins runiform
0 APR 10 56000 Tuesday Pirates 67 Clear Day NO NO NO NO Divisional 1.000000 100.000000 sixties 0.191519
5 APR 15 38359 Sunday Padres 65 Clear Day NO NO NO NO Cross 0.684982 68.498214 sixties 0.272593
6 APR 23 26376 Monday Braves 60 Cloudy Night NO NO NO NO Cross 0.471000 47.100000 below_sixty 0.276464
17 MAY 13 49124 Sunday Rockies 70 Clear Day NO NO NO NO Divisional 0.877214 87.721429 sixties 0.013768
22 MAY 20 44005 Sunday Cardinals 77 Clear Night NO NO NO NO Divisional 0.785804 78.580357 seventies 0.075381
28 MAY 30 25509 Wednesday Brewers 69 Clear Night NO NO NO NO Cross 0.455518 45.551786 sixties 0.316836
33 JUN 15 40432 Friday White Sox 67 Clear Night NO NO YES NO Divisional 0.722000 72.200000 sixties 0.143767
36 JUN 28 49006 Thursday Mets 75 Clear Night NO NO NO YES Divisional 0.875107 87.510714 seventies 0.218792
40 JUL 2 34493 Monday Reds 70 Clear Night NO NO NO NO Cross 0.615946 61.594643 sixties 0.059809
41 JUL 3 33884 Tuesday Reds 70 Cloudy Night YES NO NO NO Cross 0.605071 60.507143 sixties 0.184287
42 JUL 4 53570 Wednesday Reds 70 Clear Night NO NO YES NO Divisional 0.956607 95.660714 sixties 0.047355
46 JUL 16 32238 Monday Phillies 67 Clear Night NO NO NO NO Cross 0.575679 57.567857 sixties 0.043324
48 JUL 18 39955 Wednesday Phillies 80 Cloudy Day NO NO NO NO Cross 0.713482 71.348214 seventies 0.329668
50 JUL 31 52832 Tuesday Snakes 75 Cloudy Night NO NO NO YES Divisional 0.943429 94.342857 seventies 0.111894
53 AUG 4 46588 Saturday Cubs 73 Cloudy Night NO NO NO NO Divisional 0.831929 83.192857 seventies 0.006764
60 AUG 22 40173 Wednesday Giants 75 Clear Night NO NO NO NO Divisional 0.717375 71.737500 seventies 0.285251
63 AUG 26 41907 Sunday Marlins 81 Clear Day NO NO NO NO Divisional 0.748339 74.833929 eighty_up 0.195675
65 AUG 31 37622 Friday Snakes 77 Clear Night NO NO YES NO Cross 0.671821 67.182143 seventies 0.053874
68 SEP 3 33540 Monday Padres 84 Cloudy Night NO NO NO NO Cross 0.598929 59.892857 eighty_up 0.123943
69 SEP 4 40619 Tuesday Padres 78 Clear Night NO YES NO NO Divisional 0.725339 72.533929 seventies 0.119381
73 SEP 15 42449 Saturday Cardinals 95 Clear Night NO NO NO NO Divisional 0.758018 75.801786 eighty_up 0.107127
74 SEP 16 35754 Sunday Cardinals 86 Clear Day NO NO NO NO Cross 0.638464 63.846429 eighty_up 0.229219
78 OCT 1 33624 Monday Giants 86 Clear Night NO NO NO NO Cross 0.600429 60.042857 eighty_up 0.006209
79 OCT 2 42473 Tuesday Giants 83 Clear Night NO NO NO NO Divisional 0.758446 75.844643 eighty_up 0.300642
In [183]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(dodgers, test_size=0.3, random_state=42)
In [184]:
print(len(train_set))
train_set.head()
56
Out[184]:
month day attend day_of_week opponent temp skies day_night cap shirt fireworks bobblehead division attend_norm attend_n2 temp_bins runiform
62 AUG 25 40284 Saturday Marlins 70 Clear Night NO NO NO NO Divisional 0.719357 71.935714 sixties 0.478094
42 JUL 4 53570 Wednesday Reds 70 Clear Night NO NO YES NO Divisional 0.956607 95.660714 sixties 0.047355
54 AUG 5 42495 Sunday Cubs 83 Clear Day YES NO NO NO Divisional 0.758839 75.883929 eighty_up 0.617442
16 MAY 12 33735 Saturday Rockies 65 Clear Night NO NO NO NO Cross 0.602411 60.241071 sixties 0.503083
39 JUL 1 55359 Sunday Mets 75 Clear Night NO NO NO YES Divisional 0.988554 98.855357 seventies 0.909316
In [185]:
print(len(dodgers_train))
dodgers_train.head()
57
Out[185]:
month day attend day_of_week opponent temp skies day_night cap shirt fireworks bobblehead division attend_norm attend_n2 temp_bins runiform
1 APR 11 29729 Wednesday Pirates 58 Cloudy Night NO NO NO NO Cross 0.530875 53.087500 below_sixty 0.622109
2 APR 12 28328 Thursday Pirates 57 Cloudy Night NO NO NO NO Cross 0.505857 50.585714 below_sixty 0.437728
3 APR 13 31601 Friday Padres 54 Cloudy Night NO NO YES NO Cross 0.564304 56.430357 below_sixty 0.785359
4 APR 14 46549 Saturday Padres 57 Cloudy Night NO NO NO NO Divisional 0.831232 83.123214 below_sixty 0.779976
7 APR 24 44014 Tuesday Braves 63 Cloudy Night NO NO NO NO Divisional 0.785964 78.596429 sixties 0.801872
In [197]:
my_model = str('attend ~ month + day + day_of_week + opponent + temp + skies + day_night + bobblehead + division + temp_bins')
In [1]:
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-1c137672b4ca> in <module>
----> 1 train_model_fit = smf.ols(my_model, data = train_set).fit()
      2 print(train_model_fit.summary())

NameError: name 'smf' is not defined
In [217]:
my_model_2 = str('attend ~ division + bobblehead + day_night')
In [218]:
train_model_fit = smf.ols(my_model_2, data = train_set).fit()
In [219]:
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.791
Model:                            OLS   Adj. R-squared:                  0.779
Method:                 Least Squares   F-statistic:                     65.64
Date:                Wed, 15 Jan 2020   Prob (F-statistic):           1.09e-17
Time:                        09:46:20   Log-Likelihood:                -536.30
No. Observations:                  56   AIC:                             1081.
Df Residuals:                      52   BIC:                             1089.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               3.591e+04   1263.209     28.425      0.000    3.34e+04    3.84e+04
division[T.Divisional]  9729.3500   1064.609      9.139      0.000    7593.057    1.19e+04
bobblehead[T.YES]       9102.1500   1549.516      5.874      0.000    5992.818    1.22e+04
day_night[T.Night]     -2182.1000   1292.386     -1.688      0.097   -4775.462     411.262
==============================================================================
Omnibus:                        0.499   Durbin-Watson:                   2.471
Prob(Omnibus):                  0.779   Jarque-Bera (JB):                0.109
Skew:                           0.074   Prob(JB):                        0.947
Kurtosis:                       3.158   Cond. No.                         5.58
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [207]:
dodgers.dtypes
Out[207]:
month          category
day               int64
attend            int64
day_of_week    category
opponent       category
temp              int64
skies          category
day_night      category
cap            category
shirt          category
fireworks      category
bobblehead     category
division       category
attend_norm     float64
attend_n2       float64
temp_bins      category
runiform        float64
dtype: object
In [220]:
train_model_fit = smf.ols(my_model_2, data = dodgers).fit()
In [221]:
print(train_model_fit.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.774
Model:                            OLS   Adj. R-squared:                  0.766
Method:                 Least Squares   F-statistic:                     88.10
Date:                Wed, 15 Jan 2020   Prob (F-statistic):           7.94e-25
Time:                        09:46:48   Log-Likelihood:                -785.05
No. Observations:                  81   AIC:                             1578.
Df Residuals:                      77   BIC:                             1588.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               3.646e+04   1130.580     32.252      0.000    3.42e+04    3.87e+04
division[T.Divisional]  1.142e+04    963.897     11.849      0.000    9501.647    1.33e+04
bobblehead[T.YES]       8501.2486   1427.217      5.957      0.000    5659.296    1.13e+04
day_night[T.Night]     -3241.0847   1170.228     -2.770      0.007   -5571.306    -910.864
==============================================================================
Omnibus:                        0.293   Durbin-Watson:                   2.042
Prob(Omnibus):                  0.864   Jarque-Bera (JB):                0.473
Skew:                           0.031   Prob(JB):                        0.789
Kurtosis:                       2.631   Cond. No.                         5.40
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [243]:
attrs_nobobble = [attr for attr in attrs.split() if attr not in ['bobblehead', 'division']]
attrs_nobobble
# attrs.remove('bobblehead')
Out[243]:
['month',
 'day_of_week',
 'opponent',
 'skies',
 'day_night',
 'cap',
 'shirt',
 'fireworks']
In [247]:
for attr in attrs_nobobble:
    m = 'attend ~ division + bobblehead + ' + attr
    my_model = str(m)
    train_model_fit = smf.ols(my_model, data = dodgers).fit()
#     print(train_model_fit.summary())
    print('\nEstimated Effect of Bobblehead and ', attr, ' Promotion on Attendance: ', 
          round(train_model_fit.params[2],0))
Estimated Effect of Bobblehead and  month  Promotion on Attendance:  7216.0

Estimated Effect of Bobblehead and  day_of_week  Promotion on Attendance:  8228.0

Estimated Effect of Bobblehead and  opponent  Promotion on Attendance:  8155.0

Estimated Effect of Bobblehead and  skies  Promotion on Attendance:  7787.0

Estimated Effect of Bobblehead and  day_night  Promotion on Attendance:  8501.0

Estimated Effect of Bobblehead and  cap  Promotion on Attendance:  7746.0

Estimated Effect of Bobblehead and  shirt  Promotion on Attendance:  7926.0

Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  7992.0
In [253]:
for attr in attrs_nobobble:
    m = 'attend ~ division + bobblehead + ' + attr
    my_model = str(m)
    train_model_fit = smf.ols(my_model, data = test_set).fit()
#     print(train_model_fit.summary())
    print('\nEstimated Effect of Bobblehead and ', attr, ' Promotion on Attendance: ', 
          round(train_model_fit.params[2],0))
Estimated Effect of Bobblehead and  month  Promotion on Attendance:  5174.0

Estimated Effect of Bobblehead and  day_of_week  Promotion on Attendance:  5282.0

Estimated Effect of Bobblehead and  opponent  Promotion on Attendance:  9128.0

Estimated Effect of Bobblehead and  skies  Promotion on Attendance:  6969.0

Estimated Effect of Bobblehead and  day_night  Promotion on Attendance:  8170.0

Estimated Effect of Bobblehead and  cap  Promotion on Attendance:  7107.0

Estimated Effect of Bobblehead and  shirt  Promotion on Attendance:  7375.0

Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  6880.0
In [250]:
m = 'attend ~ month + day + bobblehead'
my_model = str(m)
train_model_fit = smf.ols(my_model, data = dodgers).fit()
#     print(train_model_fit.summary())
print('\nEstimated Effect of Bobblehead and ', attr, ' Promotion on Attendance: ', 
      round(train_model_fit.params[3],0))
Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  6519.0
In [251]:
m = 'attend ~ month + day + bobblehead'
my_model = str(m)
train_model_fit = smf.ols(my_model, data = train_set).fit()
#     print(train_model_fit.summary())
print('\nEstimated Effect of Bobblehead and ', attr, ' Promotion on Attendance: ', 
      round(train_model_fit.params[3],0))
Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  10041.0
In [ ]: