IST 718 | WK 2 | ASYNC

import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np  # arrays and math functions
from scipy.stats import uniform  # for training-and-test split
import statsmodels.api as sm  # statistical models (including regression)
import statsmodels.formula.api as smf  # R-like model specification
import matplotlib.pyplot as plt  # 2D plotting

dodgers = pd.read_csv("dodgers_2.csv")

dodgers.head()

# Box plot of attendance for each day of the week, with the boxes laid out
# Sunday through Saturday instead of in the order the days occur in the data.
weekday_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
sns.boxplot(data=dodgers, x="day_of_week", y="attend", order=weekday_order, color="gray")
plt.xticks(rotation="vertical")  # draw the day labels vertically

(array([0, 1, 2, 3, 4, 5, 6]), <a list of 7 Text xticklabel objects>)

dodgers['attend'].max()

56000

# Scale attendance by the best-attended game so values fall in (0, 1]
# (attend_norm) and, as a percentage, in (0, 100] (attend_n2).
# The maximum is read from the data instead of being typed in by hand; for
# this data set it is the 56000 printed by dodgers['attend'].max() above.
max_attend = dodgers['attend'].max()

# Column-wise division replaces the row-by-row apply(axis=1) lambda.
dodgers['attend_norm'] = dodgers['attend'] / max_attend
dodgers['attend_n2'] = dodgers['attend_norm'] * 100

set(dodgers['opponent'].values)

{'Angels',
 'Astros',
 'Braves',
 'Brewers',
 'Cardinals',
 'Cubs',
 'Giants',
 'Marlins',
 'Mets',
 'Nationals',
 'Padres',
 'Phillies',
 'Pirates',
 'Reds',
 'Rockies',
 'Snakes',
 'White Sox'}

dodgers['opponent'].value_counts()

Rockies      9
Giants       9
Snakes       9
Padres       9
Cardinals    7
Mets         4
Brewers      4
Marlins      3
Reds         3
Cubs         3
Braves       3
Pirates      3
Nationals    3
Angels       3
White Sox    3
Phillies     3
Astros       3
Name: opponent, dtype: int64

sns.regplot(x="temp", y="attend_n2", data=dodgers);

sns.scatterplot(x="temp", y="attend_n2", hue="day_of_week", size="skies", palette="Set2", data=dodgers)

<matplotlib.axes._subplots.AxesSubplot at 0x1c1d2b4110>

sns.scatterplot(x="temp", y="attend_n2", hue="bobblehead", size="skies", palette="Set2", data=dodgers)

<matplotlib.axes._subplots.AxesSubplot at 0x1c1d2e6cd0>

# Raw attendance against temperature, coloured by day of week and sized by
# sky condition, drawn with seaborn's "paper" plotting context.
with sns.plotting_context("paper"):
    ax = sns.scatterplot(data=dodgers, x="temp", y="attend",
                         hue="day_of_week", size="skies", palette="Set2")
    ax.set_title('Attend by Temp')

sns.lmplot(x="temp", y="attend_n2", col="day_of_week", col_wrap=3, data=dodgers);

dodgers.dtypes

month           object
day              int64
attend           int64
day_of_week     object
opponent        object
temp             int64
skies           object
day_night       object
cap             object
shirt           object
fireworks       object
bobblehead      object
attend_norm    float64
attend_n2      float64
dtype: object

# Names of the label-valued columns. Kept as a single space-separated string
# because a later cell derives attrs_nobobble from attrs.split().
attrs = "month day_of_week opponent skies day_night cap shirt fireworks bobblehead division"

# Convert all of them to pandas' category dtype in one call.
categorical_columns = attrs.split()
dodgers = dodgers.astype({column: "category" for column in categorical_columns})

dodgers.dtypes

month          category
day               int64
attend            int64
day_of_week    category
opponent       category
temp              int64
skies          category
day_night      category
cap            category
shirt          category
fireworks      category
bobblehead     category
division       category
attend_norm     float64
attend_n2       float64
dtype: object

# Bucket game-time temperature into four labelled ranges.
# pd.cut closes intervals on the right by default, which put a 60-degree game
# in "below_sixty", 70 in "sixties" and 80 in "seventies". right=False makes
# each interval [low, high) so the labels describe what they contain:
#   [0, 60) below_sixty, [60, 70) sixties, [70, 80) seventies, [80, 110) eighty_up
# A temperature of exactly 110 or more would fall outside the bins and become NaN.
dodgers['temp_bins'] = pd.cut(
    dodgers["temp"],
    bins=[0, 60, 70, 80, 110],
    right=False,
    labels=["below_sixty", "sixties", "seventies", "eighty_up"],
)

dodgers.dtypes

month          category
day               int64
attend            int64
day_of_week    category
opponent       category
temp              int64
skies          category
day_night      category
cap            category
shirt          category
fireworks      category
bobblehead     category
division       category
attend_norm     float64
attend_n2       float64
temp_bins      category
dtype: object

import numpy as np
from scipy.stats import uniform
# Training-and-test regimen for model validation: tag every game with a
# uniform(0, 1) draw and hold out the games whose draw is below 0.33.
np.random.seed(1234)  # seed NumPy's global generator so the split is repeatable
dodgers['runiform'] = uniform.rvs(size=len(dodgers))  # loc=0, scale=1 are the defaults
held_out = dodgers['runiform'] < 0.33
dodgers_test = dodgers[held_out]
dodgers_train = dodgers[~held_out]


# # check training data frame
# print('\ndodgers_train data frame (rows, columns): ',dodgers_train.shape)
# print(dodgers_train.head())
# # check test data frame
# print('\ndodgers_test data frame (rows, columns): ',dodgers_test.shape)
# print(dodgers_test.head())

# # specify a simple model with bobblehead entered last
# my_model = str('attend ~ ordered_month + ordered_day_of_week + bobblehead')

# # fit the model to the training set
# train_model_fit = smf.ols(my_model, data = dodgers_train).fit()
# # summary of model fit to the training set
# print(train_model_fit.summary())
# # training set predictions from the model fit to the training set
# dodgers_train['predict_attend'] = train_model_fit.fittedvalues

# # test set predictions from the model fit to the training set
# dodgers_test['predict_attend'] = train_model_fit.predict(dodgers_test)

dodgers_test

from sklearn.model_selection import train_test_split
# Second split of the same frame, separate from dodgers_train / dodgers_test:
# test_size=0.3 sends 30% of the rows to test_set, and random_state=42 fixes
# the shuffle so the same rows are chosen on every run.
train_set, test_set = train_test_split(dodgers, test_size=0.3, random_state=42)

print(len(train_set))
train_set.head()

56

print(len(dodgers_train))
dodgers_train.head()

57

# "Kitchen sink" regression of attendance on every candidate predictor,
# fitted on train_set. The formula is assembled from the list so a predictor
# can be added or removed by editing one entry.
predictors = [
    "month", "day", "day_of_week", "opponent", "temp",
    "skies", "day_night", "bobblehead", "division", "temp_bins",
]
my_model = "attend ~ " + " + ".join(predictors)

train_model_fit = smf.ols(formula=my_model, data=train_set).fit()
print(train_model_fit.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.926
Model:                            OLS   Adj. R-squared:                  0.786
Method:                 Least Squares   F-statistic:                     6.600
Date:                Wed, 15 Jan 2020   Prob (F-statistic):           2.92e-05
Time:                        09:39:28   Log-Likelihood:                -507.26
No. Observations:                  56   AIC:                             1089.
Df Residuals:                      19   BIC:                             1163.
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
============================================================================================
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                 3.321e+04    2.4e+04      1.386      0.182    -1.7e+04    8.34e+04
month[T.AUG]             -1695.4736   7991.532     -0.212      0.834   -1.84e+04     1.5e+04
month[T.JUL]             -2871.9145   7401.629     -0.388      0.702   -1.84e+04    1.26e+04
month[T.JUN]              4544.4002    1.1e+04      0.415      0.683   -1.84e+04    2.75e+04
month[T.MAY]             -3371.4908   7066.214     -0.477      0.639   -1.82e+04    1.14e+04
month[T.OCT]               587.8595   8842.896      0.066      0.948   -1.79e+04    1.91e+04
month[T.SEP]             -5755.5661   8295.387     -0.694      0.496   -2.31e+04    1.16e+04
day_of_week[T.Monday]    -2353.2287   3342.670     -0.704      0.490   -9349.518    4643.061
day_of_week[T.Saturday]   2204.8101   2118.534      1.041      0.311   -2229.333    6638.953
day_of_week[T.Sunday]     6157.8993   5086.319      1.211      0.241   -4487.888    1.68e+04
day_of_week[T.Thursday]  -4440.9482   3090.247     -1.437      0.167   -1.09e+04    2027.013
day_of_week[T.Tuesday]   -2396.8188   3200.757     -0.749      0.463   -9096.081    4302.443
day_of_week[T.Wednesday] -1917.3860   2945.882     -0.651      0.523   -8083.188    4248.416
opponent[T.Astros]        7234.6521    1.2e+04      0.601      0.555    -1.8e+04    3.24e+04
opponent[T.Braves]        1382.7246   1.19e+04      0.116      0.909   -2.35e+04    2.62e+04
opponent[T.Brewers]       5716.2851   1.37e+04      0.416      0.682   -2.31e+04    3.45e+04
opponent[T.Cardinals]     6213.2982   1.13e+04      0.550      0.589   -1.74e+04    2.98e+04
opponent[T.Cubs]         -2241.0001   1.06e+04     -0.212      0.835   -2.44e+04    1.99e+04
opponent[T.Giants]        2156.2737   1.17e+04      0.185      0.855   -2.22e+04    2.66e+04
opponent[T.Marlins]       -338.5501   1.18e+04     -0.029      0.977    -2.5e+04    2.43e+04
opponent[T.Mets]         -3423.0958   6208.790     -0.551      0.588   -1.64e+04    9572.050
opponent[T.Nationals]     -549.7977   1.31e+04     -0.042      0.967    -2.8e+04    2.69e+04
opponent[T.Padres]         406.8800   9720.716      0.042      0.967   -1.99e+04    2.08e+04
opponent[T.Phillies]      9118.9354   1.16e+04      0.789      0.440   -1.51e+04    3.33e+04
opponent[T.Pirates]       1084.6845   1.09e+04      0.100      0.921   -2.16e+04    2.38e+04
opponent[T.Reds]          1.241e+04   9297.610      1.335      0.198   -7051.280    3.19e+04
opponent[T.Rockies]       6514.6457   1.11e+04      0.589      0.563   -1.66e+04    2.97e+04
opponent[T.Snakes]        5168.6298   1.13e+04      0.457      0.653   -1.85e+04    2.88e+04
opponent[T.White Sox]     2.567e-11   1.91e-11      1.347      0.194   -1.42e-11    6.55e-11
skies[T.Cloudy]           -377.4258   2168.895     -0.174      0.864   -4916.975    4162.124
day_night[T.Night]        2703.6684   4041.819      0.669      0.512   -5755.955    1.12e+04
bobblehead[T.YES]         7192.5951   2638.742      2.726      0.013    1669.645    1.27e+04
division[T.Divisional]    1.002e+04   1648.236      6.076      0.000    6565.349    1.35e+04
temp_bins[T.sixties]        44.7022   3932.141      0.011      0.991   -8185.364    8274.768
temp_bins[T.seventies]    6874.3168   5413.118      1.270      0.219   -4455.469    1.82e+04
temp_bins[T.eighty_up]    2154.7693   7321.056      0.294      0.772   -1.32e+04    1.75e+04
day                        -89.7768    124.895     -0.719      0.481    -351.184     171.631
temp                       -58.8008    321.725     -0.183      0.857    -732.178     614.577
==============================================================================
Omnibus:                        0.950   Durbin-Watson:                   1.931
Prob(Omnibus):                  0.622   Jarque-Bera (JB):                0.441
Skew:                           0.192   Prob(JB):                        0.802
Kurtosis:                       3.205   Cond. No.                     1.01e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.17e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

# Reduced model with three predictors (division, bobblehead, day_night),
# fitted on train_set.
my_model_2 = 'attend ~ division + bobblehead + day_night'

reduced_ols = smf.ols(formula=my_model_2, data=train_set)
train_model_fit = reduced_ols.fit()

print(train_model_fit.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.791
Model:                            OLS   Adj. R-squared:                  0.779
Method:                 Least Squares   F-statistic:                     65.64
Date:                Wed, 15 Jan 2020   Prob (F-statistic):           1.09e-17
Time:                        09:46:20   Log-Likelihood:                -536.30
No. Observations:                  56   AIC:                             1081.
Df Residuals:                      52   BIC:                             1089.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               3.591e+04   1263.209     28.425      0.000    3.34e+04    3.84e+04
division[T.Divisional]  9729.3500   1064.609      9.139      0.000    7593.057    1.19e+04
bobblehead[T.YES]       9102.1500   1549.516      5.874      0.000    5992.818    1.22e+04
day_night[T.Night]     -2182.1000   1292.386     -1.688      0.097   -4775.462     411.262
==============================================================================
Omnibus:                        0.499   Durbin-Watson:                   2.471
Prob(Omnibus):                  0.779   Jarque-Bera (JB):                0.109
Skew:                           0.074   Prob(JB):                        0.947
Kurtosis:                       3.158   Cond. No.                         5.58
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

dodgers.dtypes

month          category
day               int64
attend            int64
day_of_week    category
opponent       category
temp              int64
skies          category
day_night      category
cap            category
shirt          category
fireworks      category
bobblehead     category
division       category
attend_norm     float64
attend_n2       float64
temp_bins      category
runiform        float64
dtype: object

# Refit the my_model_2 formula on every row of `dodgers` rather than on
# train_set. The result is still stored under the name train_model_fit, so it
# replaces the training-set fit from the earlier cell.
train_model_fit = smf.ols(my_model_2, data = dodgers).fit()

print(train_model_fit.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.774
Model:                            OLS   Adj. R-squared:                  0.766
Method:                 Least Squares   F-statistic:                     88.10
Date:                Wed, 15 Jan 2020   Prob (F-statistic):           7.94e-25
Time:                        09:46:48   Log-Likelihood:                -785.05
No. Observations:                  81   AIC:                             1578.
Df Residuals:                      77   BIC:                             1588.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               3.646e+04   1130.580     32.252      0.000    3.42e+04    3.87e+04
division[T.Divisional]  1.142e+04    963.897     11.849      0.000    9501.647    1.33e+04
bobblehead[T.YES]       8501.2486   1427.217      5.957      0.000    5659.296    1.13e+04
day_night[T.Night]     -3241.0847   1170.228     -2.770      0.007   -5571.306    -910.864
==============================================================================
Omnibus:                        0.293   Durbin-Watson:                   2.042
Prob(Omnibus):                  0.864   Jarque-Bera (JB):                0.473
Skew:                           0.031   Prob(JB):                        0.789
Kurtosis:                       2.631   Cond. No.                         5.40
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Categorical columns other than the two that are written into every formula
# in the loops below (bobblehead and division).
always_in_formula = {'bobblehead', 'division'}
attrs_nobobble = [name for name in attrs.split() if name not in always_in_formula]
attrs_nobobble

['month',
 'day_of_week',
 'opponent',
 'skies',
 'day_night',
 'cap',
 'shirt',
 'fireworks']

# For each remaining categorical column, fit
#     attend ~ division + bobblehead + <column>
# on the full data set and print the estimated bobblehead coefficient.
for attr in attrs_nobobble:
    my_model = 'attend ~ division + bobblehead + ' + attr
    train_model_fit = smf.ols(my_model, data = dodgers).fit()
    # Read the coefficient by its label, not its position: integer positions
    # on a label-indexed Series are deprecated in recent pandas and would
    # silently point at another term if the formula's term order changed.
    print('\nEstimated Effect of Bobblehead and ', attr, ' Promotion on Attendance: ', 
          round(train_model_fit.params['bobblehead[T.YES]'], 0))

Estimated Effect of Bobblehead and  month  Promotion on Attendance:  7216.0

Estimated Effect of Bobblehead and  day_of_week  Promotion on Attendance:  8228.0

Estimated Effect of Bobblehead and  opponent  Promotion on Attendance:  8155.0

Estimated Effect of Bobblehead and  skies  Promotion on Attendance:  7787.0

Estimated Effect of Bobblehead and  day_night  Promotion on Attendance:  8501.0

Estimated Effect of Bobblehead and  cap  Promotion on Attendance:  7746.0

Estimated Effect of Bobblehead and  shirt  Promotion on Attendance:  7926.0

Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  7992.0

# Same per-column comparison as the loop over the full data set, but fitted
# on test_set only:  attend ~ division + bobblehead + <column>.
for attr in attrs_nobobble:
    my_model = 'attend ~ division + bobblehead + ' + attr
    train_model_fit = smf.ols(my_model, data = test_set).fit()
    # Read the coefficient by its label, not its position: integer positions
    # on a label-indexed Series are deprecated in recent pandas and would
    # silently point at another term if the formula's term order changed.
    print('\nEstimated Effect of Bobblehead and ', attr, ' Promotion on Attendance: ', 
          round(train_model_fit.params['bobblehead[T.YES]'], 0))

Estimated Effect of Bobblehead and  month  Promotion on Attendance:  5174.0

Estimated Effect of Bobblehead and  day_of_week  Promotion on Attendance:  5282.0

Estimated Effect of Bobblehead and  opponent  Promotion on Attendance:  9128.0

Estimated Effect of Bobblehead and  skies  Promotion on Attendance:  6969.0

Estimated Effect of Bobblehead and  day_night  Promotion on Attendance:  8170.0

Estimated Effect of Bobblehead and  cap  Promotion on Attendance:  7107.0

Estimated Effect of Bobblehead and  shirt  Promotion on Attendance:  7375.0

Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  6880.0

# Bobblehead effect after controlling for month and day of month, fitted on
# the full data set.
my_model = 'attend ~ month + day + bobblehead'
train_model_fit = smf.ols(my_model, data = dodgers).fit()
# The controls are named explicitly: the previous message reused the loop
# variable `attr`, which still held 'fireworks' from the earlier loop.
controls = 'month + day'
# Position 3 of params is one of the month dummies, not bobblehead (month
# alone contributes six coefficients after the intercept), so the
# coefficient is looked up by its label.
print('\nEstimated Effect of Bobblehead and ', controls, ' Promotion on Attendance: ', 
      round(train_model_fit.params['bobblehead[T.YES]'], 0))

Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  6519.0

# Bobblehead effect after controlling for month and day of month, fitted on
# train_set only.
my_model = 'attend ~ month + day + bobblehead'
train_model_fit = smf.ols(my_model, data = train_set).fit()
# The controls are named explicitly: the previous message reused the loop
# variable `attr`, which still held 'fireworks' from the earlier loop.
controls = 'month + day'
# Position 3 of params is one of the month dummies, not bobblehead (month
# alone contributes six coefficients after the intercept), so the
# coefficient is looked up by its label.
print('\nEstimated Effect of Bobblehead and ', controls, ' Promotion on Attendance: ', 
      round(train_model_fit.params['bobblehead[T.YES]'], 0))

Estimated Effect of Bobblehead and  fireworks  Promotion on Attendance:  10041.0

	month	day	attend	day_of_week	opponent	temp	skies	day_night	cap	shirt	fireworks	bobblehead	division	attend_norm	attend_n2	temp_bins	runiform
0	APR	10	56000	Tuesday	Pirates	67	Clear	Day	NO	NO	NO	NO	Divisional	1.000000	100.000000	sixties	0.191519
5	APR	15	38359	Sunday	Padres	65	Clear	Day	NO	NO	NO	NO	Cross	0.684982	68.498214	sixties	0.272593
6	APR	23	26376	Monday	Braves	60	Cloudy	Night	NO	NO	NO	NO	Cross	0.471000	47.100000	below_sixty	0.276464
17	MAY	13	49124	Sunday	Rockies	70	Clear	Day	NO	NO	NO	NO	Divisional	0.877214	87.721429	sixties	0.013768
22	MAY	20	44005	Sunday	Cardinals	77	Clear	Night	NO	NO	NO	NO	Divisional	0.785804	78.580357	seventies	0.075381
28	MAY	30	25509	Wednesday	Brewers	69	Clear	Night	NO	NO	NO	NO	Cross	0.455518	45.551786	sixties	0.316836
33	JUN	15	40432	Friday	White Sox	67	Clear	Night	NO	NO	YES	NO	Divisional	0.722000	72.200000	sixties	0.143767
36	JUN	28	49006	Thursday	Mets	75	Clear	Night	NO	NO	NO	YES	Divisional	0.875107	87.510714	seventies	0.218792
40	JUL	2	34493	Monday	Reds	70	Clear	Night	NO	NO	NO	NO	Cross	0.615946	61.594643	sixties	0.059809
41	JUL	3	33884	Tuesday	Reds	70	Cloudy	Night	YES	NO	NO	NO	Cross	0.605071	60.507143	sixties	0.184287
42	JUL	4	53570	Wednesday	Reds	70	Clear	Night	NO	NO	YES	NO	Divisional	0.956607	95.660714	sixties	0.047355
46	JUL	16	32238	Monday	Phillies	67	Clear	Night	NO	NO	NO	NO	Cross	0.575679	57.567857	sixties	0.043324
48	JUL	18	39955	Wednesday	Phillies	80	Cloudy	Day	NO	NO	NO	NO	Cross	0.713482	71.348214	seventies	0.329668
50	JUL	31	52832	Tuesday	Snakes	75	Cloudy	Night	NO	NO	NO	YES	Divisional	0.943429	94.342857	seventies	0.111894
53	AUG	4	46588	Saturday	Cubs	73	Cloudy	Night	NO	NO	NO	NO	Divisional	0.831929	83.192857	seventies	0.006764
60	AUG	22	40173	Wednesday	Giants	75	Clear	Night	NO	NO	NO	NO	Divisional	0.717375	71.737500	seventies	0.285251
63	AUG	26	41907	Sunday	Marlins	81	Clear	Day	NO	NO	NO	NO	Divisional	0.748339	74.833929	eighty_up	0.195675
65	AUG	31	37622	Friday	Snakes	77	Clear	Night	NO	NO	YES	NO	Cross	0.671821	67.182143	seventies	0.053874
68	SEP	3	33540	Monday	Padres	84	Cloudy	Night	NO	NO	NO	NO	Cross	0.598929	59.892857	eighty_up	0.123943
69	SEP	4	40619	Tuesday	Padres	78	Clear	Night	NO	YES	NO	NO	Divisional	0.725339	72.533929	seventies	0.119381
73	SEP	15	42449	Saturday	Cardinals	95	Clear	Night	NO	NO	NO	NO	Divisional	0.758018	75.801786	eighty_up	0.107127
74	SEP	16	35754	Sunday	Cardinals	86	Clear	Day	NO	NO	NO	NO	Cross	0.638464	63.846429	eighty_up	0.229219
78	OCT	1	33624	Monday	Giants	86	Clear	Night	NO	NO	NO	NO	Cross	0.600429	60.042857	eighty_up	0.006209
79	OCT	2	42473	Tuesday	Giants	83	Clear	Night	NO	NO	NO	NO	Divisional	0.758446	75.844643	eighty_up	0.300642

	month	day	attend	day_of_week	opponent	temp	skies	day_night	cap	shirt	fireworks	bobblehead	division	attend_norm	attend_n2	temp_bins	runiform
62	AUG	25	40284	Saturday	Marlins	70	Clear	Night	NO	NO	NO	NO	Divisional	0.719357	71.935714	sixties	0.478094
42	JUL	4	53570	Wednesday	Reds	70	Clear	Night	NO	NO	YES	NO	Divisional	0.956607	95.660714	sixties	0.047355
54	AUG	5	42495	Sunday	Cubs	83	Clear	Day	YES	NO	NO	NO	Divisional	0.758839	75.883929	eighty_up	0.617442
16	MAY	12	33735	Saturday	Rockies	65	Clear	Night	NO	NO	NO	NO	Cross	0.602411	60.241071	sixties	0.503083
39	JUL	1	55359	Sunday	Mets	75	Clear	Night	NO	NO	NO	YES	Divisional	0.988554	98.855357	seventies	0.909316

	month	day	attend	day_of_week	opponent	temp	skies	day_night	cap	shirt	fireworks	bobblehead	division	attend_norm	attend_n2	temp_bins	runiform
1	APR	11	29729	Wednesday	Pirates	58	Cloudy	Night	NO	NO	NO	NO	Cross	0.530875	53.087500	below_sixty	0.622109
2	APR	12	28328	Thursday	Pirates	57	Cloudy	Night	NO	NO	NO	NO	Cross	0.505857	50.585714	below_sixty	0.437728
3	APR	13	31601	Friday	Padres	54	Cloudy	Night	NO	NO	YES	NO	Cross	0.564304	56.430357	below_sixty	0.785359
4	APR	14	46549	Saturday	Padres	57	Cloudy	Night	NO	NO	NO	NO	Divisional	0.831232	83.123214	below_sixty	0.779976
7	APR	24	44014	Tuesday	Braves	63	Cloudy	Night	NO	NO	NO	NO	Divisional	0.785964	78.596429	sixties	0.801872