IST 718 | WK2 | ASYNC 2.5

In [1]:
# Predictive Model for Los Angeles Dodgers Promotion and Attendance (Python)

# BASED ON EXHIBIT 2.1 FROM MILLER (2015)

# import packages for analysis and modeling
import pandas as pd  # data frame operations

import numpy as np  # arrays and math functions
from scipy.stats import uniform  # for training-and-test split
import statsmodels.api as sm  # statistical models (including regression)
import statsmodels.formula.api as smf  # R-like model specification
import matplotlib.pyplot as plt  # 2D plotting

import seaborn as sns  # PROVIDES TRELLIS AND SMALL MULTIPLE PLOTTING

# read in Dodgers bobbleheads data and create data frame
dodgers = pd.read_csv("dodgers.csv")

# examine the structure of the data frame
print("\nContents of dodgers data frame ---------------")

# attendance in thousands for plotting 
dodgers['attend_000'] = dodgers['attend']/1000

# print the first five rows of the data frame
print(pd.DataFrame.head(dodgers))
dodgerDF = pd.DataFrame(dodgers)

mondays = dodgers[dodgers['day_of_week'] == 'Monday']
tuesdays = dodgers[dodgers['day_of_week'] == 'Tuesday']
wednesdays = dodgers[dodgers['day_of_week'] == 'Wednesday']
thursdays = dodgers[dodgers['day_of_week'] == 'Thursday']
fridays = dodgers[dodgers['day_of_week'] == 'Friday']
saturdays = dodgers[dodgers['day_of_week'] == 'Saturday']
sundays = dodgers[dodgers['day_of_week'] == 'Sunday']

# convert days' attendance into list of vectors for box plot
data = [mondays['attend_000'], tuesdays['attend_000'], 
    wednesdays['attend_000'], thursdays['attend_000'], 
    fridays['attend_000'], saturdays['attend_000'], 
    sundays['attend_000']]
ordered_day_names = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']

ordered_team_names = (sorted(set(dodgers['opponent']), reverse = True))
Contents of dodgers data frame ---------------
  month  day  attend day_of_week opponent  temp   skies day_night cap shirt  \
0   APR   10   56000     Tuesday  Pirates    67  Clear        Day  NO    NO   
1   APR   11   29729   Wednesday  Pirates    58  Cloudy     Night  NO    NO   
2   APR   12   28328    Thursday  Pirates    57  Cloudy     Night  NO    NO   
3   APR   13   31601      Friday   Padres    54  Cloudy     Night  NO    NO   
4   APR   14   46549    Saturday   Padres    57  Cloudy     Night  NO    NO   

  fireworks bobblehead  attend_000  
0        NO         NO      56.000  
1        NO         NO      29.729  
2        NO         NO      28.328  
3       YES         NO      31.601  
4        NO         NO      46.549  
In [21]:
## SUMMARY STATISTICS

np.mean(dodgers['attend'])
# np.min(dodgers['attend'])
# np.max(dodgers['attend'])

sns.boxplot(y="attend", data=dodgers)
plt.show()

# KO -- UPDATED TO CORRECTLY ORDER DAYS OF WEEK
sns.boxplot(x="day_of_week", y="attend", data=dodgers, color = "gray", 
            order=['Sunday','Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']);
g.add_legend();
plt.show()

# NUMBER OF PROMOTIONS - ONLY WORKS WITH BINARY VARIABLES

#cap = np.count_nonzero(dodgers['cap'])
#print(cap)
#shirt = np.count_nonzero(dodgers['shirt'])
#print(shirt)
#fire = np.count_nonzero(dodgers['fireworks'])
#print(fire)
#bob = np.count_nonzero(dodgers['bobblehead'])
#print(bob)

#print(cap + shirt + fire + bob)

sns.boxplot(x="month", y="attend", data=dodgers, color = "gray")
g.add_legend();
plt.show()
In [9]:
# exploratory data analysis: box plot for day of the week
# FROM MILLER (2015)
fig, axis = plt.subplots()
axis.set_xlabel('Day of Week')
axis.set_ylabel('Attendance (thousands)')
day_plot = plt.boxplot(data, sym='o', vert=1, whis=1.5)
plt.setp(day_plot['boxes'], color = 'black')    
plt.setp(day_plot['whiskers'], color = 'black')    
plt.setp(day_plot['fliers'], color = 'black', marker = 'o')
axis.set_xticklabels(ordered_day_names)
plt.show()
plt.savefig('fig_advert_promo_dodgers_eda_day_of_week_Python.pdf', 
    bbox_inches = 'tight', dpi=None, facecolor='w', edgecolor='b', 
    orientation='portrait', papertype=None, format=None, 
    transparent=True, pad_inches=0.25, frameon=None) 
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:15: MatplotlibDeprecationWarning: 
The frameon kwarg was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use facecolor instead.
  from ipykernel import kernelapp as app
<Figure size 432x288 with 0 Axes>
In [10]:
# exploratory data analysis: box plot for day of the week

april = dodgers[dodgers['month'] == 'APR']
may = dodgers[dodgers['month'] == 'MAY']
june = dodgers[dodgers['month'] == 'JUN']
july = dodgers[dodgers['month'] == 'JUL']
august = dodgers[dodgers['month'] == 'AUG']
september = dodgers[dodgers['month'] == 'SEP']
october = dodgers[dodgers['month'] == 'OCT']

data = [april['attend_000'], may['attend_000'], 
    june['attend_000'], july['attend_000'], 
    august['attend_000'], september['attend_000'], 
    october['attend_000']]
ordered_month_names = ['April', 'May', 'June', 'July', 'Aug', 'Sept', 'Oct']

fig, axis = plt.subplots()
axis.set_xlabel('Month')
axis.set_ylabel('Attendance (thousands)')
day_plot = plt.boxplot(data, sym='o', vert=1, whis=1.5)
plt.setp(day_plot['boxes'], color = 'black')    
plt.setp(day_plot['whiskers'], color = 'black')    
plt.setp(day_plot['fliers'], color = 'black', marker = 'o')
axis.set_xticklabels(ordered_month_names)
plt.show()

## CODE TO SAVE IMAGE FILE
# plt.savefig('fig_advert_promo_dodgers_eda_month_Python.pdf', 
#    bbox_inches = 'tight', dpi=None, facecolor='w', edgecolor='b', 
#    orientation='portrait', papertype=None, format=None, 
#    transparent=True, pad_inches=0.25, frameon=None)  
In [11]:
# trellis/lattice plot attendance by temp, conditioning on skies 
# and day_night with bobblehead NO/YES shown in distinct colors
import seaborn as sns

sns.set(style="darkgrid")

g = sns.FacetGrid(dodgers, col="skies", hue="fireworks", 
                  hue_order=["YES", "NO"],
                  hue_kws=dict(marker=["^", "v"]))
g.map(plt.scatter, "temp", "attend", alpha=.7)
g.add_legend();
plt.show()

g = sns.FacetGrid(dodgers, col="day_night", hue="fireworks",
                hue_order=["YES", "NO"],)
g.map(plt.scatter, "temp", "attend", alpha=.7)
g.add_legend();
plt.show()

g = sns.FacetGrid(dodgers, col="day_night", hue="bobblehead",
                hue_order=["YES", "NO"],)
g.map(plt.scatter, "temp", "attend", alpha=.7)
g.add_legend();
plt.show()
In [12]:
# DODGER PROMOTIONS BY TYPE

sns.countplot(y="fireworks", data=dodgers, palette="Blues_d",
             order=["YES", "NO"])
g.add_legend();
plt.show()

sns.countplot(y="bobblehead", hue="day_night",
                data=dodgers, palette="Blues_d",
                order=["YES", "NO"])
g.add_legend();
plt.show()
In [13]:
# Dodgers attendance by Visiting Team
# Figure 2.4 from Miller

# teams = sorted(["opponent"], reverse=True)
sns.swarmplot(x="attend", y="opponent", hue="day_night", 
              data=dodgers, order =ordered_team_names)
g.add_legend();
plt.show()
In [14]:
# CREATE A HISTOGRAM OF ATTENDANCE

# CREATE BINS

plt.hist(dodgers['attend'], normed = False, stacked = False, rwidth = .9)
plt.title("Attendance Histogram")
plt.xlabel('Attendance')
plt.ylabel('Frequency')

plt.show()
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: MatplotlibDeprecationWarning: 
The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  """
In [15]:
# BLOCK FOR ORDERING DATA

# map day_of_week to ordered_day_of_week 
day_to_ordered_day = {'Monday' : '1Monday', 
    'Tuesday' : '2Tuesday', 
    'Wednesday' : '3Wednesday', 
    'Thursday' : '4Thursday', 
    'Friday' : '5Friday',
    'Saturday' : '6Saturday',
    'Sunday' : '7Sunday'}
dodgers['ordered_day_of_week'] = dodgers['day_of_week'].map(day_to_ordered_day)   

# map month to ordered_month
month_to_ordered_month = {'APR' : '1April', 
     'MAY' : '2May', 
     'JUN' : '3June', 
     'JUL' : '4July', 
     'AUG' : '5Aug',
     'SEP' : '6Sept',
     'OCT' : '7Oct'}
dodgers['ordered_month'] = dodgers['month'].map(month_to_ordered_month)    
In [16]:
# employ training-and-test regimen for model validation
np.random.seed(1234)
dodgers['runiform'] = uniform.rvs(loc = 0, scale = 1, size = len(dodgers))
dodgers_train = dodgers[dodgers['runiform'] >= 0.33]
dodgers_test = dodgers[dodgers['runiform'] < 0.33]
# check training data frame
print('\ndodgers_train data frame (rows, columns): ',dodgers_train.shape)
print(dodgers_train.head())
# check test data frame
print('\ndodgers_test data frame (rows, columns): ',dodgers_test.shape)
print(dodgers_test.head())

# specify a simple model with bobblehead entered last
my_model = str('attend ~ ordered_month + ordered_day_of_week + bobblehead')

# fit the model to the training set
train_model_fit = smf.ols(my_model, data = dodgers_train).fit()
# summary of model fit to the training set
print(train_model_fit.summary())
# training set predictions from the model fit to the training set
dodgers_train['predict_attend'] = train_model_fit.fittedvalues

# test set predictions from the model fit to the training set
dodgers_test['predict_attend'] = train_model_fit.predict(dodgers_test)

  
dodgers_train data frame (rows, columns):  (57, 16)
  month  day  attend day_of_week opponent  temp   skies day_night cap shirt  \
1   APR   11   29729   Wednesday  Pirates    58  Cloudy     Night  NO    NO   
2   APR   12   28328    Thursday  Pirates    57  Cloudy     Night  NO    NO   
3   APR   13   31601      Friday   Padres    54  Cloudy     Night  NO    NO   
4   APR   14   46549    Saturday   Padres    57  Cloudy     Night  NO    NO   
7   APR   24   44014     Tuesday   Braves    63  Cloudy     Night  NO    NO   

  fireworks bobblehead  attend_000 ordered_day_of_week ordered_month  runiform  
1        NO         NO      29.729          3Wednesday        1April  0.622109  
2        NO         NO      28.328           4Thursday        1April  0.437728  
3       YES         NO      31.601             5Friday        1April  0.785359  
4        NO         NO      46.549           6Saturday        1April  0.779976  
7        NO         NO      44.014            2Tuesday        1April  0.801872  

dodgers_test data frame (rows, columns):  (24, 16)
   month  day  attend day_of_week   opponent  temp   skies day_night cap  \
0    APR   10   56000     Tuesday    Pirates    67  Clear        Day  NO   
5    APR   15   38359      Sunday     Padres    65  Clear        Day  NO   
6    APR   23   26376      Monday     Braves    60  Cloudy     Night  NO   
17   MAY   13   49124      Sunday    Rockies    70  Clear        Day  NO   
22   MAY   20   44005      Sunday  Cardinals    77  Clear      Night  NO   

   shirt fireworks bobblehead  attend_000 ordered_day_of_week ordered_month  \
0     NO        NO         NO      56.000            2Tuesday        1April   
5     NO        NO         NO      38.359             7Sunday        1April   
6     NO        NO         NO      26.376             1Monday        1April   
17    NO        NO         NO      49.124             7Sunday          2May   
22    NO        NO         NO      44.005             7Sunday          2May   

    runiform  
0   0.191519  
5   0.272593  
6   0.276464  
17  0.013768  
22  0.075381  
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.639
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     5.864
Date:                Tue, 14 Jan 2020   Prob (F-statistic):           4.70e-06
Time:                        10:37:03   Log-Likelihood:                -566.87
No. Observations:                  57   AIC:                             1162.
Df Residuals:                      43   BIC:                             1190.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
=====================================================================================================
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Intercept                          3.599e+04   3133.268     11.486      0.000    2.97e+04    4.23e+04
ordered_month[T.2May]             -3049.4581   2532.400     -1.204      0.235   -8156.529    2057.613
ordered_month[T.3June]             8776.8177   2979.201      2.946      0.005    2768.686    1.48e+04
ordered_month[T.4July]             3705.6266   3237.723      1.145      0.259   -2823.865    1.02e+04
ordered_month[T.5Aug]              2027.5628   2698.095      0.751      0.456   -3413.664    7468.789
ordered_month[T.6Sept]             1280.8090   2863.529      0.447      0.657   -4494.048    7055.666
ordered_month[T.7Oct]             -1234.2437   6407.369     -0.193      0.848   -1.42e+04    1.17e+04
ordered_day_of_week[T.2Tuesday]    4734.3477   3466.721      1.366      0.179   -2256.961    1.17e+04
ordered_day_of_week[T.3Wednesday]  -741.4427   3228.255     -0.230      0.819   -7251.839    5768.954
ordered_day_of_week[T.4Thursday]   -878.9914   3849.182     -0.228      0.820   -8641.606    6883.624
ordered_day_of_week[T.5Friday]     3420.7852   2885.730      1.185      0.242   -2398.844    9240.414
ordered_day_of_week[T.6Saturday]   3060.5245   2965.589      1.032      0.308   -2920.156    9041.205
ordered_day_of_week[T.7Sunday]     2796.4262   3153.300      0.887      0.380   -3562.810    9155.662
bobblehead[T.YES]                  1.233e+04   2683.012      4.595      0.000    6918.497    1.77e+04
==============================================================================
Omnibus:                        2.353   Durbin-Watson:                   2.084
Prob(Omnibus):                  0.308   Jarque-Bera (JB):                1.932
Skew:                           0.451   Prob(JB):                        0.381
Kurtosis:                       2.993   Cond. No.                         10.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
In [17]:
# compute the proportion of response variance
# accounted for when predicting out-of-sample
print('\nProportion of Test Set Variance Accounted for: ',\
    round(np.power(dodgers_test['attend'].corr(dodgers_test['predict_attend']),2),3))

# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors 
my_model_fit = smf.ols(my_model, data = dodgers).fit()
print(my_model_fit.summary())

print('\nEstimated Effect of Bobblehead Promotion on Attendance: ',\
    round(my_model_fit.params[13],0))
    
# Suggestions for the student: Reproduce the figures in this chapter
# using matplotlib, ggplot, and/or rpy2 calls to R graphics. 
# Examine regression diagnostics for the fitted model.
# Examine other linear predictors and other explanatory variables.
# See if you can improve upon the model with variable transformations. 
Proportion of Test Set Variance Accounted for:  0.217
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.544
Model:                            OLS   Adj. R-squared:                  0.456
Method:                 Least Squares   F-statistic:                     6.158
Date:                Tue, 14 Jan 2020   Prob (F-statistic):           2.08e-07
Time:                        10:37:07   Log-Likelihood:                -813.52
No. Observations:                  81   AIC:                             1655.
Df Residuals:                      67   BIC:                             1689.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
=====================================================================================================
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Intercept                          3.391e+04   2521.806     13.446      0.000    2.89e+04    3.89e+04
ordered_month[T.2May]             -2385.6248   2291.216     -1.041      0.302   -6958.912    2187.662
ordered_month[T.3June]             7163.2336   2732.721      2.621      0.011    1708.699    1.26e+04
ordered_month[T.4July]             2849.8281   2578.600      1.105      0.273   -2297.079    7996.735
ordered_month[T.5Aug]              2377.9243   2402.915      0.990      0.326   -2418.314    7174.162
ordered_month[T.6Sept]               29.0302   2521.249      0.012      0.991   -5003.404    5061.464
ordered_month[T.7Oct]              -662.6677   4046.452     -0.164      0.870   -8739.419    7414.083
ordered_day_of_week[T.2Tuesday]    7911.4936   2702.208      2.928      0.005    2517.864    1.33e+04
ordered_day_of_week[T.3Wednesday]  2460.0232   2514.029      0.979      0.331   -2558.000    7478.046
ordered_day_of_week[T.4Thursday]    775.3638   3486.154      0.222      0.825   -6183.029    7733.757
ordered_day_of_week[T.5Friday]     4883.8183   2504.653      1.950      0.055    -115.490    9883.127
ordered_day_of_week[T.6Saturday]   6372.0558   2552.084      2.497      0.015    1278.075    1.15e+04
ordered_day_of_week[T.7Sunday]     6724.0027   2506.721      2.682      0.009    1720.567    1.17e+04
bobblehead[T.YES]                  1.071e+04   2419.520      4.429      0.000    5885.521    1.55e+04
==============================================================================
Omnibus:                        6.343   Durbin-Watson:                   2.130
Prob(Omnibus):                  0.042   Jarque-Bera (JB):                5.908
Skew:                           0.654   Prob(JB):                       0.0521
Kurtosis:                       3.205   Cond. No.                         9.76
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Estimated Effect of Bobblehead Promotion on Attendance:  10715.0
In [18]:
corr = dodgerDF.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
#f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
In [ ]: