IST 718 | WK2 | ASYNC 2.5

In [1]:
# Predictive Model for Los Angeles Dodgers Promotion and Attendance (Python)

# BASED ON EXHIBIT 2.1 FROM MILLER (2015)

# import packages for analysis and modeling
import pandas as pd  # data frame operations

import numpy as np  # arrays and math functions
from scipy.stats import uniform  # for training-and-test split
import statsmodels.api as sm  # statistical models (including regression)
import statsmodels.formula.api as smf  # R-like model specification
import matplotlib.pyplot as plt  # 2D plotting

import seaborn as sns  # PROVIDES TRELLIS AND SMALL MULTIPLE PLOTTING

# read in Dodgers bobbleheads data and create data frame
dodgers = pd.read_csv("dodgers.csv")

# examine the structure of the data frame
print("\nContents of dodgers data frame ---------------")

# attendance in thousands for plotting 
dodgers['attend_000'] = dodgers['attend']/1000

# print the first five rows of the data frame
print(pd.DataFrame.head(dodgers))
dodgerDF = pd.DataFrame(dodgers)

mondays = dodgers[dodgers['day_of_week'] == 'Monday']
tuesdays = dodgers[dodgers['day_of_week'] == 'Tuesday']
wednesdays = dodgers[dodgers['day_of_week'] == 'Wednesday']
thursdays = dodgers[dodgers['day_of_week'] == 'Thursday']
fridays = dodgers[dodgers['day_of_week'] == 'Friday']
saturdays = dodgers[dodgers['day_of_week'] == 'Saturday']
sundays = dodgers[dodgers['day_of_week'] == 'Sunday']

# convert days' attendance into list of vectors for box plot
data = [mondays['attend_000'], tuesdays['attend_000'], 
    wednesdays['attend_000'], thursdays['attend_000'], 
    fridays['attend_000'], saturdays['attend_000'], 
    sundays['attend_000']]
ordered_day_names = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']

ordered_team_names = (sorted(set(dodgers['opponent']), reverse = True))
Contents of dodgers data frame ---------------
  month  day  attend day_of_week opponent  temp   skies day_night cap shirt  \
0   APR   10   56000     Tuesday  Pirates    67  Clear        Day  NO    NO   
1   APR   11   29729   Wednesday  Pirates    58  Cloudy     Night  NO    NO   
2   APR   12   28328    Thursday  Pirates    57  Cloudy     Night  NO    NO   
3   APR   13   31601      Friday   Padres    54  Cloudy     Night  NO    NO   
4   APR   14   46549    Saturday   Padres    57  Cloudy     Night  NO    NO   

  fireworks bobblehead  attend_000  
0        NO         NO      56.000  
1        NO         NO      29.729  
2        NO         NO      28.328  
3       YES         NO      31.601  
4        NO         NO      46.549  
In [21]:
## SUMMARY STATISTICS

np.mean(dodgers['attend'])
# np.min(dodgers['attend'])
# np.max(dodgers['attend'])

sns.boxplot(y="attend", data=dodgers)
plt.show()

# KO -- UPDATED TO CORRECTLY ORDER DAYS OF WEEK
sns.boxplot(x="day_of_week", y="attend", data=dodgers, color = "gray", 
            order=['Sunday','Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']);
g.add_legend();
plt.show()

# NUMBER OF PROMOTIONS - ONLY WORKS WITH BINARY VARIABLES

#cap = np.count_nonzero(dodgers['cap'])
#print(cap)
#shirt = np.count_nonzero(dodgers['shirt'])
#print(shirt)
#fire = np.count_nonzero(dodgers['fireworks'])
#print(fire)
#bob = np.count_nonzero(dodgers['bobblehead'])
#print(bob)

#print(cap + shirt + fire + bob)

sns.boxplot(x="month", y="attend", data=dodgers, color = "gray")
g.add_legend();
plt.show()
In [9]:
# exploratory data analysis: box plot for day of the week
# FROM MILLER (2015)
fig, axis = plt.subplots()
axis.set_xlabel('Day of Week')
axis.set_ylabel('Attendance (thousands)')
day_plot = plt.boxplot(data, sym='o', vert=1, whis=1.5)
plt.setp(day_plot['boxes'], color = 'black')    
plt.setp(day_plot['whiskers'], color = 'black')    
plt.setp(day_plot['fliers'], color = 'black', marker = 'o')
axis.set_xticklabels(ordered_day_names)
plt.show()
plt.savefig('fig_advert_promo_dodgers_eda_day_of_week_Python.pdf', 
    bbox_inches = 'tight', dpi=None, facecolor='w', edgecolor='b', 
    orientation='portrait', papertype=None, format=None, 
    transparent=True, pad_inches=0.25, frameon=None) 
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:15: MatplotlibDeprecationWarning: 
The frameon kwarg was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use facecolor instead.
  from ipykernel import kernelapp as app
<Figure size 432x288 with 0 Axes>
In [10]:
# exploratory data analysis: box plot for day of the week

april = dodgers[dodgers['month'] == 'APR']
may = dodgers[dodgers['month'] == 'MAY']
june = dodgers[dodgers['month'] == 'JUN']
july = dodgers[dodgers['month'] == 'JUL']
august = dodgers[dodgers['month'] == 'AUG']
september = dodgers[dodgers['month'] == 'SEP']
october = dodgers[dodgers['month'] == 'OCT']

data = [april['attend_000'], may['attend_000'], 
    june['attend_000'], july['attend_000'], 
    august['attend_000'], september['attend_000'], 
    october['attend_000']]
ordered_month_names = ['April', 'May', 'June', 'July', 'Aug', 'Sept', 'Oct']

fig, axis = plt.subplots()
axis.set_xlabel('Month')
axis.set_ylabel('Attendance (thousands)')
day_plot = plt.boxplot(data, sym='o', vert=1, whis=1.5)
plt.setp(day_plot['boxes'], color = 'black')    
plt.setp(day_plot['whiskers'], color = 'black')    
plt.setp(day_plot['fliers'], color = 'black', marker = 'o')
axis.set_xticklabels(ordered_month_names)
plt.show()

## CODE TO SAVE IMAGE FILE
# plt.savefig('fig_advert_promo_dodgers_eda_month_Python.pdf', 
#    bbox_inches = 'tight', dpi=None, facecolor='w', edgecolor='b', 
#    orientation='portrait', papertype=None, format=None, 
#    transparent=True, pad_inches=0.25, frameon=None)  
In [11]:
# trellis/lattice plot attendance by temp, conditioning on skies 
# and day_night with bobblehead NO/YES shown in distinct colors
import seaborn as sns

sns.set(style="darkgrid")

g = sns.FacetGrid(dodgers, col="skies", hue="fireworks", 
                  hue_order=["YES", "NO"],
                  hue_kws=dict(marker=["^", "v"]))
g.map(plt.scatter, "temp", "attend", alpha=.7)
g.add_legend();
plt.show()

g = sns.FacetGrid(dodgers, col="day_night", hue="fireworks",
                hue_order=["YES", "NO"],)
g.map(plt.scatter, "temp", "attend", alpha=.7)
g.add_legend();
plt.show()

g = sns.FacetGrid(dodgers, col="day_night", hue="bobblehead",
                hue_order=["YES", "NO"],)
g.map(plt.scatter, "temp", "attend", alpha=.7)
g.add_legend();
plt.show()
In [12]:
# DODGER PROMOTIONS BY TYPE

sns.countplot(y="fireworks", data=dodgers, palette="Blues_d",
             order=["YES", "NO"])
g.add_legend();
plt.show()

sns.countplot(y="bobblehead", hue="day_night",
                data=dodgers, palette="Blues_d",
                order=["YES", "NO"])
g.add_legend();
plt.show()
In [13]:
# Dodgers attendance by Visiting Team
# Figure 2.4 from Miller

# teams = sorted(["opponent"], reverse=True)
sns.swarmplot(x="attend", y="opponent", hue="day_night", 
              data=dodgers, order =ordered_team_names)
g.add_legend();
plt.show()
In [14]:
# CREATE A HISTOGRAM OF ATTENDANCE

# CREATE BINS

plt.hist(dodgers['attend'], normed = False, stacked = False, rwidth = .9)
plt.title("Attendance Histogram")
plt.xlabel('Attendance')
plt.ylabel('Frequency')

plt.show()
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: MatplotlibDeprecationWarning: 
The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
  """
In [15]:
# BLOCK FOR ORDERING DATA

# map day_of_week to ordered_day_of_week 
day_to_ordered_day = {'Monday' : '1Monday', 
    'Tuesday' : '2Tuesday', 
    'Wednesday' : '3Wednesday', 
    'Thursday' : '4Thursday', 
    'Friday' : '5Friday',
    'Saturday' : '6Saturday',
    'Sunday' : '7Sunday'}
dodgers['ordered_day_of_week'] = dodgers['day_of_week'].map(day_to_ordered_day)   

# map month to ordered_month
month_to_ordered_month = {'APR' : '1April', 
     'MAY' : '2May', 
     'JUN' : '3June', 
     'JUL' : '4July', 
     'AUG' : '5Aug',
     'SEP' : '6Sept',
     'OCT' : '7Oct'}
dodgers['ordered_month'] = dodgers['month'].map(month_to_ordered_month)    
In [25]:
dodgers.dtypes
Out[25]:
month                   object
day                      int64
attend                   int64
day_of_week             object
opponent                object
temp                     int64
skies                   object
day_night               object
cap                     object
shirt                   object
fireworks               object
bobblehead              object
attend_000             float64
ordered_day_of_week     object
ordered_month           object
runiform               float64
dtype: object
In [16]:
# employ training-and-test regimen for model validation
np.random.seed(1234)
dodgers['runiform'] = uniform.rvs(loc = 0, scale = 1, size = len(dodgers))
dodgers_train = dodgers[dodgers['runiform'] >= 0.33]
dodgers_test = dodgers[dodgers['runiform'] < 0.33]
# check training data frame
print('\ndodgers_train data frame (rows, columns): ',dodgers_train.shape)
print(dodgers_train.head())
# check test data frame
print('\ndodgers_test data frame (rows, columns): ',dodgers_test.shape)
print(dodgers_test.head())

# specify a simple model with bobblehead entered last
my_model = str('attend ~ ordered_month + ordered_day_of_week + bobblehead')

# fit the model to the training set
train_model_fit = smf.ols(my_model, data = dodgers_train).fit()
# summary of model fit to the training set
print(train_model_fit.summary())
# training set predictions from the model fit to the training set
dodgers_train['predict_attend'] = train_model_fit.fittedvalues

# test set predictions from the model fit to the training set
dodgers_test['predict_attend'] = train_model_fit.predict(dodgers_test)
 
  
dodgers_train data frame (rows, columns):  (57, 16)
  month  day  attend day_of_week opponent  temp   skies day_night cap shirt  \
1   APR   11   29729   Wednesday  Pirates    58  Cloudy     Night  NO    NO   
2   APR   12   28328    Thursday  Pirates    57  Cloudy     Night  NO    NO   
3   APR   13   31601      Friday   Padres    54  Cloudy     Night  NO    NO   
4   APR   14   46549    Saturday   Padres    57  Cloudy     Night  NO    NO   
7   APR   24   44014     Tuesday   Braves    63  Cloudy     Night  NO    NO   

  fireworks bobblehead  attend_000 ordered_day_of_week ordered_month  runiform  
1        NO         NO      29.729          3Wednesday        1April  0.622109  
2        NO         NO      28.328           4Thursday        1April  0.437728  
3       YES         NO      31.601             5Friday        1April  0.785359  
4        NO         NO      46.549           6Saturday        1April  0.779976  
7        NO         NO      44.014            2Tuesday        1April  0.801872  

dodgers_test data frame (rows, columns):  (24, 16)
   month  day  attend day_of_week   opponent  temp   skies day_night cap  \
0    APR   10   56000     Tuesday    Pirates    67  Clear        Day  NO   
5    APR   15   38359      Sunday     Padres    65  Clear        Day  NO   
6    APR   23   26376      Monday     Braves    60  Cloudy     Night  NO   
17   MAY   13   49124      Sunday    Rockies    70  Clear        Day  NO   
22   MAY   20   44005      Sunday  Cardinals    77  Clear      Night  NO   

   shirt fireworks bobblehead  attend_000 ordered_day_of_week ordered_month  \
0     NO        NO         NO      56.000            2Tuesday        1April   
5     NO        NO         NO      38.359             7Sunday        1April   
6     NO        NO         NO      26.376             1Monday        1April   
17    NO        NO         NO      49.124             7Sunday          2May   
22    NO        NO         NO      44.005             7Sunday          2May   

    runiform  
0   0.191519  
5   0.272593  
6   0.276464  
17  0.013768  
22  0.075381  
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.639
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     5.864
Date:                Tue, 14 Jan 2020   Prob (F-statistic):           4.70e-06
Time:                        10:37:03   Log-Likelihood:                -566.87
No. Observations:                  57   AIC:                             1162.
Df Residuals:                      43   BIC:                             1190.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
=====================================================================================================
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Intercept                          3.599e+04   3133.268     11.486      0.000    2.97e+04    4.23e+04
ordered_month[T.2May]             -3049.4581   2532.400     -1.204      0.235   -8156.529    2057.613
ordered_month[T.3June]             8776.8177   2979.201      2.946      0.005    2768.686    1.48e+04
ordered_month[T.4July]             3705.6266   3237.723      1.145      0.259   -2823.865    1.02e+04
ordered_month[T.5Aug]              2027.5628   2698.095      0.751      0.456   -3413.664    7468.789
ordered_month[T.6Sept]             1280.8090   2863.529      0.447      0.657   -4494.048    7055.666
ordered_month[T.7Oct]             -1234.2437   6407.369     -0.193      0.848   -1.42e+04    1.17e+04
ordered_day_of_week[T.2Tuesday]    4734.3477   3466.721      1.366      0.179   -2256.961    1.17e+04
ordered_day_of_week[T.3Wednesday]  -741.4427   3228.255     -0.230      0.819   -7251.839    5768.954
ordered_day_of_week[T.4Thursday]   -878.9914   3849.182     -0.228      0.820   -8641.606    6883.624
ordered_day_of_week[T.5Friday]     3420.7852   2885.730      1.185      0.242   -2398.844    9240.414
ordered_day_of_week[T.6Saturday]   3060.5245   2965.589      1.032      0.308   -2920.156    9041.205
ordered_day_of_week[T.7Sunday]     2796.4262   3153.300      0.887      0.380   -3562.810    9155.662
bobblehead[T.YES]                  1.233e+04   2683.012      4.595      0.000    6918.497    1.77e+04
==============================================================================
Omnibus:                        2.353   Durbin-Watson:                   2.084
Prob(Omnibus):                  0.308   Jarque-Bera (JB):                1.932
Skew:                           0.451   Prob(JB):                        0.381
Kurtosis:                       2.993   Cond. No.                         10.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
In [23]:
dodgers_train
Out[23]:
month day attend day_of_week opponent temp skies day_night cap shirt fireworks bobblehead attend_000 ordered_day_of_week ordered_month runiform predict_attend
1 APR 11 29729 Wednesday Pirates 58 Cloudy Night NO NO NO NO 29.729 3Wednesday 1April 0.622109 35248.243689
2 APR 12 28328 Thursday Pirates 57 Cloudy Night NO NO NO NO 28.328 4Thursday 1April 0.437728 35110.694994
3 APR 13 31601 Friday Padres 54 Cloudy Night NO NO YES NO 31.601 5Friday 1April 0.785359 39410.471508
4 APR 14 46549 Saturday Padres 57 Cloudy Night NO NO NO NO 46.549 6Saturday 1April 0.779976 39050.210817
7 APR 24 44014 Tuesday Braves 63 Cloudy Night NO NO NO NO 44.014 2Tuesday 1April 0.801872 40724.034076
8 APR 25 26345 Wednesday Braves 64 Cloudy Night NO NO NO NO 26.345 3Wednesday 1April 0.958139 35248.243689
9 APR 27 44807 Friday Nationals 66 Clear Night NO NO YES NO 44.807 5Friday 1April 0.875933 39410.471508
10 APR 28 54242 Saturday Nationals 71 Clear Night NO NO NO YES 54.242 6Saturday 1April 0.357817 51379.517195
11 APR 29 48753 Sunday Nationals 74 Clear Day NO YES NO NO 48.753 7Sunday 1April 0.500995 38786.112523
12 MAY 7 43713 Monday Giants 67 Clear Night NO NO NO NO 43.713 1Monday 2May 0.683463 32940.228208
13 MAY 8 32799 Tuesday Giants 75 Clear Night NO NO NO NO 32.799 2Tuesday 2May 0.712702 37674.575927
14 MAY 9 33993 Wednesday Giants 71 Clear Night NO NO NO NO 33.993 3Wednesday 2May 0.370251 32198.785539
15 MAY 11 35591 Friday Rockies 65 Clear Night NO NO YES NO 35.591 5Friday 2May 0.561196 36361.013359
16 MAY 12 33735 Saturday Rockies 65 Clear Night NO NO NO NO 33.735 6Saturday 2May 0.503083 36000.752668
18 MAY 14 24312 Monday Snakes 67 Clear Night NO NO NO NO 24.312 1Monday 2May 0.772827 32940.228208
19 MAY 15 47077 Tuesday Snakes 70 Clear Night NO NO NO YES 47.077 2Tuesday 2May 0.882641 50003.882305
20 MAY 18 40906 Friday Cardinals 64 Clear Night NO NO YES NO 40.906 5Friday 2May 0.364886 36361.013359
21 MAY 19 39383 Saturday Cardinals 67 Clear Night NO NO NO NO 39.383 6Saturday 2May 0.615396 36000.752668
23 MAY 25 36283 Friday Astros 59 Cloudy Night NO NO YES NO 36.283 5Friday 2May 0.368824 36361.013359
24 MAY 26 36561 Saturday Astros 61 Cloudy Night NO NO NO NO 36.561 6Saturday 2May 0.933140 36000.752668
25 MAY 27 33306 Sunday Astros 70 Clear Day NO NO NO NO 33.306 7Sunday 2May 0.651378 35736.654374
26 MAY 28 38016 Monday Brewers 73 Clear Night NO NO NO NO 38.016 1Monday 2May 0.397203 32940.228208
27 MAY 29 51137 Tuesday Brewers 74 Clear Night NO NO NO YES 51.137 2Tuesday 2May 0.788730 50003.882305
29 MAY 31 26773 Thursday Brewers 70 Clear Night NO NO NO NO 26.773 4Thursday 2May 0.568099 32061.236845
30 JUN 11 50559 Monday Angels 68 Clear Night NO YES NO NO 50.559 1Monday 3June 0.869127 44766.504047
31 JUN 12 55279 Tuesday Angels 66 Cloudy Night NO NO NO YES 55.279 2Tuesday 3June 0.436173 61830.158145
32 JUN 13 43494 Wednesday Angels 67 Clear Night NO NO NO NO 43.494 3Wednesday 3June 0.802148 44025.061379
34 JUN 16 45210 Saturday White Sox 68 Clear Night NO NO NO NO 45.210 6Saturday 3June 0.704261 47827.028508
35 JUN 17 53504 Sunday White Sox 74 Clear Day NO NO NO NO 53.504 7Sunday 3June 0.704581 47562.930214
37 JUN 29 49763 Friday Mets 72 Clear Night NO NO YES NO 49.763 5Friday 3June 0.924868 48187.289199
38 JUN 30 44217 Saturday Mets 78 Clear Day NO NO NO NO 44.217 6Saturday 3June 0.442141 47827.028508
39 JUL 1 55359 Sunday Mets 75 Clear Night NO NO NO YES 55.359 7Sunday 4July 0.909316 54821.045474
43 JUL 13 43873 Friday Padres 76 Cloudy Night NO NO YES NO 43.873 5Friday 4July 0.674881 43116.098082
44 JUL 14 54014 Saturday Padres 75 Clear Night NO NO NO YES 54.014 6Saturday 4July 0.594625 55085.143768
45 JUL 15 39715 Sunday Padres 77 Clear Day NO NO NO NO 39.715 7Sunday 4July 0.533310 42491.739097
47 JUL 17 53498 Tuesday Phillies 70 Clear Night NO NO NO NO 53.498 2Tuesday 4July 0.561433 44429.660649
49 JUL 30 33180 Monday Snakes 73 Clear Night NO NO NO NO 33.180 1Monday 4July 0.502967 39695.312930
51 AUG 1 36596 Wednesday Snakes 79 Clear Day NO NO NO NO 36.596 3Wednesday 5Aug 0.607194 37275.806532
52 AUG 3 43537 Friday Cubs 73 Clear Night NO NO YES NO 43.537 5Friday 5Aug 0.565945 41438.034351
54 AUG 5 42495 Sunday Cubs 83 Clear Day YES NO NO NO 42.495 7Sunday 5Aug 0.617442 40813.675366
55 AUG 6 32659 Monday Rockies 79 Clear Night NO NO NO NO 32.659 1Monday 5Aug 0.912123 38017.249200
56 AUG 7 55024 Tuesday Rockies 80 Clear Night NO NO NO YES 55.024 2Tuesday 5Aug 0.790524 55080.903297
57 AUG 8 37084 Wednesday Rockies 84 Clear Night NO NO NO NO 37.084 3Wednesday 5Aug 0.992081 37275.806532
58 AUG 20 36878 Monday Giants 80 Clear Night NO NO NO NO 36.878 1Monday 5Aug 0.958802 38017.249200
59 AUG 21 56000 Tuesday Giants 75 Clear Night NO NO NO YES 56.000 2Tuesday 5Aug 0.791964 55080.903297
61 AUG 24 39805 Friday Marlins 71 Clear Night NO NO YES NO 39.805 5Friday 5Aug 0.624917 41438.034351
62 AUG 25 40284 Saturday Marlins 70 Clear Night NO NO NO NO 40.284 6Saturday 5Aug 0.478094 41077.773660
64 AUG 30 54621 Thursday Snakes 80 Clear Night NO NO NO YES 54.621 4Thursday 5Aug 0.382317 49467.564215
66 SEP 1 35992 Saturday Snakes 81 Clear Night NO NO NO NO 35.992 6Saturday 6Sept 0.451648 40331.019770
67 SEP 2 31607 Sunday Snakes 89 Clear Day NO NO NO NO 31.607 7Sunday 6Sept 0.982005 40066.921476
70 SEP 5 50560 Wednesday Padres 77 Cloudy Night NO NO NO NO 50.560 3Wednesday 6Sept 0.738523 36529.052641
71 SEP 13 43309 Thursday Cardinals 80 Clear Night NO NO NO NO 43.309 4Thursday 6Sept 0.587304 36391.503946
72 SEP 14 40167 Friday Cardinals 85 Clear Night NO NO YES NO 40.167 5Friday 6Sept 0.471633 40691.280461
75 SEP 28 37133 Friday Rockies 77 Clear Night NO NO YES NO 37.133 5Friday 6Sept 0.899965 40691.280461
76 SEP 29 40724 Saturday Rockies 84 Cloudy Night NO NO NO NO 40.724 6Saturday 6Sept 0.416754 40331.019770
77 SEP 30 35607 Sunday Rockies 95 Clear Day NO NO NO NO 35.607 7Sunday 6Sept 0.535852 40066.921476
80 OCT 3 34014 Wednesday Giants 82 Cloudy Night NO NO NO NO 34.014 3Wednesday 7Oct 0.436893 34014.000000
In [17]:
# compute the proportion of response variance
# accounted for when predicting out-of-sample
print('\nProportion of Test Set Variance Accounted for: ',\
    round(np.power(dodgers_test['attend'].corr(dodgers_test['predict_attend']),2),3))

# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors 
my_model_fit = smf.ols(my_model, data = dodgers).fit()
print(my_model_fit.summary())

print('\nEstimated Effect of Bobblehead Promotion on Attendance: ',\
    round(my_model_fit.params[13],0))
    
# Suggestions for the student: Reproduce the figures in this chapter
# using matplotlib, ggplot, and/or rpy2 calls to R graphics. 
# Examine regression diagnostics for the fitted model.
# Examine other linear predictors and other explanatory variables.
# See if you can improve upon the model with variable transformations. 
Proportion of Test Set Variance Accounted for:  0.217
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 attend   R-squared:                       0.544
Model:                            OLS   Adj. R-squared:                  0.456
Method:                 Least Squares   F-statistic:                     6.158
Date:                Tue, 14 Jan 2020   Prob (F-statistic):           2.08e-07
Time:                        10:37:07   Log-Likelihood:                -813.52
No. Observations:                  81   AIC:                             1655.
Df Residuals:                      67   BIC:                             1689.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
=====================================================================================================
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Intercept                          3.391e+04   2521.806     13.446      0.000    2.89e+04    3.89e+04
ordered_month[T.2May]             -2385.6248   2291.216     -1.041      0.302   -6958.912    2187.662
ordered_month[T.3June]             7163.2336   2732.721      2.621      0.011    1708.699    1.26e+04
ordered_month[T.4July]             2849.8281   2578.600      1.105      0.273   -2297.079    7996.735
ordered_month[T.5Aug]              2377.9243   2402.915      0.990      0.326   -2418.314    7174.162
ordered_month[T.6Sept]               29.0302   2521.249      0.012      0.991   -5003.404    5061.464
ordered_month[T.7Oct]              -662.6677   4046.452     -0.164      0.870   -8739.419    7414.083
ordered_day_of_week[T.2Tuesday]    7911.4936   2702.208      2.928      0.005    2517.864    1.33e+04
ordered_day_of_week[T.3Wednesday]  2460.0232   2514.029      0.979      0.331   -2558.000    7478.046
ordered_day_of_week[T.4Thursday]    775.3638   3486.154      0.222      0.825   -6183.029    7733.757
ordered_day_of_week[T.5Friday]     4883.8183   2504.653      1.950      0.055    -115.490    9883.127
ordered_day_of_week[T.6Saturday]   6372.0558   2552.084      2.497      0.015    1278.075    1.15e+04
ordered_day_of_week[T.7Sunday]     6724.0027   2506.721      2.682      0.009    1720.567    1.17e+04
bobblehead[T.YES]                  1.071e+04   2419.520      4.429      0.000    5885.521    1.55e+04
==============================================================================
Omnibus:                        6.343   Durbin-Watson:                   2.130
Prob(Omnibus):                  0.042   Jarque-Bera (JB):                5.908
Skew:                           0.654   Prob(JB):                       0.0521
Kurtosis:                       3.205   Cond. No.                         9.76
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Estimated Effect of Bobblehead Promotion on Attendance:  10715.0
In [18]:
corr = dodgerDF.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
#f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
In [ ]: