import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # arrays and math functions
from scipy.stats import uniform # for training-and-test split
import statsmodels.api as sm # statistical models (including regression)
import statsmodels.formula.api as smf # R-like model specification
import matplotlib.pyplot as plt # 2D plotting
# Load the game-level attendance data into the module-level `dodgers` frame.
dodgers = pd.read_csv("dodgers_2.csv")
dodgers.head()

# Box plot of attendance per weekday, with the weekdays laid out Sunday-first.
weekday_order = [
    "Sunday",
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
]
sns.boxplot(
    data=dodgers,
    x="day_of_week",
    y="attend",
    order=weekday_order,
    color="gray",
)
# Rotate the tick labels so the weekday names do not overlap.
plt.xticks(rotation="vertical")
# Scale used to normalise attendance.
# NOTE(review): 56000 is presumably the stadium capacity -- confirm.
ATTEND_SCALE = 56000

dodgers['attend'].max()

# Attendance as a fraction of ATTEND_SCALE.  Plain column arithmetic gives the
# same float values as a row-wise DataFrame.apply without building one Series
# per row.
dodgers['attend_norm'] = dodgers['attend'] / ATTEND_SCALE
# Attendance as a percentage of ATTEND_SCALE.
dodgers['attend_n2'] = (dodgers['attend'] / ATTEND_SCALE) * 100

# Distinct opponents, and the number of games against each.
set(dodgers['opponent'].values)
dodgers['opponent'].value_counts()
# Linear fit of percentage attendance against temperature.
sns.regplot(data=dodgers, x="temp", y="attend_n2")

# Same relationship as scatter plots, coloured first by weekday and then by
# bobblehead promotion; marker size encodes the sky condition in both.
sns.scatterplot(
    data=dodgers,
    x="temp",
    y="attend_n2",
    hue="day_of_week",
    size="skies",
    palette="Set2",
)
sns.scatterplot(
    data=dodgers,
    x="temp",
    y="attend_n2",
    hue="bobblehead",
    size="skies",
    palette="Set2",
)

# Raw attendance against temperature, drawn with seaborn's "paper" context.
# NOTE(review): the original indentation was lost; the title call is assumed
# to belong inside the context block together with the plot -- confirm.
with sns.plotting_context("paper"):
    sns.scatterplot(
        data=dodgers,
        x="temp",
        y="attend",
        hue="day_of_week",
        size="skies",
        palette="Set2",
    )
    plt.title('Attend by Temp')

# One regression panel per weekday, wrapped three panels to a row.
sns.lmplot(data=dodgers, x="temp", y="attend_n2", col="day_of_week", col_wrap=3)
dodgers.dtypes

# Space-separated names of the columns to treat as categorical.  Kept as a
# single string because later code calls attrs.split().
attrs = "month day_of_week opponent skies day_night cap shirt fireworks bobblehead division"
# Convert all listed columns to the pandas "category" dtype in one call.
dodgers = dodgers.astype({column: "category" for column in attrs.split()})
dodgers.dtypes

# Bucket the temperature into four labelled ranges:
# (0, 60], (60, 70], (70, 80] and (80, 110].
temp_edges = [0, 60, 70, 80, 110]
temp_labels = ["below_sixty", "sixties", "seventies", "eighty_up"]
dodgers['temp_bins'] = pd.cut(dodgers["temp"], bins=temp_edges, labels=temp_labels)
dodgers.dtypes
import numpy as np
from scipy.stats import uniform

# Training-and-test regimen for model validation: give every row a uniform
# random draw on [0, 1) and send rows with a draw below 0.33 to the test set.
# The seed is fixed so the split is reproducible.
np.random.seed(1234)
dodgers['runiform'] = uniform.rvs(loc=0, scale=1, size=dodgers.shape[0])

is_test_row = dodgers['runiform'] < 0.33
dodgers_test = dodgers[is_test_row]
dodgers_train = dodgers[~is_test_row]
# # check training data frame
# print('\ndodgers_train data frame (rows, columns): ',dodgers_train.shape)
# print(dodgers_train.head())
# # check test data frame
# print('\ndodgers_test data frame (rows, columns): ',dodgers_test.shape)
# print(dodgers_test.head())
# # specify a simple model with bobblehead entered last
# my_model = str('attend ~ ordered_month + ordered_day_of_week + bobblehead')
# # fit the model to the training set
# train_model_fit = smf.ols(my_model, data = dodgers_train).fit()
# # summary of model fit to the training set
# print(train_model_fit.summary())
# # training set predictions from the model fit to the training set
# dodgers_train['predict_attend'] = train_model_fit.fittedvalues
# # test set predictions from the model fit to the training set
# dodgers_test['predict_attend'] = train_model_fit.predict(dodgers_test)
dodgers_test

from sklearn.model_selection import train_test_split

# Second, independent split: hold out 30% of the rows with a fixed seed.
train_set, test_set = train_test_split(dodgers, random_state=42, test_size=0.3)

# Size and first rows of the scikit-learn training split ...
print(train_set.shape[0])
train_set.head()

# ... next to the size and first rows of the uniform-draw training split.
print(dodgers_train.shape[0])
dodgers_train.head()
# Model with every candidate predictor, fitted on the scikit-learn training split.
my_model = (
    'attend ~ month + day + day_of_week + opponent + temp + skies'
    ' + day_night + bobblehead + division + temp_bins'
)
train_model_fit = smf.ols(formula=my_model, data=train_set).fit()
print(train_model_fit.summary())

# Reduced model with three predictors, fitted on the same training split.
my_model_2 = 'attend ~ division + bobblehead + day_night'
train_model_fit = smf.ols(formula=my_model_2, data=train_set).fit()
print(train_model_fit.summary())

dodgers.dtypes

# The reduced model again, this time fitted on the full data set.
train_model_fit = smf.ols(formula=my_model_2, data=dodgers).fit()
print(train_model_fit.summary())
def _bobblehead_effect(formula, data):
    """Fit an OLS model and return ``(fit, rounded bobblehead coefficient)``.

    The coefficient is located by its label rather than by position: the
    position of the bobblehead term in ``fit.params`` depends on how many
    dummy columns the other terms contribute, so a fixed integer index can
    silently report a different coefficient.

    Raises:
        KeyError: if the fitted model has no coefficient whose label starts
            with ``'bobblehead'``.
    """
    fit = smf.ols(formula, data=data).fit()
    is_bobblehead = fit.params.index.str.startswith('bobblehead')
    if not is_bobblehead.any():
        raise KeyError('no bobblehead coefficient in model {!r}'.format(formula))
    # A two-level bobblehead factor yields exactly one coefficient; take the
    # first match so the helper still returns a scalar if there are more.
    return fit, round(fit.params[is_bobblehead].iloc[0], 0)


def _report_bobblehead_effect(formula, data, label):
    """Print the bobblehead effect for ``formula`` on ``data``; return the fit."""
    fit, effect = _bobblehead_effect(formula, data)
    print('\nEstimated Effect of Bobblehead and ', label, ' Promotion on Attendance: ',
          effect)
    return fit


# Every categorical attribute except the two that are always in the model.
attrs_nobobble = [attr for attr in attrs.split() if attr not in ['bobblehead', 'division']]
attrs_nobobble

# Bobblehead effect when controlling for division plus one extra attribute,
# first on the full data set and then on the scikit-learn test split.
for _frame in (dodgers, test_set):
    for attr in attrs_nobobble:
        my_model = 'attend ~ division + bobblehead + ' + attr
        train_model_fit = _report_bobblehead_effect(my_model, _frame, attr)

# Bobblehead effect when controlling for month and day, on the full data set
# and then on the scikit-learn training split.  The label names the controls
# actually in the model instead of reusing the leftover loop variable.
my_model = 'attend ~ month + day + bobblehead'
for _frame in (dodgers, train_set):
    train_model_fit = _report_bobblehead_effect(my_model, _frame, 'month + day')