import pandas as pd
data = pd.read_csv('coaches2019_clean.csv')
data.head()
data = pd.read_csv('coach_salary_df.csv')
data
attrs = list(data[data.columns[5:]].columns)
data[attrs] = data[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
data.columns = "id rank school conf coach school_pay year".split()
# data['year'] = data['year'].astype('int').astype('category')
data['year'] = data['year'].astype('int')
data
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
import statsmodels.formula.api as smf
my_model = str('school_pay ~ conf + year')
train_model_fit = smf.ols(my_model, data = train_set).fit()
print(train_model_fit.summary())
syr = data[data['school'] == 'Syracuse']
syr
# train_model_fit.params[0] + train_model_fit.params[1] + (train_model_fit.params[-1]*syr['2019.0'])
train_model_fit.params[0] + train_model_fit.params[1] + train_model_fit.params[-1]*syr[year]
kwiki = pd.read_csv('kwiki.csv')
kwiki.dropna(inplace=True)
kwiki
kwiki.columns = "school nickname city state enrollment conf".split()
kstatic = pd.read_csv('kstatic.csv')
kstatic.columns = "school conf coach stad_size gsr seat_rank gsr_rank combo_rank true_rank".split()
kstatic
kwiki_copy = kwiki.copy()
kstatic_copy = kstatic.copy()
# test = kstatic_copy.merge(kwiki_copy, on="school")
test = kwiki_copy.merge(kstatic_copy, on="school", how="left")
test
test.to_csv('kwiki_kstatic.csv', index=False)
kwiki_kstatic = test.copy()
kattend = pd.read_csv('kattend.csv')
kattend.columns = "rank_attend school conf total_attend avg_attend stad_size2 pct_capacity num_games_2019".split()
kattend
kwiki_kstatic_copy = kwiki_kstatic.copy()
kattend_copy = kattend.copy()
# test = kstatic_copy.merge(kwiki_copy, on="school")
test = kwiki_kstatic_copy.merge(kattend_copy, on="school", how="left")
test
kwiki_kstatic_kattend = test.copy()
kwiki_kstatic_kattend.to_csv('kwiki_kstatic_kattend.csv', index=False)
k00 = pd.read_csv('k00.csv')
k00.dropna(inplace=True)
import re
k00['school'] = k00.apply(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x['school']), axis=1 )
k00
kwiki_kstatic_kattend_copy = kwiki_kstatic_kattend.copy()
k00_copy = k00.copy()
test = kwiki_kstatic_kattend_copy.merge(k00_copy, on="school", how="left")
kwiki_kstatic_kattend_k00 = test.copy()
kwiki_kstatic_kattend_k00.to_csv('kwiki_kstatic_kattend_k00.csv', index=False)
kwiki_kstatic_kattend_k00
kcent = pd.read_csv('kcent.csv')
kcent.dropna(inplace=True)
kcent['school'] = kcent.apply(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x['school']), axis=1 )
kwiki_kstatic_kattend_k00_copy = kwiki_kstatic_kattend_k00.copy()
kcent_copy = kcent.copy()
test = kwiki_kstatic_kattend_k00_copy.merge(kcent_copy, on="school", how="left")
test
kwiki_kstatic_kattend_k00_kcent = test.copy()
kwiki_kstatic_kattend_k00_kcent.to_csv('kwiki_kstatic_kattend_k00_kcent.csv', index=False)
import glob
path = r'/Users/danielcaraway/Documents/IST_718_Big_Data/WK2/ncaa_vegas/'
all_og_files = glob.glob(path + "/*.csv")
# coach_df = pd.concat((pd.read_csv(f) for f in all_coach_files))
for f in all_og_files:
year = f.split('_')[6].split('.')[0]
df = pd.read_csv(f)
df['year'] = year
df.columns = ['school', 'W', 'L', 'C', 'N', 'P', 'AVG_PTS_FOR', 'AVG_PTS_AGT',
'AVG_MGN', 'AVE_RUN_YPG_FOR', 'AVG_PAS_YPG_FOR', 'AVG_TOT_YPG_FOR',
'AVG_RUN_YPG_AGT', 'AVG_PAS_YPG_AGT', 'AVG_TOT_YPG_AGT',
'AVG_NET_YPG', 'NET_TO', 'AVG_PWR_RTG', 'AVG_OPP_PWR_RTG',
'NET_PWR_RTG_CHG', 'LINE_POWR', 'GAME_PLAY_POWR', 'COMP_POWR',
'X_FCTR', 'vegas_year']
df.to_csv(year + '.csv', index=False)
# df = pd.read_csv(f)
# df[]
path = r'/Users/danielcaraway/Documents/IST_718_Big_Data/WK2/ncaa_vegas_years/'
all_vegas_files = glob.glob(path + "/*.csv")
vegas_df = pd.concat((pd.read_csv(f) for f in all_vegas_files))
kvegas = vegas_df.copy()
base = pd.read_csv('ncaa_vegas_years/2019.csv')
base.columns.values
kwiki_kstatic_kattend_k00_kcent_copy = kwiki_kstatic_kattend_k00_kcent.copy()
kvegas_copy = kvegas.copy()
test = kwiki_kstatic_kattend_k00_kcent_copy.merge(kvegas_copy, on="school", how="left")
test
kvegas_long = kvegas.copy()
kvegas_long.to_csv('kvegas_long.csv', index=False)
kvegas
kcoach2019 = pd.read_csv('kcoach_2019.csv')
attrs = kcoach2019.columns[4:]
kcoach2019[attrs] = kcoach2019[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
kwiki_kstatic_kattend_k00_kcent_copy = kwiki_kstatic_kattend_k00_kcent.copy()
kcoach2019_copy = kcoach2019.copy()
test = kwiki_kstatic_kattend_k00_kcent_copy.merge(kcoach2019_copy, on="school", how="left")
test
kcoach2018 = pd.read_csv('kcoach_2018.csv')
attrs = kcoach2018.columns[4:]
kcoach2018[attrs] = kcoach2018[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
kcoach2019_copy = kcoach2019.copy()
kcoach2018_copy = kcoach2018.copy()
test = kcoach2019_copy.merge(kcoach2018_copy, on="school", how="left")
test
kcoach1918 = test.copy()
kcoach2017 = pd.read_csv('kcoach_2017.csv')
attrs = kcoach2017.columns[4:]
kcoach2017[attrs] = kcoach2017[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
kcoach1918_copy = kcoach1918.copy()
kcoach2017_copy = kcoach2017.copy()
test = kcoach1918_copy.merge(kcoach2017_copy, on="school", how="left")
test
kcoach191817 = test.copy()
kcoach2016 = pd.read_csv('kcoach_2016.csv')
attrs = kcoach2016.columns[4:]
kcoach2016[attrs] = kcoach2016[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
kcoach191817_copy = kcoach191817.copy()
kcoach2016_copy = kcoach2016.copy()
test = kcoach191817_copy.merge(kcoach2016_copy, on="school", how="left")
test
kcoach19181716 = test.copy()
kcoach2015 = pd.read_csv('kcoach_2015.csv')
attrs = kcoach2015.columns[4:]
kcoach2015[attrs] = kcoach2015[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
kcoach19181716_copy = kcoach19181716.copy()
kcoach2015_copy = kcoach2015.copy()
test = kcoach19181716_copy.merge(kcoach2015_copy, on="school", how="left")
test
kcoach1918171615 = test.copy()
kcoach2014 = pd.read_csv('kcoach_2014.csv')
attrs = kcoach2014.columns[4:]
kcoach2014[attrs] = kcoach2014[attrs].replace({'\$':'', ',': '', '--':None}, regex=True).astype('float')
kcoach1918171615_copy = kcoach1918171615.copy()
kcoach2014_copy = kcoach2014.copy()
test = kcoach1918171615_copy.merge(kcoach2014_copy, on="school", how="left")
test
kcoach191817161514 = test.copy()
kcoach6years = test.copy()
kcoach6years.to_csv('kcoach6years.csv', index=False)
kwiki_kstatic_kattend_k00_kcent
kwiki_kstatic_kattend_k00_kcent_copy = kwiki_kstatic_kattend_k00_kcent.copy()
kcoach6years_copy = kcoach6years.copy()
test = kwiki_kstatic_kattend_k00_kcent_copy.merge(kcoach6years_copy, on="school", how="left")
test
kwiki_kstatic_kattend_k00_kcent_kcoach6years = test.copy()
kwiki_kstatic_kattend_k00_kcent_kcoach6years.to_csv('kwiki_kstatic_kattend_k00_kcent_kcoach6years.csv', index=False)
kwiki_kstatic_kattend_k00_kcent_kcoach6years.isnull().sum()
# s = pd.pivot_table(kvegas, index='school', columns=
# df_out = kvegas.groupby(['col0','col1','col2']).mean().unstack([1,2])
# df_out.columns = df_out.columns.map('_'.join)
# df_out.reset_index()
import glob
path = r'/Users/danielcaraway/Documents/IST_718_Big_Data/WK2/ncaa_vegas/'
all_og_files = glob.glob(path + "/*.csv")
columns = ['school', 'W', 'L', 'C', 'N', 'P', 'AVG_PTS_FOR', 'AVG_PTS_AGT',
'AVG_MGN', 'AVE_RUN_YPG_FOR', 'AVG_PAS_YPG_FOR', 'AVG_TOT_YPG_FOR',
'AVG_RUN_YPG_AGT', 'AVG_PAS_YPG_AGT', 'AVG_TOT_YPG_AGT',
'AVG_NET_YPG', 'NET_TO', 'AVG_PWR_RTG', 'AVG_OPP_PWR_RTG',
'NET_PWR_RTG_CHG', 'LINE_POWR', 'GAME_PLAY_POWR', 'COMP_POWR',
'X_FCTR', 'vegas_year']
# coach_df = pd.concat((pd.read_csv(f) for f in all_coach_files))
for f in all_og_files:
year = f.split('_')[6].split('.')[0]
df = pd.read_csv(f)
df['year'] = year
newcolumns = ['school']
renamed = [newcolumns.append(col + '_' + year) for col in columns[1:]]
df.columns = newcolumns
df.to_csv(year + '.csv', index=False)
# df = pd.read_csv(f)
# df[]
columns = ['school', 'W', 'L', 'C', 'N', 'P', 'AVG_PTS_FOR', 'AVG_PTS_AGT',
'AVG_MGN', 'AVE_RUN_YPG_FOR', 'AVG_PAS_YPG_FOR', 'AVG_TOT_YPG_FOR',
'AVG_RUN_YPG_AGT', 'AVG_PAS_YPG_AGT', 'AVG_TOT_YPG_AGT',
'AVG_NET_YPG', 'NET_TO', 'AVG_PWR_RTG', 'AVG_OPP_PWR_RTG',
'NET_PWR_RTG_CHG', 'LINE_POWR', 'GAME_PLAY_POWR', 'COMP_POWR',
'X_FCTR', 'vegas_year']
newcolumns = ['school']
renamed = [newcolumns.append(col + '_2018') for col in columns[1:]]
# newcolumns = ['school']
# newcolumns.append(renamed)
newcolumns
kvegas2019 = pd.read_csv('ncaa_vegas2/2019.csv')
kvegas2018 = pd.read_csv('ncaa_vegas2/2018.csv')
kvegas2019_copy = kvegas2019.copy()
kvegas2018_copy = kvegas2018.copy()
test = kvegas2019_copy.merge(kvegas2018_copy, on="school", how="left")
test
kvegas1918 = test.copy()
kvegas2017 = pd.read_csv('ncaa_vegas2/2017.csv')
kvegas1918_copy = kvegas1918.copy()
kvegas2017_copy = kvegas2017.copy()
test = kvegas1918_copy.merge(kvegas2017_copy, on="school", how="left")
test
kvegas191817 = test.copy()
kvegas2016 = pd.read_csv('ncaa_vegas2/2016.csv')
kvegas191817_copy = kvegas191817.copy()
kvegas2016_copy = kvegas2016.copy()
test = kvegas191817_copy.merge(kvegas2016_copy, on="school", how="left")
test
kvegas19181716 = test.copy()
kvegas19181716.to_csv('kvegas19181716.csv', index=False)
kwiki_kstatic_kattend_k00_kcent_kcoach6years_copy = kwiki_kstatic_kattend_k00_kcent_kcoach6years.copy()
kvegas4years = kvegas19181716.copy()
test = kwiki_kstatic_kattend_k00_kcent_kcoach6years_copy.merge(kvegas4years, on="school", how="left")
test
kwiki_kstatic_kattend_k00_kcent_kcoach6years_kvegas4years = test.copy()
kwiki_kstatic_kattend_k00_kcent_kcoach6years_kvegas4years.to_csv('kwiki_kstatic_kattend_k00_kcent_kcoach6years_kvegas4years.csv', index=False)
kmega = test.copy()
kmega.to_csv('kmega.csv', index=False)