01-22-20
import plotly
import plotly.graph_objects as go
import pandas as pd
df = pd.read_csv('ncaa_wikipedia.csv')
df
counts = pd.DataFrame(df['State'].value_counts())
counts.reset_index(inplace=True)
counts.columns = ['state', 'num_teams']
us_state_abbrev = {
'Alabama': 'AL',
'Alaska': 'AK',
'Arizona': 'AZ',
'Arkansas': 'AR',
'California': 'CA',
'Colorado': 'CO',
'Connecticut': 'CT',
'Delaware': 'DE',
'District of Columbia': 'DC',
'Florida': 'FL',
'Georgia': 'GA',
'Hawaii': 'HI',
'Idaho': 'ID',
'Illinois': 'IL',
'Indiana': 'IN',
'Iowa': 'IA',
'Kansas': 'KS',
'Kentucky': 'KY',
'Louisiana': 'LA',
'Maine': 'ME',
'Maryland': 'MD',
'Massachusetts': 'MA',
'Michigan': 'MI',
'Minnesota': 'MN',
'Mississippi': 'MS',
'Missouri': 'MO',
'Montana': 'MT',
'Nebraska': 'NE',
'Nevada': 'NV',
'New Hampshire': 'NH',
'New Jersey': 'NJ',
'New Mexico': 'NM',
'New York': 'NY',
'North Carolina': 'NC',
'North Dakota': 'ND',
'Northern Mariana Islands':'MP',
'Ohio': 'OH',
'Oklahoma': 'OK',
'Oregon': 'OR',
'Palau': 'PW',
'Pennsylvania': 'PA',
'Puerto Rico': 'PR',
'Rhode Island': 'RI',
'South Carolina': 'SC',
'South Dakota': 'SD',
'Tennessee': 'TN',
'Texas': 'TX',
'Utah': 'UT',
'Vermont': 'VT',
'Virgin Islands': 'VI',
'Virginia': 'VA',
'Washington': 'WA',
'West Virginia': 'WV',
'Wisconsin': 'WI',
'Wyoming': 'WY',
}
counts['abbrev'] = counts.apply(lambda x: us_state_abbrev[x['state']], axis=1)
fig = go.Figure(data=go.Choropleth(
locations=counts['abbrev'],
z = counts['num_teams'].astype(float),
locationmode = 'USA-states',
colorscale = 'Reds',
colorbar_title = "Number of Teams",
))
fig.update_layout(
title_text = 'Number of Teams in NCAA in D1 FBS',
geo_scope='usa',
)
fig.show()
coaches = pd.read_csv('all_coach_files.csv')
coaches
import numpy as np
coaches.replace('-', np.nan)
coaches = coaches.dropna(subset=['Division'])
coaches = coaches.dropna(subset=['Conference'])
coaches = coaches.dropna(subset=['Wins'])
coaches['Conference'].value_counts()
coaches['year_edits'] = coaches.apply(lambda x: x['Year'].split('-')[0] + "-01-01", axis=1)
coaches
clean = pd.DataFrame()
clean['date'] = coaches['year_edits']
clean['name'] = coaches['Conference']
# clean['value'] = coaches['Wins']
# clean = clean.dropna(subset=['value'])
clean['value'] = coaches['Wins'].astype('int')
clean
clean.to_csv('coaches_race.csv')
cleaner = clean[clean.date > '2000-01-01']
cleaner.to_csv('coaches_race_2000.csv')
endowments = pd.read_csv('endowments.csv')
endowments
endowments.groupby('school')
endowments_melt = endowments.melt(id_vars=["school", "code"],
var_name="date",
value_name="value")
endowments_melt.to_csv('endowments_melt.csv')
endowments_melt['date_edits'] = endowments_melt.apply(lambda x: x['date'] + "-01-01", axis=1)
def money_to_float(money):
try:
return money.split('$')[1]
except:
return 0
endowments_melt['value_edits'] = endowments_melt.apply(lambda x: money_to_float(x['value']) , axis=1)
endowments_melt
endowment_chart = endowments_melt.filter(['school', 'date_edits', 'value_edits'], axis=1)
endowment_chart
endowment_chart.columns = ["name","date", "value"]
endowment_chart.to_csv('endowments_clean.csv')
# endowments = pd.read_csv('endowments.csv')
import pandas as pd
import glob
path = r'/Users/danielcaraway/Documents/IST_718_Big_Data/WK2/coach_salary_2/' # use your path
# path = r'/Users/danielcaraway/Documents/IST_718_Big_Data/WK2/' # use your path
all_files = glob.glob(path + "/*.csv")
coach_salary_df = pd.concat((pd.read_csv(f) for f in all_files))
coach_salary_df.to_csv('coach_salary_df.csv')
data = coach_salary_df.copy()
data['value'] = data['SCHOOL PAY'].replace({'\$':'', ',': '', '--':0}, regex=True).astype('float')
coach_salary_df.head()
data['value_2'] = data['value'] * .0001
data['year_2'] = data.apply(lambda x: str(x['YEAR']).split('.')[0] + "-01-01", axis=1)
# endowments_melt['date_edits'] = endowments_melt.apply(lambda x: x['date'] + "-01-01", axis=1)
data
coaches_conf_chart = data.filter(['CONF', 'value_2', 'year_2'], axis=1)
coaches_conf_chart.columns = ["name","value", "date"]
coaches_conf_chart = coaches_conf_chart.dropna()
coaches_conf_chart.to_csv('coaches_conf_chart.csv')
coaches_school_chart = data.filter(['SCHOOL', 'value_2', 'year_2'], axis=1)
coaches_school_chart.columns = ["name","value", "date"]
# coaches_school_chart = coaches_school_chart.dropna()
coaches_school_chart.to_csv('coaches_school_chart.csv')
data
coaches_conf_chart.isna().sum()
coaches_conf_chart = coaches_conf_chart.dropna()
coaches_conf_chart.isna().sum()
coaches_conf_chart.to_csv('coaches_conf_chart.csv')
coaches_conf_chart.sort_values(by="date")