IST718 | LAB 3 | NCAA D1 FOOTBALL MAP

01-22-20

In [1]:
import plotly
import plotly.graph_objects as go
In [2]:
import pandas as pd
df = pd.read_csv('ncaa_wikipedia.csv')
In [3]:
df
Out[3]:
Team Nickname City State Unnamed: 4 Current Former First Joined
0 Air Force Falcons Colorado Springs Colorado 4,237 Mountain West WAC 1955 NaN
1 Akron Zips Akron Ohio 19,200 MAC OAC, MCC, OVC 1891 1987
2 Alabama Crimson Tide Tuscaloosa Alabama 38,500 SEC SIAA, SoCon 1892 NaN
3 Appalachian State Mountaineers Boone North Carolina 19,100 Sun Belt NSC, SoCon 1928 2014[n 1]
4 Arizona Wildcats Tucson Arizona 45,200 Pac-12 BIAA, WAC 1899 NaN
... ... ... ... ... ... ... ... ... ...
125 West Virginia Mountaineers Morgantown West Virginia 30,000 Big 12 SoCon, WVIAC, Big East 1891 NaN
126 Western Kentucky Hilltoppers Bowling Green Kentucky 20,300 C-USA SIAA, KIAC, OVC, MVFC, Sun Belt 1913 2009
127 Western Michigan Broncos Kalamazoo Michigan 22,900 MAC MCC 1905 NaN
128 Wisconsin Badgers Madison Wisconsin 44,400 Big Ten NaN 1889 NaN
129 Wyoming Cowboys Laramie Wyoming 12,400 Mountain West Colorado Football Association, RMAC, WAC 1893 NaN

130 rows × 9 columns

In [4]:
counts = pd.DataFrame(df['State'].value_counts())
In [5]:
counts.reset_index(inplace=True)
In [6]:
counts.columns = ['state', 'num_teams']
In [7]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}
In [8]:
counts['abbrev'] = counts.apply(lambda x: us_state_abbrev[x['state']], axis=1)
In [9]:
fig = go.Figure(data=go.Choropleth(
    locations=counts['abbrev'], 
    z = counts['num_teams'].astype(float), 
    locationmode = 'USA-states', 
    colorscale = 'Reds',
    colorbar_title = "Number of Teams",
))

fig.update_layout(
    title_text = 'Number of Teams in NCAA in D1 FBS',
    geo_scope='usa',
)

fig.show()
In [105]:
coaches = pd.read_csv('all_coach_files.csv')
In [106]:
coaches
Out[106]:
Year Head Coaches Division Conference Wins Losses Ties WL% Notes school_num school_name
0 2019-20 Frank Wilson FBS C-USA 4.0 8.0 0.0 0.333 NaN 706 utsa_c-usa
1 2018-19 Frank Wilson FBS C-USA 3.0 9.0 0.0 0.250 NaN 706 utsa_c-usa
2 2017-18 Frank Wilson FBS C-USA 6.0 5.0 0.0 0.545 NaN 706 utsa_c-usa
3 2016-17 Frank Wilson FBS C-USA 6.0 7.0 0.0 0.462 NaN 706 utsa_c-usa
4 2015-16 Larry Coker FBS C-USA 3.0 9.0 0.0 0.250 NaN 706 utsa_c-usa
... ... ... ... ... ... ... ... ... ... ... ...
14390 2012-13 Bobby Wilder FCS CAA 11.0 2.0 0.0 0.846 FCS Playoffs 523 olddominion_c-usa
14391 2011-12 Bobby Wilder FCS CAA 10.0 3.0 0.0 0.769 FCS Playoffs 523 olddominion_c-usa
14392 2010-11 Bobby Wilder FCS CAA 8.0 3.0 0.0 0.727 NaN 523 olddominion_c-usa
14393 2009-10 Bobby Wilder NaN - 9.0 2.0 0.0 0.818 First varsity season 523 olddominion_c-usa
14394 NaN NaN NaN NaN 77.0 56.0 0.0 0.579 NaN 523 olddominion_c-usa

14395 rows × 11 columns

In [107]:
import numpy as np
coaches.replace('-', np.nan)
Out[107]:
Year Head Coaches Division Conference Wins Losses Ties WL% Notes school_num school_name
0 2019-20 Frank Wilson FBS C-USA 4.0 8.0 0.0 0.333 NaN 706 utsa_c-usa
1 2018-19 Frank Wilson FBS C-USA 3.0 9.0 0.0 0.250 NaN 706 utsa_c-usa
2 2017-18 Frank Wilson FBS C-USA 6.0 5.0 0.0 0.545 NaN 706 utsa_c-usa
3 2016-17 Frank Wilson FBS C-USA 6.0 7.0 0.0 0.462 NaN 706 utsa_c-usa
4 2015-16 Larry Coker FBS C-USA 3.0 9.0 0.0 0.250 NaN 706 utsa_c-usa
... ... ... ... ... ... ... ... ... ... ... ...
14390 2012-13 Bobby Wilder FCS CAA 11.0 2.0 0.0 0.846 FCS Playoffs 523 olddominion_c-usa
14391 2011-12 Bobby Wilder FCS CAA 10.0 3.0 0.0 0.769 FCS Playoffs 523 olddominion_c-usa
14392 2010-11 Bobby Wilder FCS CAA 8.0 3.0 0.0 0.727 NaN 523 olddominion_c-usa
14393 2009-10 Bobby Wilder NaN NaN 9.0 2.0 0.0 0.818 First varsity season 523 olddominion_c-usa
14394 NaN NaN NaN NaN 77.0 56.0 0.0 0.579 NaN 523 olddominion_c-usa

14395 rows × 11 columns

In [108]:
coaches = coaches.dropna(subset=['Division'])
In [109]:
coaches = coaches.dropna(subset=['Conference'])
In [110]:
coaches = coaches.dropna(subset=['Wins'])
In [111]:
coaches['Conference'].value_counts()
Out[111]:
-                  11449
MAC                  260
SEC                  254
C-USA                240
Big Ten              240
ACC                  237
Big 12               225
Pac-12               218
Mountain West        196
AAC                  188
Sun Belt             162
WAC                  112
FBS Independent       38
Big South             28
SoCon                 25
Southland             15
Big West              10
Atlantic 10            9
CAA                    8
MVFC                   6
OVC                    3
ECAC                   2
ASUN                   2
Patriot                1
FCS Independent        1
Name: Conference, dtype: int64
In [112]:
coaches['year_edits'] = coaches.apply(lambda x: x['Year'].split('-')[0] + "-01-01", axis=1)
In [113]:
coaches
Out[113]:
Year Head Coaches Division Conference Wins Losses Ties WL% Notes school_num school_name year_edits
0 2019-20 Frank Wilson FBS C-USA 4.0 8.0 0.0 0.333 NaN 706 utsa_c-usa 2019-01-01
1 2018-19 Frank Wilson FBS C-USA 3.0 9.0 0.0 0.250 NaN 706 utsa_c-usa 2018-01-01
2 2017-18 Frank Wilson FBS C-USA 6.0 5.0 0.0 0.545 NaN 706 utsa_c-usa 2017-01-01
3 2016-17 Frank Wilson FBS C-USA 6.0 7.0 0.0 0.462 NaN 706 utsa_c-usa 2016-01-01
4 2015-16 Larry Coker FBS C-USA 3.0 9.0 0.0 0.250 NaN 706 utsa_c-usa 2015-01-01
... ... ... ... ... ... ... ... ... ... ... ... ...
14388 2014-15 Bobby Wilder FBS C-USA 6.0 6.0 0.0 0.500 Reclassifying to FBS 523 olddominion_c-usa 2014-01-01
14389 2013-14 Bobby Wilder FBS FBS Independent 8.0 4.0 0.0 0.667 Reclassifying to FBS 523 olddominion_c-usa 2013-01-01
14390 2012-13 Bobby Wilder FCS CAA 11.0 2.0 0.0 0.846 FCS Playoffs 523 olddominion_c-usa 2012-01-01
14391 2011-12 Bobby Wilder FCS CAA 10.0 3.0 0.0 0.769 FCS Playoffs 523 olddominion_c-usa 2011-01-01
14392 2010-11 Bobby Wilder FCS CAA 8.0 3.0 0.0 0.727 NaN 523 olddominion_c-usa 2010-01-01

13929 rows × 12 columns

In [114]:
clean = pd.DataFrame()
clean['date'] = coaches['year_edits']
In [115]:
clean['name'] = coaches['Conference']
In [116]:
# clean['value'] = coaches['Wins']
In [117]:
# clean = clean.dropna(subset=['value'])
clean['value'] = coaches['Wins'].astype('int')
In [118]:
clean
Out[118]:
date name value
0 2019-01-01 C-USA 4
1 2018-01-01 C-USA 3
2 2017-01-01 C-USA 6
3 2016-01-01 C-USA 6
4 2015-01-01 C-USA 3
... ... ... ...
14388 2014-01-01 C-USA 6
14389 2013-01-01 FBS Independent 8
14390 2012-01-01 CAA 11
14391 2011-01-01 CAA 10
14392 2010-01-01 CAA 8

13929 rows × 3 columns

In [119]:
clean.to_csv('coaches_race.csv')
In [127]:
cleaner = clean[clean.date > '2000-01-01']
In [129]:
cleaner.to_csv('coaches_race_2000.csv')
In [146]:
endowments = pd.read_csv('endowments.csv')
In [147]:
endowments
Out[147]:
school code 2018 2017 2016 2015 2014 2013 2012 2011 2010 2009 2008 2007
0 Amherst College MA $2.38 $2.25 $2.03 $2.19 $2.15 $1.82 $1.64 $1.64 $1.39 $1.31 $1.71 $1.66
1 Baylor College of Medicine TX $1.27 $1.17 $1.06 $1.10 $1.02 $0.87 $0.80 $0.84 $0.80 $0.73 $1.09 $1.28
2 Baylor University TX $1.31 $1.23 $1.14 $1.17 $1.15 $1.06 $0.96 $1.00 $0.87 $0.88 $1.06 $1.02
3 Berea College KY $1.20 $1.15 $1.05 $1.10 $1.14 $1.01 $0.94 $0.98 $0.85 $0.79 $1.02 $1.10
4 Boston College MA $2.48 $2.32 $2.06 $2.22 $2.13 $1.81 $1.65 $1.73 $1.48 $1.34 $1.63 $1.67
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99 University of Washington WA $2.76 $2.53 $2.97 $3.08 $2.83 $2.35 $2.11 $2.15 $1.91 $1.65 $2.26 $2.18
100 University of Wisconsin–Madison WI $2.99 $2.75 $2.42 $2.47 $2.33 $2.02 $1.81 $1.87 $1.55 $1.37 $1.74 $1.65
101 Virginia Commonwealth University VA $1.95 $1.84 $1.56 $1.64 $1.51 $1.33 $0.44 $0.35 $0.28 $0.25 $0.33 $0.33
102 Virginia Tech Foundation VA $1.15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
103 Washington State University WA $1.02 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

104 rows × 14 columns

In [148]:
endowments.groupby('school')
Out[148]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x114b36c10>
In [149]:
endowments_melt = endowments.melt(id_vars=["school", "code"], 
        var_name="date", 
        value_name="value")
In [150]:
endowments_melt.to_csv('endowments_melt.csv')
In [151]:
endowments_melt['date_edits'] = endowments_melt.apply(lambda x: x['date'] + "-01-01", axis=1)
In [155]:
def money_to_float(money):
    try: 
        return money.split('$')[1] 
    except: 
        return 0

endowments_melt['value_edits'] = endowments_melt.apply(lambda x: money_to_float(x['value']) , axis=1)
In [156]:
endowments_melt
Out[156]:
school code date value date_edits value_edits
0 Amherst College MA 2018 $2.38 2018-01-01 2.38
1 Baylor College of Medicine TX 2018 $1.27 2018-01-01 1.27
2 Baylor University TX 2018 $1.31 2018-01-01 1.31
3 Berea College KY 2018 $1.20 2018-01-01 1.20
4 Boston College MA 2018 $2.48 2018-01-01 2.48
... ... ... ... ... ... ...
1243 University of Washington WA 2007 $2.18 2007-01-01 2.18
1244 University of Wisconsin–Madison WI 2007 $1.65 2007-01-01 1.65
1245 Virginia Commonwealth University VA 2007 $0.33 2007-01-01 0.33
1246 Virginia Tech Foundation VA 2007 NaN 2007-01-01 0
1247 Washington State University WA 2007 NaN 2007-01-01 0

1248 rows × 6 columns

In [159]:
endowment_chart = endowments_melt.filter(['school', 'date_edits', 'value_edits'], axis=1)

endowment_chart
Out[159]:
school date_edits value_edits
0 Amherst College 2018-01-01 2.38
1 Baylor College of Medicine 2018-01-01 1.27
2 Baylor University 2018-01-01 1.31
3 Berea College 2018-01-01 1.20
4 Boston College 2018-01-01 2.48
... ... ... ...
1243 University of Washington 2007-01-01 2.18
1244 University of Wisconsin–Madison 2007-01-01 1.65
1245 Virginia Commonwealth University 2007-01-01 0.33
1246 Virginia Tech Foundation 2007-01-01 0
1247 Washington State University 2007-01-01 0

1248 rows × 3 columns

In [160]:
endowment_chart.columns = ["name","date", "value"]
In [163]:
endowment_chart.to_csv('endowments_clean.csv')
In [164]:
# endowments = pd.read_csv('endowments.csv')
In [171]:
import pandas as pd
import glob

path = r'/Users/danielcaraway/Documents/IST_718_Big_Data/WK2/coach_salary_2/' # use your path
# path = r'/Users/danielcaraway/Documents/IST_718_Big_Data/WK2/' # use your path
all_files = glob.glob(path + "/*.csv")
In [172]:
coach_salary_df = pd.concat((pd.read_csv(f) for f in all_files))
In [174]:
coach_salary_df.to_csv('coach_salary_df.csv')
In [177]:
data = coach_salary_df.copy()
data['value'] = data['SCHOOL PAY'].replace({'\$':'', ',': '', '--':0}, regex=True).astype('float')
In [176]:
coach_salary_df.head()
Out[176]:
RANK SCHOOL CONF COACH SCHOOL PAY YEAR
0 1.0 Alabama SEC Nick Saban $11,132,000 2017.0
1 2.0 Clemson ACC Dabo Swinney $8,504,600 2017.0
2 3.0 Michigan Big Ten Jim Harbaugh $7,004,000 2017.0
3 4.0 Ohio State Big Ten Urban Meyer $6,431,240 2017.0
4 5.0 Arizona Pac-12 Rich Rodriguez $5,631,563 2017.0
In [183]:
data['value_2'] = data['value'] * .0001
In [189]:
data['year_2'] = data.apply(lambda x: str(x['YEAR']).split('.')[0] + "-01-01", axis=1)

# endowments_melt['date_edits'] = endowments_melt.apply(lambda x: x['date'] + "-01-01", axis=1)
In [190]:
data
Out[190]:
RANK SCHOOL CONF COACH SCHOOL PAY YEAR value value_2 year_2
0 1.0 Alabama SEC Nick Saban $11,132,000 2017.0 11132000.0 1113.2000 2017-01-01
1 2.0 Clemson ACC Dabo Swinney $8,504,600 2017.0 8504600.0 850.4600 2017-01-01
2 3.0 Michigan Big Ten Jim Harbaugh $7,004,000 2017.0 7004000.0 700.4000 2017-01-01
3 4.0 Ohio State Big Ten Urban Meyer $6,431,240 2017.0 6431240.0 643.1240 2017-01-01
4 5.0 Arizona Pac-12 Rich Rodriguez $5,631,563 2017.0 5631563.0 563.1563 2017-01-01
... ... ... ... ... ... ... ... ... ...
125 126.0 Army INDEP Jeff Monken -- 2019.0 0.0 0.0000 2019-01-01
126 127.0 Liberty INDEP Hugh Freeze -- 2019.0 0.0 0.0000 2019-01-01
127 128.0 Brigham Young INDEP Kalani Sitake -- 2019.0 0.0 0.0000 2019-01-01
128 129.0 Air Force MWC Troy Calhoun -- 2019.0 0.0 0.0000 2019-01-01
129 130.0 Southern Methodist AAC Sonny Dykes -- 2019.0 0.0 0.0000 2019-01-01

777 rows × 9 columns

In [206]:
coaches_conf_chart = data.filter(['CONF', 'value_2', 'year_2'], axis=1)
coaches_conf_chart.columns = ["name","value", "date"]
coaches_conf_chart = coaches_conf_chart.dropna()
coaches_conf_chart.to_csv('coaches_conf_chart.csv')
In [196]:
coaches_school_chart = data.filter(['SCHOOL', 'value_2', 'year_2'], axis=1)
coaches_school_chart.columns = ["name","value", "date"]
# coaches_school_chart = coaches_school_chart.dropna()
coaches_school_chart.to_csv('coaches_school_chart.csv')
In [195]:
data
Out[195]:
RANK SCHOOL CONF COACH SCHOOL PAY YEAR value value_2 year_2
0 1.0 Alabama SEC Nick Saban $11,132,000 2017.0 11132000.0 1113.2000 2017-01-01
1 2.0 Clemson ACC Dabo Swinney $8,504,600 2017.0 8504600.0 850.4600 2017-01-01
2 3.0 Michigan Big Ten Jim Harbaugh $7,004,000 2017.0 7004000.0 700.4000 2017-01-01
3 4.0 Ohio State Big Ten Urban Meyer $6,431,240 2017.0 6431240.0 643.1240 2017-01-01
4 5.0 Arizona Pac-12 Rich Rodriguez $5,631,563 2017.0 5631563.0 563.1563 2017-01-01
... ... ... ... ... ... ... ... ... ...
125 126.0 Army INDEP Jeff Monken -- 2019.0 0.0 0.0000 2019-01-01
126 127.0 Liberty INDEP Hugh Freeze -- 2019.0 0.0 0.0000 2019-01-01
127 128.0 Brigham Young INDEP Kalani Sitake -- 2019.0 0.0 0.0000 2019-01-01
128 129.0 Air Force MWC Troy Calhoun -- 2019.0 0.0 0.0000 2019-01-01
129 130.0 Southern Methodist AAC Sonny Dykes -- 2019.0 0.0 0.0000 2019-01-01

777 rows × 9 columns

In [199]:
coaches_conf_chart.isna().sum()
Out[199]:
name     3
date     3
value    0
dtype: int64
In [200]:
coaches_conf_chart = coaches_conf_chart.dropna()
In [201]:
coaches_conf_chart.isna().sum()
Out[201]:
name     0
date     0
value    0
dtype: int64
In [202]:
coaches_conf_chart.to_csv('coaches_conf_chart.csv')
In [205]:
coaches_conf_chart.sort_values(by="date")
Out[205]:
name date value
129 AAC 0.00 2019-01-01
121 AAC 0.00 2014-01-01
123 AAC 0.00 2019-01-01
127 AAC 0.00 2016-01-01
126 ACC 0.00 2016-01-01
... ... ... ...
1 ACC 850.46 2017-01-01
1 SEC 870.70 2019-01-01
0 Big Ten 900.40 2016-01-01
0 ACC 925.50 2019-01-01
0 SEC 1113.20 2017-01-01

774 rows × 3 columns

In [ ]: