IST 718 LAB 3 | OBTAIN & SCRUB

OUTPUT: Clean CSV

O: OBTAIN

In [35]:
import pandas as pd
# data = pd.read_csv("Coaches9.csv")
# data = pd.read_csv("coaches2019.csv")
data = pd.read_csv("coaches_modify.csv")
In [36]:
data.head()
Out[36]:
School Conf Coach NCAAFBREV16 MedianConfSal SchoolPay TotalPay Bonus BonusPaid PayPlusBonus2016 ... GSRank Combo Rank TrueRank W L Ratio OffenceScore Defense Score Score PointsPerGame
0 Air Force Mt. West Troy Calhoun $59,577,780 $879,288 $885,000 $885,000 $247,000 NaN $885,000 ... 97 65 67 10.0 3.0 3.333333 4.96 -0.87 4.09 78.16
1 Akron MAC Terry Bowden $35,331,217 $492,413 $411,000 $412,500 $225,000 $50,000 $462,500 ... 5 11 129 5.0 7.0 0.714286 -0.40 -6.12 -6.52 72.64
2 Alabama SEC Nick Saban $174,307,419 $3,929,800 $8,307,000 $8,307,000 $1,100,000 $500,000 $8,807,000 ... 84 111 5 14.0 1.0 14.000000 8.01 17.62 25.62 90.38
3 Appalachian State Sun Belt Scott Satterfield $35,058,621 $675,000 $712,500 $712,500 $295,000 $145,000 $857,500 ... 13 20 122 10.0 3.0 3.333333 -0.86 9.68 8.83 70.76
4 Arizona Pac-12 Kevin Sumlin $90,976,758 $2,752,233 $1,600,000 $2,000,000 $2,025,000 NaN $2,000,000 ... 60 66 62 3.0 9.0 0.333333 1.01 -5.64 -4.63 74.42

5 rows × 23 columns

In [37]:
# data.columns = ['school', 'conf', 'coach', 'school_pay', 'total_pay', 'max_bonus', 'bonus_paid', 'asst_pay', 'buyout']
# data.columns = ['rank', 'school', 'conf', 'coach', 'school_pay', 'total_pay', 'max_bonus', 'bonus_paid', 'asst_pay', 'buyout']
data.columns = ['school', 'conf', 'coach', 'ncaafbrev16', 'median_conf_sal', 'school_pay', 'total_pay', 
                'max_bonus', 'bonus_paid', 'pay_plus_bonus', 'stad_size', 'grad_rate', 'seat_rank', 'grad_rate_rank', 
                'combo', 'true_rank', 'w', 'l', 'ratio', 'offensive_score', 'defensive_score', 'score', 'points_per_game']
data
Out[37]:
school conf coach ncaafbrev16 median_conf_sal school_pay total_pay max_bonus bonus_paid pay_plus_bonus ... grad_rate_rank combo true_rank w l ratio offensive_score defensive_score score points_per_game
0 Air Force Mt. West Troy Calhoun $59,577,780 $879,288 $885,000 $885,000 $247,000 NaN $885,000 ... 97 65 67 10.0 3.0 3.333333 4.96 -0.87 4.09 78.16
1 Akron MAC Terry Bowden $35,331,217 $492,413 $411,000 $412,500 $225,000 $50,000 $462,500 ... 5 11 129 5.0 7.0 0.714286 -0.40 -6.12 -6.52 72.64
2 Alabama SEC Nick Saban $174,307,419 $3,929,800 $8,307,000 $8,307,000 $1,100,000 $500,000 $8,807,000 ... 84 111 5 14.0 1.0 14.000000 8.01 17.62 25.62 90.38
3 Appalachian State Sun Belt Scott Satterfield $35,058,621 $675,000 $712,500 $712,500 $295,000 $145,000 $857,500 ... 13 20 122 10.0 3.0 3.333333 -0.86 9.68 8.83 70.76
4 Arizona Pac-12 Kevin Sumlin $90,976,758 $2,752,233 $1,600,000 $2,000,000 $2,025,000 NaN $2,000,000 ... 60 66 62 3.0 9.0 0.333333 1.01 -5.64 -4.63 74.42
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
113 Washington State Pac-12 Mike Leach $64,294,520 $2,752,233 $3,500,000 $3,500,000 $725,000 $75,000 $3,575,000 ... 121 83 37 8.0 5.0 1.600000 5.70 3.33 9.03 84.22
114 West Virginia Big 12 Dana Holgorsen $110,565,870 $3,775,000 $3,605,000 $3,617,500 $1,000,000 $90,000 $3,707,500 ... 20 66 62 10.0 3.0 3.333333 4.01 5.56 9.57 78.75
115 Western Michigan MAC Tim Lester $38,516,531 $492,413 $800,000 $800,000 $346,500 $39,250 $839,250 ... 60 39 100 13.0 1.0 13.000000 7.61 4.42 12.03 86.24
116 Wisconsin Big Ten Paul Chryst $143,420,668 $3,775,000 $3,750,000 $3,750,000 NaN $290,000 $4,040,000 ... 40 84 32 11.0 3.0 3.666667 2.13 13.17 15.30 78.74
117 Wyoming Mt. West Craig Bohl $40,372,222 $879,288 $1,412,000 $1,412,000 $450,000 $236,000 $1,648,000 ... 125 66 59 8.0 6.0 1.333333 9.04 -7.68 1.37 81.23

118 rows × 23 columns

S: SCRUB

In [38]:
attrs = "school conf coach".split() 
data[attrs] = data[attrs].astype('category')
In [40]:
# attrs = "school_pay total_pay max_bonus bonus_paid asst_pay buyout".split()
attrs = list(data[data.columns[4:]].columns)
data[attrs] = data[attrs].replace({'\$':'', ',': '', '--':0}, regex=True).astype('float')
In [41]:
data.isnull().sum()
Out[41]:
school              0
conf                0
coach               0
ncaafbrev16        19
median_conf_sal     0
school_pay          5
total_pay           5
max_bonus          22
bonus_paid         55
pay_plus_bonus      4
stad_size           0
grad_rate           0
seat_rank           0
grad_rate_rank      0
combo               0
true_rank           0
w                  11
l                  11
ratio              11
offensive_score    11
defensive_score    11
score              11
points_per_game    11
dtype: int64
In [43]:
data
Out[43]:
school conf coach ncaafbrev16 median_conf_sal school_pay total_pay max_bonus bonus_paid pay_plus_bonus ... grad_rate_rank combo true_rank w l ratio offensive_score defensive_score score points_per_game
0 Air Force Mt. West Troy Calhoun $59,577,780 879288.0 885000.0 885000.0 247000.0 NaN 885000.0 ... 97.0 65.0 67.0 10.0 3.0 3.333333 4.96 -0.87 4.09 78.16
1 Akron MAC Terry Bowden $35,331,217 492413.0 411000.0 412500.0 225000.0 50000.0 462500.0 ... 5.0 11.0 129.0 5.0 7.0 0.714286 -0.40 -6.12 -6.52 72.64
2 Alabama SEC Nick Saban $174,307,419 3929800.0 8307000.0 8307000.0 1100000.0 500000.0 8807000.0 ... 84.0 111.0 5.0 14.0 1.0 14.000000 8.01 17.62 25.62 90.38
3 Appalachian State Sun Belt Scott Satterfield $35,058,621 675000.0 712500.0 712500.0 295000.0 145000.0 857500.0 ... 13.0 20.0 122.0 10.0 3.0 3.333333 -0.86 9.68 8.83 70.76
4 Arizona Pac-12 Kevin Sumlin $90,976,758 2752233.0 1600000.0 2000000.0 2025000.0 NaN 2000000.0 ... 60.0 66.0 62.0 3.0 9.0 0.333333 1.01 -5.64 -4.63 74.42
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
113 Washington State Pac-12 Mike Leach $64,294,520 2752233.0 3500000.0 3500000.0 725000.0 75000.0 3575000.0 ... 121.0 83.0 37.0 8.0 5.0 1.600000 5.70 3.33 9.03 84.22
114 West Virginia Big 12 Dana Holgorsen $110,565,870 3775000.0 3605000.0 3617500.0 1000000.0 90000.0 3707500.0 ... 20.0 66.0 62.0 10.0 3.0 3.333333 4.01 5.56 9.57 78.75
115 Western Michigan MAC Tim Lester $38,516,531 492413.0 800000.0 800000.0 346500.0 39250.0 839250.0 ... 60.0 39.0 100.0 13.0 1.0 13.000000 7.61 4.42 12.03 86.24
116 Wisconsin Big Ten Paul Chryst $143,420,668 3775000.0 3750000.0 3750000.0 NaN 290000.0 4040000.0 ... 40.0 84.0 32.0 11.0 3.0 3.666667 2.13 13.17 15.30 78.74
117 Wyoming Mt. West Craig Bohl $40,372,222 879288.0 1412000.0 1412000.0 450000.0 236000.0 1648000.0 ... 125.0 66.0 59.0 8.0 6.0 1.333333 9.04 -7.68 1.37 81.23

118 rows × 23 columns

In [44]:
data.to_csv('v2_coaches_modify.csv', index=False)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

CLEANING OTHER DATA

In [58]:
import pandas as pd
winloss = pd.read_csv('win_loss_2019_wiki.csv')
In [59]:
winloss.sort_values(by="Team")
Out[59]:
Rank Team W L T Pct
13 10 Air Force (Mountain West) 11 2 0 0.846
129 130 Akron (MAC) 0 12 0 0.000
10 10 Alabama (SEC) 11 2 0 0.846
3 3 Appalachian St. (Sun Belt) 13 1 0 0.929
107 95 Arizona (Pac-12) 4 8 0 0.333
... ... ... ... ... ... ...
90 81 West Virginia (Big 12) 5 7 0 0.417
29 28 Western Ky. (C-USA) 9 4 0 0.692
64 57 Western Mich. (MAC) 7 6 0 0.538
26 27 Wisconsin (Big Ten) 10 4 0 0.714
45 34 Wyoming (Mountain West) 8 5 0 0.615

130 rows × 6 columns

In [60]:
winloss['Team'] = winloss['Team'].replace(regex=['St.'], value='State')
winloss['Team'] = winloss['Team'].replace(regex=['Southern'], value='so')
In [72]:
# school_small = ''.join(school.lower().replace('(','_').replace(')', '').split())

winloss['team_key'] = winloss.apply(lambda x: ''.join(x['Team'].split('(')[0].lower().split(' '))[:8], axis=1)
In [73]:
winloss
Out[73]:
Rank Team W L T Pct team_key
0 1 LSU (SEC) 15 0 0 1.000 lsu
1 2 Clemson (ACC) 14 1 0 0.933 clemson
2 3 Ohio State (Big Ten) 13 1 0 0.929 ohiostat
3 3 Appalachian State (Sun Belt) 13 1 0 0.929 appalach
4 5 Oregon (Pac-12) 12 2 0 0.857 oregon
... ... ... ... ... ... ... ...
125 121 New Mexico (Mountain West) 2 10 0 0.167 newmexic
126 127 Massachusetts (FBS Independent) 1 11 0 0.083 massachu
127 127 Old Dominion (C-USA) 1 11 0 0.083 olddomin
128 127 UTEP (C-USA) 1 11 0 0.083 utep
129 130 Akron (MAC) 0 12 0 0.000 akron

130 rows × 7 columns

In [74]:
winloss.sort_values(by="Team")[:50]
Out[74]:
Rank Team W L T Pct team_key
13 10 Air Force (Mountain West) 11 2 0 0.846 airforce
129 130 Akron (MAC) 0 12 0 0.000 akron
10 10 Alabama (SEC) 11 2 0 0.846 alabama
3 3 Appalachian State (Sun Belt) 13 1 0 0.929 appalach
107 95 Arizona (Pac-12) 4 8 0 0.333 arizona
33 34 Arizona State (Pac-12) 8 5 0 0.615 arizonas
121 121 Arkansas (SEC) 2 10 0 0.167 arkansas
34 34 Arkansas State (Sun Belt) 8 5 0 0.615 arkansas
93 94 Army West Point (FBS Independent) 5 8 0 0.385 armywest
27 28 Auburn (SEC) 9 4 0 0.692 auburn
69 57 BYU (FBS Independent) 7 6 0 0.538 byu
85 81 Ball State (MAC) 5 7 0 0.417 ballstat
20 17 Baylor (Big 12) 11 3 0 0.786 baylor
5 5 Boise State (Mountain West) 12 2 0 0.857 boisesta
76 73 Boston College (ACC) 6 7 0 0.462 bostonco
114 113 Bowling Green (MAC) 3 9 0 0.250 bowlingg
36 34 Buffalo (MAC) 8 5 0 0.615 buffalo
35 34 California (Pac-12) 8 5 0 0.615 californ
55 55 Central Mich. (MAC) 8 6 0 0.571 centralm
60 57 Charlotte (C-USA) 7 6 0 0.538 charlott
18 17 Cincinnati (AAC) 11 3 0 0.786 cincinna
1 2 Clemson (ACC) 14 1 0 0.933 clemson
82 81 Coastal Carolina (Sun Belt) 5 7 0 0.417 coastalc
81 81 Colorado (Pac-12) 5 7 0 0.417 colorado
108 95 Colorado State (Mountain West) 4 8 0 0.333 colorado
87 81 Duke (ACC) 5 7 0 0.417 duke
111 95 East Carolina (AAC) 4 8 0 0.333 eastcaro
74 73 Eastern Mich. (MAC) 6 7 0 0.462 easternm
72 73 FIU (C-USA) 6 7 0 0.462 fiu
16 17 Fla. Atlantic (C-USA) 11 3 0 0.786 fla.atla
9 10 Florida (SEC) 11 2 0 0.846 florida
77 73 Florida State (ACC) 6 7 0 0.462 floridas
109 95 Fresno State (Mountain West) 4 8 0 0.333 fresnost
68 57 Ga. so (Sun Belt) 7 6 0 0.538 ga.so
7 5 Georgia (SEC) 12 2 0 0.857 georgia
67 57 Georgia State (Sun Belt) 7 6 0 0.538 georgias
117 113 Georgia Tech (ACC) 3 9 0 0.250 georgiat
30 31 Hawaii (Mountain West) 10 5 0 0.667 hawaii
110 95 Houston (AAC) 4 8 0 0.333 houston
73 73 Illinois (Big Ten) 6 7 0 0.462 illinois
37 34 Indiana (Big Ten) 8 5 0 0.615 indiana
23 22 Iowa (Big Ten) 10 3 0 0.769 iowa
66 57 Iowa State (Big 12) 7 6 0 0.538 iowastat
112 113 Kansas (Big 12) 3 9 0 0.250 kansas
38 34 Kansas State (Big 12) 8 5 0 0.615 kansasst
65 57 Kent State (MAC) 7 6 0 0.538 kentstat
39 34 Kentucky (SEC) 8 5 0 0.615 kentucky
0 1 LSU (SEC) 15 0 0 1.000 lsu
92 81 La.-Monroe (Sun Belt) 5 7 0 0.417 la.-monr
40 34 Liberty (FBS Independent) 8 5 0 0.615 liberty
In [75]:
winloss.team_key.values
Out[75]:
array(['lsu', 'clemson', 'ohiostat', 'appalach', 'oregon', 'boisesta',
       'memphis', 'georgia', 'oklahoma', 'florida', 'alabama', 'navy',
       'minnesot', 'airforce', 'notredam', 'pennstat', 'fla.atla', 'utah',
       'cincinna', 'louisian', 'baylor', 'smu', 'ucf', 'iowa', 'louisian',
       'sandiego', 'wisconsi', 'auburn', 'michigan', 'westernk', 'hawaii',
       'uab', 'virginia', 'arizonas', 'arkansas', 'californ', 'buffalo',
       'indiana', 'kansasst', 'kentucky', 'liberty', 'louisvil',
       'washingt', 'virginia', 'wakefore', 'wyoming', 'marshall',
       'oklahoma', 'pittsbur', 'socalifo', 'temple', 'tennesse', 'texas',
       'texasa&m', 'miami', 'centralm', 'utahstat', 'tulane', 'somiss.',
       'ohio', 'charlott', 'northcar', 'nevada', 'michigan', 'westernm',
       'kentstat', 'iowastat', 'georgias', 'ga.so', 'byu', 'missouri',
       'toledo', 'fiu', 'illinois', 'easternm', 'mississi', 'bostonco',
       'floridas', 'washingt', 'miami', 'oregonst', 'colorado',
       'coastalc', 'sanjoses', 'tcu', 'ballstat', 'nebraska', 'duke',
       'troy', 'syracuse', 'westvirg', 'northern', 'la.-monr', 'armywest',
       'middlete', 'unlv', 'ncstatet', 'northtex', 'purdue', 'southcar',
       'southfla', 'statenfo', 'texastec', 'utsa', 'ucla', 'tulsa',
       'olemiss', 'arizona', 'colorado', 'fresnost', 'houston',
       'eastcaro', 'kansas', 'vanderbi', 'bowlingg', 'rice', 'northwes',
       'georgiat', 'texassta', 'maryland', 'newmexic', 'arkansas',
       'rutgers', 'southala', 'uconn', 'newmexic', 'massachu', 'olddomin',
       'utep', 'akron'], dtype=object)
In [76]:
winloss.to_csv('winloss_withkey.csv', index=False)
In [77]:
import pandas as pd
c_modify = pd.read_csv('coaches_modify.csv')
In [78]:
c_modify.columns
Out[78]:
Index(['School', 'Conf', 'Coach', ' NCAAFBREV16 ', 'MedianConfSal',
       ' SchoolPay ', ' TotalPay ', ' Bonus ', ' BonusPaid ',
       ' PayPlusBonus2016 ', ' StadSize ', ' Graduation Rate (GSR) ',
       ' Seat Rank ', ' GSRank ', ' Combo Rank ', ' TrueRank ', 'W', 'L',
       ' Ratio ', 'OffenceScore', 'Defense Score', 'Score', 'PointsPerGame'],
      dtype='object')
In [26]:
c_modify_small = pd.DataFrame()
In [27]:
c_modify_small['school'] = c_modify['School']
In [28]:
c_modify_small['stad_size'] = c_modify[' StadSize ']
In [29]:
c_modify_small['gsr'] = c_modify[' Graduation Rate (GSR) ']
In [30]:
c_modify_small['med_conf_sal'] = c_modify['MedianConfSal']
In [31]:
c_modify_small['seat_rank'] = c_modify[' Seat Rank ']
c_modify_small['combo_rank'] = c_modify[' Combo Rank ']
c_modify_small['true_rank'] = c_modify[' TrueRank ']
c_modify_small['gs_rank'] = c_modify[' GSRank ']
In [82]:
c_modify_small['coach'] = c_modify['Coach']
In [83]:
c_modify_small.isna().sum()
Out[83]:
school          0
stad_size       0
gsr             0
med_conf_sal    0
seat_rank       0
combo_rank      0
true_rank       0
gs_rank         0
team_key        0
coach           0
dtype: int64
In [96]:
# c_modify_small['team_key'] = c_modify_small.apply(lambda x: ''.join(x['school'].split('(')[0].lower().split(' '))[:8], axis=1)

c_modify_small['team_coach_key'] = c_modify_small.apply(lambda x: (''.join(x['coach'].split())+x['school'][:4]).lower(), axis=1)
In [97]:
c_modify_small
Out[97]:
school stad_size gsr med_conf_sal seat_rank combo_rank true_rank gs_rank team_key coach team_coach team_coach_key
0 Alabama 101,821 79 $3,929,800 124 111 5 84 alabama Nick Saban nicksabanalab nicksabanalab
1 Clemson 81,500 56 $2,458,033 114 81 39 10 clemson Dabo Swinney daboswinneyclem daboswinneyclem
2 Western Michigan 30,200 74 $492,413 26 39 100 60 westernm Tim Lester timlesterwest timlesterwest
3 Washington 70,500 93 $2,752,233 103 111 6 121 washingt Chris Petersen chrispetersenwash chrispetersenwash
4 Ohio State 104,944 99 $3,775,000 128 126 1 127 ohiostat Urban Meyer urbanmeyerohio urbanmeyerohio
... ... ... ... ... ... ... ... ... ... ... ... ...
113 Miami (Ohio) 24,286 73 $492,413 12 27 114 55 miami Chuck Martin chuckmartinmiam chuckmartinmiam
114 Mississippi 60,580 85 $3,929,800 84 91 22 107 mississi Matt Luke mattlukemiss mattlukemiss
115 Pittsburgh 68,400 84 $2,458,033 100 95 20 101 pittsbur Pat Narduzzi patnarduzzipitt patnarduzzipitt
116 Southern California 93,607 63 $2,752,233 122 71 53 15 southern Clay Helton clayheltonsout clayheltonsout
117 Southern Methodist 32,000 75 $2,458,033 36 35 107 67 southern Sonny Dykes sonnydykessout sonnydykessout

118 rows × 12 columns

In [98]:
c_modify_small.to_csv('c_modify_small.csv', index=False)
In [ ]: