IST 718 LAB 3 | OBTAIN & SCRUB

OUTPUT: Clean CSV

O: OBTAIN

In [18]:
import pandas as pd
data = pd.read_csv("Coaches9.csv")
# data = pd.read_csv("coaches2019.csv")
In [19]:
data.head()
Out[19]:
School Conference Coach SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout
0 Air Force Mt. West Troy Calhoun 885000 885000 247000 -- $0 --
1 Akron MAC Terry Bowden $411,000 $412,500 $225,000 $50,000 $0 $688,500
2 Alabama SEC Nick Saban $8,307,000 $8,307,000 $1,100,000 $500,000 $0 $33,600,000
3 Alabama at Birmingham C-USA Bill Clark $900,000 $900,000 $950,000 $165,471 $0 $3,847,500
4 Appalachian State Sun Belt Scott Satterfield $712,500 $712,500 $295,000 $145,000 $0 $2,160,417
In [20]:
# data.columns = ['rank', 'school', 'conf', 'coach', 'school_pay', 'total_pay', 'max_bonus', 'bonus_paid', 'asst_pay', 'buyout']
data.columns = ['school', 'conf', 'coach', 'school_pay', 'total_pay', 'max_bonus', 'bonus_paid', 'asst_pay', 'buyout']
data
Out[20]:
school conf coach school_pay total_pay max_bonus bonus_paid asst_pay buyout
0 Air Force Mt. West Troy Calhoun 885000 885000 247000 -- $0 --
1 Akron MAC Terry Bowden $411,000 $412,500 $225,000 $50,000 $0 $688,500
2 Alabama SEC Nick Saban $8,307,000 $8,307,000 $1,100,000 $500,000 $0 $33,600,000
3 Alabama at Birmingham C-USA Bill Clark $900,000 $900,000 $950,000 $165,471 $0 $3,847,500
4 Appalachian State Sun Belt Scott Satterfield $712,500 $712,500 $295,000 $145,000 $0 $2,160,417
... ... ... ... ... ... ... ... ... ...
124 West Virginia Big 12 Dana Holgorsen $3,605,000 $3,617,500 $1,000,000 $90,000 $0 $7,150,000
125 Western Kentucky C-USA Mike Sanford Jr. $800,000 $805,850 $400,000 $0 $0 $1,200,000
126 Western Michigan MAC Tim Lester $800,000 $800,000 $346,500 $39,250 $0 $800,000
127 Wisconsin Big Ten Paul Chryst $3,750,000 $3,750,000 -- $290,000 $0 $6,000,000
128 Wyoming Mt. West Craig Bohl $1,412,000 $1,412,000 $450,000 $236,000 $0 $8,016,667

129 rows × 9 columns

S: SCRUB

In [21]:
attrs = "school conf coach".split() 
data[attrs] = data[attrs].astype('category')
In [22]:
attrs = "school_pay total_pay max_bonus bonus_paid asst_pay buyout".split()
data[attrs] = data[attrs].replace({'\$':'', ',': '', '--':0}, regex=True).astype('float')
In [23]:
data.isnull().sum()
Out[23]:
school        0
conf          0
coach         0
school_pay    0
total_pay     0
max_bonus     0
bonus_paid    0
asst_pay      0
buyout        0
dtype: int64
In [24]:
data
Out[24]:
school conf coach school_pay total_pay max_bonus bonus_paid asst_pay buyout
0 Air Force Mt. West Troy Calhoun 885000.0 885000.0 247000.0 0.0 0.0 0.0
1 Akron MAC Terry Bowden 411000.0 412500.0 225000.0 50000.0 0.0 688500.0
2 Alabama SEC Nick Saban 8307000.0 8307000.0 1100000.0 500000.0 0.0 33600000.0
3 Alabama at Birmingham C-USA Bill Clark 900000.0 900000.0 950000.0 165471.0 0.0 3847500.0
4 Appalachian State Sun Belt Scott Satterfield 712500.0 712500.0 295000.0 145000.0 0.0 2160417.0
... ... ... ... ... ... ... ... ... ...
124 West Virginia Big 12 Dana Holgorsen 3605000.0 3617500.0 1000000.0 90000.0 0.0 7150000.0
125 Western Kentucky C-USA Mike Sanford Jr. 800000.0 805850.0 400000.0 0.0 0.0 1200000.0
126 Western Michigan MAC Tim Lester 800000.0 800000.0 346500.0 39250.0 0.0 800000.0
127 Wisconsin Big Ten Paul Chryst 3750000.0 3750000.0 0.0 290000.0 0.0 6000000.0
128 Wyoming Mt. West Craig Bohl 1412000.0 1412000.0 450000.0 236000.0 0.0 8016667.0

129 rows × 9 columns

In [25]:
data.to_csv('v2_coaches9.csv')
In [ ]: