IST 718 | WK 2 | COACHES

O: Obtain

In [33]:
import pandas as pd
data = pd.read_excel("coaches_modify.xlsx")

S: Scrub

* Check and change types

In [34]:
data.dtypes
Out[34]:
School                    object
Conf                      object
Coach                     object
NCAAFBREV16              float64
MedianConfSal            float64
SchoolPay                float64
TotalPay                 float64
Bonus                    float64
BonusPaid                float64
PayPlusBonus2016         float64
StadSize                   int64
Graduation Rate (GSR)      int64
Seat Rank                  int64
GSRank                     int64
Combo Rank               float64
TrueRank                   int64
W                        float64
L                        float64
Ratio                    float64
OffenceScore             float64
Defense Score            float64
Score                    float64
PointsPerGame            float64
dtype: object
In [35]:
attrs = "School Conf Coach"

for attr in attrs.split():
    data[attr] = data[attr].astype("category")
In [36]:
data.dtypes
Out[36]:
School                   category
Conf                     category
Coach                    category
NCAAFBREV16               float64
MedianConfSal             float64
SchoolPay                 float64
TotalPay                  float64
Bonus                     float64
BonusPaid                 float64
PayPlusBonus2016          float64
StadSize                    int64
Graduation Rate (GSR)       int64
Seat Rank                   int64
GSRank                      int64
Combo Rank                float64
TrueRank                    int64
W                         float64
L                         float64
Ratio                     float64
OffenceScore              float64
Defense Score             float64
Score                     float64
PointsPerGame             float64
dtype: object

* Check for missing values

In [37]:
data.isnull().values.any()
Out[37]:
True
In [38]:
data.isnull().sum()
Out[38]:
School                    0
Conf                      0
Coach                     0
NCAAFBREV16              19
MedianConfSal             0
SchoolPay                 5
TotalPay                  5
Bonus                    22
BonusPaid                55
PayPlusBonus2016          4
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                        11
L                        11
Ratio                    11
OffenceScore             11
Defense Score            11
Score                    11
PointsPerGame            11
dtype: int64
In [39]:
# Drop schools without the value we are trying to predict

data.dropna(subset=['SchoolPay'], inplace=True)

# Drop the variables `BonusPaid` and `Bonus` as they are too close to
# What are are trying to predict

data.drop('BonusPaid', axis =1, inplace=True)
data.drop('Bonus', axis =1, inplace=True)
In [40]:
data.isnull().sum()
Out[40]:
School                    0
Conf                      0
Coach                     0
NCAAFBREV16              15
MedianConfSal             0
SchoolPay                 0
TotalPay                  0
PayPlusBonus2016          0
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                        10
L                        10
Ratio                    10
OffenceScore             10
Defense Score            10
Score                    10
PointsPerGame            10
dtype: int64
In [41]:
len(data)
Out[41]:
113
In [51]:
# Fill the remaining missing values with medians
# Keep the medians for later

medians = []
fill_with_median = "W, L, Ratio, OffenceScore, Defense Score, Score, PointsPerGame"
for tofill in fill_with_median.split(', '):
    median = data[tofill].median()
    medians.append({ tofill: median })
    data[tofill].fillna(median, inplace=True)
In [54]:
medians
Out[54]:
[{'W': 7.0},
 {'L': 6.0},
 {'Ratio': 1.1666666666666667},
 {'OffenceScore': 0.35},
 {'Defense Score': -0.3},
 {'Score': 1.68},
 {'PointsPerGame': 74.88}]
In [55]:
data.isnull().sum()
Out[55]:
School                   0
Conf                     0
Coach                    0
NCAAFBREV16              0
MedianConfSal            0
SchoolPay                0
TotalPay                 0
PayPlusBonus2016         0
StadSize                 0
Graduation Rate (GSR)    0
Seat Rank                0
GSRank                   0
Combo Rank               0
TrueRank                 0
W                        0
L                        0
Ratio                    0
OffenceScore             0
Defense Score            0
Score                    0
PointsPerGame            0
dtype: int64

E: Explore

In [57]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()
In [61]:
from pandas.plotting import scatter_matrix
attributes = ["TotalPay", 'W', 'L', 'TrueRank']
scatter_matrix(data[attributes], figsize=(12,8))
plt.show()
In [62]:
import seaborn as sns

sns.boxplot(y="TotalPay", data=data)
plt.show()
In [64]:
sns.boxplot(x="Conf", y="TotalPay", data=data, color = "gray");
plt.xticks(rotation="vertical")
plt.show()
In [67]:
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=Coaches)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.show()
In [ ]: