IST 718 | WK 2 | COACHES

O: Obtain

In [98]:
import pandas as pd
data = pd.read_excel("coaches_modify.xlsx")

S: Scrub

* Check and change types

In [99]:
data.dtypes
Out[99]:
School                    object
Conf                      object
Coach                     object
NCAAFBREV16              float64
MedianConfSal            float64
SchoolPay                float64
TotalPay                 float64
Bonus                    float64
BonusPaid                float64
PayPlusBonus2016         float64
StadSize                   int64
Graduation Rate (GSR)      int64
Seat Rank                  int64
GSRank                     int64
Combo Rank               float64
TrueRank                   int64
W                        float64
L                        float64
Ratio                    float64
OffenceScore             float64
Defense Score            float64
Score                    float64
PointsPerGame            float64
dtype: object
In [100]:
attrs = "School Conf Coach"

for attr in attrs.split():
    data[attr] = data[attr].astype("category")
In [101]:
data.dtypes
Out[101]:
School                   category
Conf                     category
Coach                    category
NCAAFBREV16               float64
MedianConfSal             float64
SchoolPay                 float64
TotalPay                  float64
Bonus                     float64
BonusPaid                 float64
PayPlusBonus2016          float64
StadSize                    int64
Graduation Rate (GSR)       int64
Seat Rank                   int64
GSRank                      int64
Combo Rank                float64
TrueRank                    int64
W                         float64
L                         float64
Ratio                     float64
OffenceScore              float64
Defense Score             float64
Score                     float64
PointsPerGame             float64
dtype: object

* Check for missing values

In [102]:
data.isnull().values.any()
Out[102]:
True
In [103]:
data.isnull().sum()
Out[103]:
School                    0
Conf                      0
Coach                     0
NCAAFBREV16              19
MedianConfSal             0
SchoolPay                 5
TotalPay                  5
Bonus                    22
BonusPaid                55
PayPlusBonus2016          4
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                        11
L                        11
Ratio                    11
OffenceScore             11
Defense Score            11
Score                    11
PointsPerGame            11
dtype: int64

-- DROP ROWS

In [104]:
# Drop schools without the values we are trying to predict
# Or the attributes needed to predict them

data.dropna(subset=['SchoolPay'], inplace=True)
data.dropna(subset=['W'], inplace=True)
data.dropna(subset=['L'], inplace=True)
data.dropna(subset=['Ratio'], inplace=True)
In [105]:
data.isnull().sum()
Out[105]:
School                    0
Conf                      0
Coach                     0
NCAAFBREV16              12
MedianConfSal             0
SchoolPay                 0
TotalPay                  0
Bonus                    14
BonusPaid                43
PayPlusBonus2016          0
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                         0
L                         0
Ratio                     0
OffenceScore              0
Defense Score             0
Score                     0
PointsPerGame             0
dtype: int64
In [106]:
len(data)
Out[106]:
103

-- DROP COLUMNS

In [107]:
# Eventually we might need to drop columns/ deal with this missing data
# Attempt 1:

# Drop the variables `BonusPaid` and `Bonus` as they are too close to
# data.drop('BonusPaid', axis =1, inplace=True)
# data.drop('Bonus', axis =1, inplace=True)

-- FILL MEDIANS

In [108]:
# Fill the remaining missing values with medians
# Keep the medians for later

medians = []
fill_with_median = "W, L, Ratio, OffenceScore, Defense Score, Score, PointsPerGame"
for tofill in fill_with_median.split(', '):
    median = data[tofill].median()
    medians.append({ tofill: median })
    data[tofill].fillna(median, inplace=True)
In [109]:
# medians
In [110]:
data.isnull().sum()WOW
Out[110]:
School                    0
Conf                      0
Coach                     0
NCAAFBREV16              12
MedianConfSal             0
SchoolPay                 0
TotalPay                  0
Bonus                    14
BonusPaid                43
PayPlusBonus2016          0
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                         0
L                         0
Ratio                     0
OffenceScore              0
Defense Score             0
Score                     0
PointsPerGame             0
dtype: int64

E: Explore

In [111]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()
In [112]:
from pandas.plotting import scatter_matrix
attributes = ["TotalPay", 'W', 'L', 'TrueRank']
scatter_matrix(data[attributes], figsize=(12,8))
plt.show()
In [113]:
import seaborn as sns

sns.boxplot(y="TotalPay", data=data)
plt.show()
In [114]:
sns.boxplot(x="Conf", y="TotalPay", data=data, color = "gray");
plt.xticks(rotation="vertical")
plt.show()
In [115]:
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.show()
In [143]:
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)


schools = data['School']            
x_coords = data['W']
y_coords = data['TotalPay']
for i, school in enumerate(schools):
    try:
        x = x_coords[i]
        y = y_coords[i]
        plt.text(x+0.5, y+0.5, school, fontsize=9)
    except:
        pass
    
plt.show()
In [159]:
def make_scatterplot(conf_data, conf):
    p = sns.scatterplot(x="W", y="TotalPay", data=conf_data)
    p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)


    schools = conf_data['School'].values            
    x_coords = conf_data['W'].values
    y_coords = conf_data['TotalPay'].values
    for i, school in enumerate(schools):
        try:
            x = x_coords[i]
            y = y_coords[i]
            plt.text(x+0.5, y+0.5, school, fontsize=9)
        except:
            pass
    plt.title(conf)
    plt.show()
In [160]:
# universal = movies_original_df[movies_original_df.universal == True]
data.Conf.value_counts()
for conf in set(data.Conf.values):
    make_scatterplot(data[data.Conf == conf], conf)
# big_ten = data[data.Conf == 'Big Ten']
# make_scatterplot(big_ten)
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
In [156]:
big_ten
Out[156]:
School Conf Coach NCAAFBREV16 MedianConfSal SchoolPay TotalPay Bonus BonusPaid PayPlusBonus2016 ... GSRank Combo Rank TrueRank W L Ratio OffenceScore Defense Score Score PointsPerGame
40 Illinois Big Ten Lovie Smith 97447731.0 3775000.0 5000000.0 5000000.0 1000000.0 50000.0 5050000.0 ... 52 83.666667 32 3.0 9.0 0.333333 -5.92 -1.83 -7.75 69.69
41 Indiana Big Ten Tom Allen 106139192.0 3775000.0 1830000.0 1830000.0 1441668.0 NaN 1830000.0 ... 60 65.333333 64 6.0 7.0 0.857143 2.15 -0.77 1.38 74.18
42 Iowa Big Ten Kirk Ferentz 130681467.0 3775000.0 4700000.0 4700000.0 2875000.0 600000.0 5300000.0 ... 101 107.666667 9 8.0 5.0 1.600000 -0.51 7.31 6.81 74.25
53 Maryland Big Ten DJ Durkin 94881357.0 3775000.0 2512000.0 2512000.0 625000.0 NaN 2512000.0 ... 55 66.333333 59 6.0 7.0 0.857143 -1.82 1.89 0.07 74.00
58 Michigan Big Ten Jim Harbaugh 185173187.0 3775000.0 7504000.0 7504000.0 1325000.0 150000.0 7654000.0 ... 107 120.000000 3 10.0 3.0 3.333333 4.71 12.85 17.56 90.05
59 Michigan State Big Ten Mark Dantonio 126021377.0 3775000.0 4390417.0 4390417.0 650000.0 125000.0 4515417.0 ... 107 108.000000 7 3.0 9.0 0.333333 -1.21 -0.58 -1.79 74.72
60 Minnesota Big Ten P.J. Fleck 116376862.0 3775000.0 3550000.0 3550000.0 900000.0 NaN 3550000.0 ... 107 90.333333 23 9.0 4.0 2.250000 4.00 4.65 8.65 76.61
65 Nebraska Big Ten Scott Frost 120205090.0 3775000.0 5000000.0 5000000.0 950000.0 NaN 5000000.0 ... 40 89.666667 25 9.0 4.0 2.250000 -0.90 8.45 7.55 73.76
73 Northwestern Big Ten Pat Fitzgerald NaN 3775000.0 3619775.0 3619775.0 NaN NaN 3619775.0 ... 101 85.333333 30 7.0 6.0 1.166667 0.48 4.98 5.46 75.16
76 Ohio State Big Ten Urban Meyer 185409602.0 3775000.0 7600000.0 7600000.0 775000.0 350000.0 7950000.0 ... 127 126.333333 1 11.0 2.0 5.500000 3.69 15.13 18.82 88.76
81 Penn State Big Ten James Franklin 144017055.0 3775000.0 4800000.0 4800000.0 1000000.0 300000.0 5100000.0 ... 33 92.666667 21 11.0 3.0 3.666667 11.63 4.09 15.72 87.46
83 Purdue Big Ten Jeff Brohm 84841133.0 3775000.0 3800000.0 3800000.0 1290000.0 180000.0 3980000.0 ... 124 100.666667 15 3.0 9.0 0.333333 0.20 -9.16 -8.96 73.85
84 Rutgers Big Ten Chris Ash 96883027.0 3775000.0 2200000.0 2200000.0 960000.0 50000.0 2250000.0 ... 79 74.000000 48 2.0 10.0 0.200000 -9.42 2.45 -6.97 66.15
116 Wisconsin Big Ten Paul Chryst 143420668.0 3775000.0 3750000.0 3750000.0 NaN 290000.0 4040000.0 ... 40 83.666667 32 11.0 3.0 3.666667 2.13 13.17 15.30 78.74

14 rows × 23 columns

In [162]:
columns = ['TrueRank', 'TotalPay', 'W', 'L', 'Ratio', 'Conf']
df1 = pd.DataFrame(data, columns=columns)

sns.pairplot(df1, hue="Conf", height=2.5);
In [163]:
data.head()
Out[163]:
School Conf Coach NCAAFBREV16 MedianConfSal SchoolPay TotalPay Bonus BonusPaid PayPlusBonus2016 ... GSRank Combo Rank TrueRank W L Ratio OffenceScore Defense Score Score PointsPerGame
0 Air Force Mt. West Troy Calhoun 59577780.0 879288.0 885000.0 885000.0 247000.0 NaN 885000.0 ... 97 64.666667 67 10.0 3.0 3.333333 4.96 -0.87 4.09 78.16
1 Akron MAC Terry Bowden 35331217.0 492413.0 411000.0 412500.0 225000.0 50000.0 462500.0 ... 5 10.666667 129 5.0 7.0 0.714286 -0.40 -6.12 -6.52 72.64
2 Alabama SEC Nick Saban 174307419.0 3929800.0 8307000.0 8307000.0 1100000.0 500000.0 8807000.0 ... 84 111.333333 5 14.0 1.0 14.000000 8.01 17.62 25.62 90.38
3 Appalachian State Sun Belt Scott Satterfield 35058621.0 675000.0 712500.0 712500.0 295000.0 145000.0 857500.0 ... 13 19.666667 122 10.0 3.0 3.333333 -0.86 9.68 8.83 70.76
4 Arizona Pac-12 Kevin Sumlin 90976758.0 2752232.5 1600000.0 2000000.0 2025000.0 NaN 2000000.0 ... 60 65.666667 62 3.0 9.0 0.333333 1.01 -5.64 -4.63 74.42

5 rows × 23 columns

In [ ]: