IST 718 | WK 2 | COACHES¶

O: Obtain¶

import pandas as pd
data = pd.read_excel("coaches_modify.xlsx")

S: Scrub¶

* Check and change types¶

data.dtypes

School                    object
Conf                      object
Coach                     object
NCAAFBREV16              float64
MedianConfSal            float64
SchoolPay                float64
TotalPay                 float64
Bonus                    float64
BonusPaid                float64
PayPlusBonus2016         float64
StadSize                   int64
Graduation Rate (GSR)      int64
Seat Rank                  int64
GSRank                     int64
Combo Rank               float64
TrueRank                   int64
W                        float64
L                        float64
Ratio                    float64
OffenceScore             float64
Defense Score            float64
Score                    float64
PointsPerGame            float64
dtype: object

attrs = "School Conf Coach"

for attr in attrs.split():
    data[attr] = data[attr].astype("category")

data.dtypes

School                   category
Conf                     category
Coach                    category
NCAAFBREV16               float64
MedianConfSal             float64
SchoolPay                 float64
TotalPay                  float64
Bonus                     float64
BonusPaid                 float64
PayPlusBonus2016          float64
StadSize                    int64
Graduation Rate (GSR)       int64
Seat Rank                   int64
GSRank                      int64
Combo Rank                float64
TrueRank                    int64
W                         float64
L                         float64
Ratio                     float64
OffenceScore              float64
Defense Score             float64
Score                     float64
PointsPerGame             float64
dtype: object

* Check for missing values¶

data.isnull().values.any()

True

data.isnull().sum()

School                    0
Conf                      0
Coach                     0
NCAAFBREV16              19
MedianConfSal             0
SchoolPay                 5
TotalPay                  5
Bonus                    22
BonusPaid                55
PayPlusBonus2016          4
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                        11
L                        11
Ratio                    11
OffenceScore             11
Defense Score            11
Score                    11
PointsPerGame            11
dtype: int64

-- DROP ROWS¶

# Drop schools without the values we are trying to predict
# Or the attributes needed to predict them

data.dropna(subset=['SchoolPay'], inplace=True)
data.dropna(subset=['W'], inplace=True)
data.dropna(subset=['L'], inplace=True)
data.dropna(subset=['Ratio'], inplace=True)

data.isnull().sum()

School                    0
Conf                      0
Coach                     0
NCAAFBREV16              12
MedianConfSal             0
SchoolPay                 0
TotalPay                  0
Bonus                    14
BonusPaid                43
PayPlusBonus2016          0
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                         0
L                         0
Ratio                     0
OffenceScore              0
Defense Score             0
Score                     0
PointsPerGame             0
dtype: int64

len(data)

103

-- DROP COLUMNS¶

# Eventually we might need to drop columns/ deal with this missing data
# Attempt 1:

# Drop the variables `BonusPaid` and `Bonus` as they are too close to
# data.drop('BonusPaid', axis =1, inplace=True)
# data.drop('Bonus', axis =1, inplace=True)

-- FILL MEDIANS¶

# Fill the remaining missing values with medians
# Keep the medians for later

medians = []
fill_with_median = "W, L, Ratio, OffenceScore, Defense Score, Score, PointsPerGame"
for tofill in fill_with_median.split(', '):
    median = data[tofill].median()
    medians.append({ tofill: median })
    data[tofill].fillna(median, inplace=True)

# medians

data.isnull().sum()WOW

School                    0
Conf                      0
Coach                     0
NCAAFBREV16              12
MedianConfSal             0
SchoolPay                 0
TotalPay                  0
Bonus                    14
BonusPaid                43
PayPlusBonus2016          0
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                         0
L                         0
Ratio                     0
OffenceScore              0
Defense Score             0
Score                     0
PointsPerGame             0
dtype: int64

E: Explore¶

%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()

from pandas.plotting import scatter_matrix
attributes = ["TotalPay", 'W', 'L', 'TrueRank']
scatter_matrix(data[attributes], figsize=(12,8))
plt.show()

import seaborn as sns

sns.boxplot(y="TotalPay", data=data)
plt.show()

sns.boxplot(x="Conf", y="TotalPay", data=data, color = "gray");
plt.xticks(rotation="vertical")
plt.show()

p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.show()

p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)


schools = data['School']            
x_coords = data['W']
y_coords = data['TotalPay']
for i, school in enumerate(schools):
    try:
        x = x_coords[i]
        y = y_coords[i]
        plt.text(x+0.5, y+0.5, school, fontsize=9)
    except:
        pass
    
plt.show()

def make_scatterplot(conf_data, conf):
    p = sns.scatterplot(x="W", y="TotalPay", data=conf_data)
    p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)


    schools = conf_data['School'].values            
    x_coords = conf_data['W'].values
    y_coords = conf_data['TotalPay'].values
    for i, school in enumerate(schools):
        try:
            x = x_coords[i]
            y = y_coords[i]
            plt.text(x+0.5, y+0.5, school, fontsize=9)
        except:
            pass
    plt.title(conf)
    plt.show()

# universal = movies_original_df[movies_original_df.universal == True]
data.Conf.value_counts()
for conf in set(data.Conf.values):
    make_scatterplot(data[data.Conf == conf], conf)
# big_ten = data[data.Conf == 'Big Ten']
# make_scatterplot(big_ten)

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

No handles with labels found to put in legend.

big_ten

columns = ['TrueRank', 'TotalPay', 'W', 'L', 'Ratio', 'Conf']
df1 = pd.DataFrame(data, columns=columns)

sns.pairplot(df1, hue="Conf", height=2.5);

data.head()

	School	Conf	Coach	NCAAFBREV16	MedianConfSal	SchoolPay	TotalPay	Bonus	BonusPaid	PayPlusBonus2016	...	GSRank	Combo Rank	TrueRank	W	L	Ratio	OffenceScore	Defense Score	Score	PointsPerGame
40	Illinois	Big Ten	Lovie Smith	97447731.0	3775000.0	5000000.0	5000000.0	1000000.0	50000.0	5050000.0	...	52	83.666667	32	3.0	9.0	0.333333	-5.92	-1.83	-7.75	69.69
41	Indiana	Big Ten	Tom Allen	106139192.0	3775000.0	1830000.0	1830000.0	1441668.0	NaN	1830000.0	...	60	65.333333	64	6.0	7.0	0.857143	2.15	-0.77	1.38	74.18
42	Iowa	Big Ten	Kirk Ferentz	130681467.0	3775000.0	4700000.0	4700000.0	2875000.0	600000.0	5300000.0	...	101	107.666667	9	8.0	5.0	1.600000	-0.51	7.31	6.81	74.25
53	Maryland	Big Ten	DJ Durkin	94881357.0	3775000.0	2512000.0	2512000.0	625000.0	NaN	2512000.0	...	55	66.333333	59	6.0	7.0	0.857143	-1.82	1.89	0.07	74.00
58	Michigan	Big Ten	Jim Harbaugh	185173187.0	3775000.0	7504000.0	7504000.0	1325000.0	150000.0	7654000.0	...	107	120.000000	3	10.0	3.0	3.333333	4.71	12.85	17.56	90.05
59	Michigan State	Big Ten	Mark Dantonio	126021377.0	3775000.0	4390417.0	4390417.0	650000.0	125000.0	4515417.0	...	107	108.000000	7	3.0	9.0	0.333333	-1.21	-0.58	-1.79	74.72
60	Minnesota	Big Ten	P.J. Fleck	116376862.0	3775000.0	3550000.0	3550000.0	900000.0	NaN	3550000.0	...	107	90.333333	23	9.0	4.0	2.250000	4.00	4.65	8.65	76.61
65	Nebraska	Big Ten	Scott Frost	120205090.0	3775000.0	5000000.0	5000000.0	950000.0	NaN	5000000.0	...	40	89.666667	25	9.0	4.0	2.250000	-0.90	8.45	7.55	73.76
73	Northwestern	Big Ten	Pat Fitzgerald	NaN	3775000.0	3619775.0	3619775.0	NaN	NaN	3619775.0	...	101	85.333333	30	7.0	6.0	1.166667	0.48	4.98	5.46	75.16
76	Ohio State	Big Ten	Urban Meyer	185409602.0	3775000.0	7600000.0	7600000.0	775000.0	350000.0	7950000.0	...	127	126.333333	1	11.0	2.0	5.500000	3.69	15.13	18.82	88.76
81	Penn State	Big Ten	James Franklin	144017055.0	3775000.0	4800000.0	4800000.0	1000000.0	300000.0	5100000.0	...	33	92.666667	21	11.0	3.0	3.666667	11.63	4.09	15.72	87.46
83	Purdue	Big Ten	Jeff Brohm	84841133.0	3775000.0	3800000.0	3800000.0	1290000.0	180000.0	3980000.0	...	124	100.666667	15	3.0	9.0	0.333333	0.20	-9.16	-8.96	73.85
84	Rutgers	Big Ten	Chris Ash	96883027.0	3775000.0	2200000.0	2200000.0	960000.0	50000.0	2250000.0	...	79	74.000000	48	2.0	10.0	0.200000	-9.42	2.45	-6.97	66.15
116	Wisconsin	Big Ten	Paul Chryst	143420668.0	3775000.0	3750000.0	3750000.0	NaN	290000.0	4040000.0	...	40	83.666667	32	11.0	3.0	3.666667	2.13	13.17	15.30	78.74

	School	Conf	Coach	NCAAFBREV16	MedianConfSal	SchoolPay	TotalPay	Bonus	BonusPaid	PayPlusBonus2016	...	GSRank	Combo Rank	TrueRank	W	L	Ratio	OffenceScore	Defense Score	Score	PointsPerGame
0	Air Force	Mt. West	Troy Calhoun	59577780.0	879288.0	885000.0	885000.0	247000.0	NaN	885000.0	...	97	64.666667	67	10.0	3.0	3.333333	4.96	-0.87	4.09	78.16
1	Akron	MAC	Terry Bowden	35331217.0	492413.0	411000.0	412500.0	225000.0	50000.0	462500.0	...	5	10.666667	129	5.0	7.0	0.714286	-0.40	-6.12	-6.52	72.64
2	Alabama	SEC	Nick Saban	174307419.0	3929800.0	8307000.0	8307000.0	1100000.0	500000.0	8807000.0	...	84	111.333333	5	14.0	1.0	14.000000	8.01	17.62	25.62	90.38
3	Appalachian State	Sun Belt	Scott Satterfield	35058621.0	675000.0	712500.0	712500.0	295000.0	145000.0	857500.0	...	13	19.666667	122	10.0	3.0	3.333333	-0.86	9.68	8.83	70.76
4	Arizona	Pac-12	Kevin Sumlin	90976758.0	2752232.5	1600000.0	2000000.0	2025000.0	NaN	2000000.0	...	60	65.666667	62	3.0	9.0	0.333333	1.01	-5.64	-4.63	74.42