IST 718 | WK 2 | COACHES | LAB 3

O: Obtain

In [55]:
import pandas as pd
data = pd.read_csv("Coaches9.csv")

S: Scrub

* Check and change types

In [56]:
data.dtypes
Out[56]:
School          object
Conference      object
Coach           object
SchoolPay       object
TotalPay        object
Bonus           object
BonusPaid       object
AssistantPay    object
Buyout          object
dtype: object
In [57]:
data.head()
Out[57]:
School Conference Coach SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout
0 Air Force Mt. West Troy Calhoun 885000 885000 247000 -- $0 --
1 Akron MAC Terry Bowden $411,000 $412,500 $225,000 $50,000 $0 $688,500
2 Alabama SEC Nick Saban $8,307,000 $8,307,000 $1,100,000 $500,000 $0 $33,600,000
3 Alabama at Birmingham C-USA Bill Clark $900,000 $900,000 $950,000 $165,471 $0 $3,847,500
4 Appalachian State Sun Belt Scott Satterfield $712,500 $712,500 $295,000 $145,000 $0 $2,160,417
In [58]:
attrs = "School Conference Coach"

for attr in attrs.split():
    data[attr] = data[attr].astype("category")
In [59]:
data.dtypes
Out[59]:
School          category
Conference      category
Coach           category
SchoolPay         object
TotalPay          object
Bonus             object
BonusPaid         object
AssistantPay      object
Buyout            object
dtype: object
In [61]:
# attrs = "SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout"

# for attr in attrs.split():
#     try:
#         data[attr] = data[attr].replace('--',0)
#         data[attr] = data[attr].replace('[\$--,]', '', regex=True).astype(float)
#     except:
#         pass

# data.head()


attrs = "SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout"
attrs = attrs.split()

data[attrs] = data[attrs].replace({'\$':'', ',': '','--': 0}, regex=True)
data.head()
Out[61]:
School Conference Coach SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout
0 Air Force Mt. West Troy Calhoun 885000 885000 247000 0 0 0
1 Akron MAC Terry Bowden 411000 412500 225000 50000 0 688500
2 Alabama SEC Nick Saban 8307000 8307000 1100000 500000 0 33600000
3 Alabama at Birmingham C-USA Bill Clark 900000 900000 950000 165471 0 3847500
4 Appalachian State Sun Belt Scott Satterfield 712500 712500 295000 145000 0 2160417
In [46]:
data.dtypes
Out[46]:
School          category
Conference      category
Coach           category
SchoolPay        float64
TotalPay         float64
Bonus            float64
BonusPaid        float64
AssistantPay     float64
Buyout           float64
dtype: object

* Check for missing values

In [62]:
data.isnull().values.any()
Out[62]:
False
In [63]:
data.isnull().sum()
Out[63]:
School          0
Conference      0
Coach           0
SchoolPay       0
TotalPay        0
Bonus           0
BonusPaid       0
AssistantPay    0
Buyout          0
dtype: int64

-- DROP ROWS

In [64]:
# Drop schools without the values we are trying to predict
# Or the attributes needed to predict them

# data.dropna(subset=['SchoolPay'], inplace=True)
# data.dropna(subset=['W'], inplace=True)
# data.dropna(subset=['L'], inplace=True)
# data.dropna(subset=['Ratio'], inplace=True)
In [65]:
data.isnull().sum()
Out[65]:
School          0
Conference      0
Coach           0
SchoolPay       0
TotalPay        0
Bonus           0
BonusPaid       0
AssistantPay    0
Buyout          0
dtype: int64
In [66]:
len(data)
Out[66]:
129

-- DROP COLUMNS

In [107]:
# Eventually we might need to drop columns/ deal with this missing data
# Attempt 1:

# Drop the variables `BonusPaid` and `Bonus` as they are too close to
# data.drop('BonusPaid', axis =1, inplace=True)
# data.drop('Bonus', axis =1, inplace=True)

-- FILL MEDIANS

In [108]:
# Fill the remaining missing values with medians
# Keep the medians for later

medians = []
fill_with_median = "W, L, Ratio, OffenceScore, Defense Score, Score, PointsPerGame"
for tofill in fill_with_median.split(', '):
    median = data[tofill].median()
    medians.append({ tofill: median })
    data[tofill].fillna(median, inplace=True)
In [109]:
# medians
In [110]:
data.isnull().sum()
Out[110]:
School                    0
Conf                      0
Coach                     0
NCAAFBREV16              12
MedianConfSal             0
SchoolPay                 0
TotalPay                  0
Bonus                    14
BonusPaid                43
PayPlusBonus2016          0
StadSize                  0
Graduation Rate (GSR)     0
Seat Rank                 0
GSRank                    0
Combo Rank                0
TrueRank                  0
W                         0
L                         0
Ratio                     0
OffenceScore              0
Defense Score             0
Score                     0
PointsPerGame             0
dtype: int64

E: Explore

In [111]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins = 50, figsize=(20,15))
plt.show()
In [112]:
from pandas.plotting import scatter_matrix
attributes = ["TotalPay", 'W', 'L', 'TrueRank']
scatter_matrix(data[attributes], figsize=(12,8))
plt.show()
In [113]:
import seaborn as sns

sns.boxplot(y="TotalPay", data=data)
plt.show()
In [114]:
sns.boxplot(x="Conf", y="TotalPay", data=data, color = "gray");
plt.xticks(rotation="vertical")
plt.show()
In [115]:
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.show()
In [143]:
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)


schools = data['School']            
x_coords = data['W']
y_coords = data['TotalPay']
for i, school in enumerate(schools):
    try:
        x = x_coords[i]
        y = y_coords[i]
        plt.text(x+0.5, y+0.5, school, fontsize=9)
    except:
        pass
    
plt.show()
In [159]:
def make_scatterplot(conf_data, conf):
    p = sns.scatterplot(x="W", y="TotalPay", data=conf_data)
    p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)


    schools = conf_data['School'].values            
    x_coords = conf_data['W'].values
    y_coords = conf_data['TotalPay'].values
    for i, school in enumerate(schools):
        try:
            x = x_coords[i]
            y = y_coords[i]
            plt.text(x+0.5, y+0.5, school, fontsize=9)
        except:
            pass
    plt.title(conf)
    plt.show()
In [160]:
# universal = movies_original_df[movies_original_df.universal == True]
data.Conf.value_counts()
for conf in set(data.Conf.values):
    make_scatterplot(data[data.Conf == conf], conf)
# big_ten = data[data.Conf == 'Big Ten']
# make_scatterplot(big_ten)
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
In [156]:
big_ten
Out[156]:
School Conf Coach NCAAFBREV16 MedianConfSal SchoolPay TotalPay Bonus BonusPaid PayPlusBonus2016 ... GSRank Combo Rank TrueRank W L Ratio OffenceScore Defense Score Score PointsPerGame
40 Illinois Big Ten Lovie Smith 97447731.0 3775000.0 5000000.0 5000000.0 1000000.0 50000.0 5050000.0 ... 52 83.666667 32 3.0 9.0 0.333333 -5.92 -1.83 -7.75 69.69
41 Indiana Big Ten Tom Allen 106139192.0 3775000.0 1830000.0 1830000.0 1441668.0 NaN 1830000.0 ... 60 65.333333 64 6.0 7.0 0.857143 2.15 -0.77 1.38 74.18
42 Iowa Big Ten Kirk Ferentz 130681467.0 3775000.0 4700000.0 4700000.0 2875000.0 600000.0 5300000.0 ... 101 107.666667 9 8.0 5.0 1.600000 -0.51 7.31 6.81 74.25
53 Maryland Big Ten DJ Durkin 94881357.0 3775000.0 2512000.0 2512000.0 625000.0 NaN 2512000.0 ... 55 66.333333 59 6.0 7.0 0.857143 -1.82 1.89 0.07 74.00
58 Michigan Big Ten Jim Harbaugh 185173187.0 3775000.0 7504000.0 7504000.0 1325000.0 150000.0 7654000.0 ... 107 120.000000 3 10.0 3.0 3.333333 4.71 12.85 17.56 90.05
59 Michigan State Big Ten Mark Dantonio 126021377.0 3775000.0 4390417.0 4390417.0 650000.0 125000.0 4515417.0 ... 107 108.000000 7 3.0 9.0 0.333333 -1.21 -0.58 -1.79 74.72
60 Minnesota Big Ten P.J. Fleck 116376862.0 3775000.0 3550000.0 3550000.0 900000.0 NaN 3550000.0 ... 107 90.333333 23 9.0 4.0 2.250000 4.00 4.65 8.65 76.61
65 Nebraska Big Ten Scott Frost 120205090.0 3775000.0 5000000.0 5000000.0 950000.0 NaN 5000000.0 ... 40 89.666667 25 9.0 4.0 2.250000 -0.90 8.45 7.55 73.76
73 Northwestern Big Ten Pat Fitzgerald NaN 3775000.0 3619775.0 3619775.0 NaN NaN 3619775.0 ... 101 85.333333 30 7.0 6.0 1.166667 0.48 4.98 5.46 75.16
76 Ohio State Big Ten Urban Meyer 185409602.0 3775000.0 7600000.0 7600000.0 775000.0 350000.0 7950000.0 ... 127 126.333333 1 11.0 2.0 5.500000 3.69 15.13 18.82 88.76
81 Penn State Big Ten James Franklin 144017055.0 3775000.0 4800000.0 4800000.0 1000000.0 300000.0 5100000.0 ... 33 92.666667 21 11.0 3.0 3.666667 11.63 4.09 15.72 87.46
83 Purdue Big Ten Jeff Brohm 84841133.0 3775000.0 3800000.0 3800000.0 1290000.0 180000.0 3980000.0 ... 124 100.666667 15 3.0 9.0 0.333333 0.20 -9.16 -8.96 73.85
84 Rutgers Big Ten Chris Ash 96883027.0 3775000.0 2200000.0 2200000.0 960000.0 50000.0 2250000.0 ... 79 74.000000 48 2.0 10.0 0.200000 -9.42 2.45 -6.97 66.15
116 Wisconsin Big Ten Paul Chryst 143420668.0 3775000.0 3750000.0 3750000.0 NaN 290000.0 4040000.0 ... 40 83.666667 32 11.0 3.0 3.666667 2.13 13.17 15.30 78.74

14 rows × 23 columns

In [162]:
columns = ['TrueRank', 'TotalPay', 'W', 'L', 'Ratio', 'Conf']
df1 = pd.DataFrame(data, columns=columns)

sns.pairplot(df1, hue="Conf", height=2.5);
In [ ]:
levers = ['conf', 'bonus', 'buyout', 'rank', 'W', 'L', 'ratio']
for lever in levers:
    m = 'school_pay ~ conf + ' + lever
    my_model = str(m)
    train_model_fit = smf.ols(my_model, data = train_set).fit()
    print('\nEstimated Salary given Conference and ', lever, ' : ', 
          round(train_model_fit.params[1],0))