import pandas as pd
# Load the coaches table and look at the raw column types / first rows.
data = pd.read_csv("Coaches9.csv")
data.dtypes
data.head()
# Store the identifier-like text columns as pandas categoricals.
attrs = "School Conference Coach"
data = data.astype({attr: "category" for attr in attrs.split()})
data.dtypes
# attrs = "SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout"
# for attr in attrs.split():
# try:
# data[attr] = data[attr].replace('--',0)
# data[attr] = data[attr].replace('[\$--,]', '', regex=True).astype(float)
# except:
# pass
# data.head()
# Turn the currency-formatted pay columns into real numbers.
attrs = "SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout".split()
data[attrs] = (
    data[attrs]
    # Raw string so `\$` reaches the regex engine as an escaped dollar sign.
    # A cell matching '--' is replaced with 0; '$' and ',' are stripped.
    .replace({r'\$': '', ',': '', '--': 0}, regex=True)
    # Without this step the columns stay `object` (digit strings mixed with
    # int 0), so medians, histograms and pay plots cannot treat them as numbers.
    .apply(pd.to_numeric)
)
data.head()
data.dtypes
# Is anything missing at all, and how many values are missing per column?
data.isna().to_numpy().any()
data.isna().sum()
# Drop schools without the values we are trying to predict
# Or the attributes needed to predict them
# data.dropna(subset=['SchoolPay'], inplace=True)
# data.dropna(subset=['W'], inplace=True)
# data.dropna(subset=['L'], inplace=True)
# data.dropna(subset=['Ratio'], inplace=True)
# Re-check the per-column counts and the number of rows.
data.isna().sum()
data.shape[0]
# Eventually we might need to drop columns/ deal with this missing data
# Attempt 1:
# Drop the variables `BonusPaid` and `Bonus` as they are too close to the pay values being predicted (TODO: original sentence was left unfinished - confirm)
# data.drop('BonusPaid', axis =1, inplace=True)
# data.drop('Bonus', axis =1, inplace=True)
# Fill the remaining missing values with medians
# Keep the medians for later
# One {column: median} entry is recorded per imputed column, in the order
# the columns are processed.
medians = []
# NOTE(review): the column names are taken verbatim from this string
# ("OffenceScore", "Defense Score") - confirm they match the data's headers.
fill_with_median = "W, L, Ratio, OffenceScore, Defense Score, Score, PointsPerGame"
for tofill in fill_with_median.split(', '):
    median = data[tofill].median()
    medians.append({tofill: median})
    # Assign the result back instead of `data[tofill].fillna(..., inplace=True)`:
    # the chained in-place form operates on an intermediate object, which pandas
    # warns about and which does not update `data` under Copy-on-Write.
    data[tofill] = data[tofill].fillna(median)
# medians
data.isnull().sum()
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram grid for the DataFrame's columns.
data.hist(figsize=(20, 15), bins=50)
plt.show()

from pandas.plotting import scatter_matrix

# Pairwise scatter plots for pay, wins, losses and rank.
attributes = ["TotalPay", "W", "L", "TrueRank"]
scatter_matrix(data[attributes], figsize=(12, 8))
plt.show()

import seaborn as sns

# Overall spread of total pay.
sns.boxplot(data=data, y="TotalPay")
plt.show()

# Total pay broken down by conference; tick labels rotated for readability.
sns.boxplot(data=data, x="Conf", y="TotalPay", color="gray")
plt.xticks(rotation="vertical")
plt.show()

# Wins against total pay, coloured by conference, legend placed outside the axes.
p = sns.scatterplot(data=data, x="W", y="TotalPay", hue="Conf")
p.legend(loc="center left", bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.show()
# Wins against total pay, coloured by conference, with every point annotated
# by its school name.
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
schools = data['School']
x_coords = data['W']
y_coords = data['TotalPay']
# zip walks the three columns row by row, so each label is paired with its
# own point whatever the DataFrame index looks like. Indexing a Series with
# an enumerate counter is a label lookup and only works on a 0..n-1 index.
for school, x, y in zip(schools, x_coords, y_coords):
    try:
        # Small offset so the text does not sit on top of the marker.
        plt.text(x + 0.5, y + 0.5, school, fontsize=9)
    except TypeError:
        # Labels are best-effort: a coordinate that is not numeric cannot be
        # offset, so that label is skipped. Any other error is not hidden.
        continue
plt.show()
def make_scatterplot(conf_data, conf):
    """Plot wins against total pay for one group of schools, labelling each point.

    Args:
        conf_data: DataFrame of the rows to plot; the ``W``, ``TotalPay`` and
            ``School`` columns are read.
        conf: Value used as the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    sns.scatterplot(x="W", y="TotalPay", data=conf_data)
    # No legend call here: the plot has no `hue`, so there are no labelled
    # artists for a legend to list.
    labelled_points = zip(
        conf_data['School'].values,
        conf_data['W'].values,
        conf_data['TotalPay'].values,
    )
    for school, x, y in labelled_points:
        try:
            # Small offset so the text does not sit on top of the marker.
            plt.text(x + 0.5, y + 0.5, school, fontsize=9)
        except TypeError:
            # Labels are best-effort: a coordinate that is not numeric cannot
            # be offset, so that label is skipped. Other errors propagate.
            continue
    plt.title(conf)
    plt.show()
# universal = movies_original_df[movies_original_df.universal == True]
data.Conf.value_counts()
# One labelled scatter plot per conference. Sorting the distinct values gives
# the same plot order on every run (a set of strings has no stable order),
# and dropping NaN avoids an empty plot, since `== NaN` never matches a row.
for conf in sorted(data.Conf.dropna().unique()):
    make_scatterplot(data[data.Conf == conf], conf)
# `big_ten` has to be defined before it can be displayed below.
big_ten = data[data.Conf == 'Big Ten']
# make_scatterplot(big_ten, 'Big Ten')
big_ten
# Pairwise relationships between rank, pay and win/loss figures,
# coloured by conference.
columns = "TrueRank TotalPay W L Ratio Conf".split()
df1 = pd.DataFrame(data, columns=columns)
sns.pairplot(data=df1, hue="Conf", height=2.5)
# Fit one OLS model per "lever": school_pay explained by conference plus
# that lever, and print one fitted coefficient from each model.
# NOTE(review): `smf` and `train_set` are not defined in the visible part of
# this script - confirm they are set up before this point.
levers = ['conf', 'bonus', 'buyout', 'rank', 'W', 'L', 'ratio']
for lever in levers:
    my_model = f'school_pay ~ conf + {lever}'
    train_model_fit = smf.ols(my_model, data=train_set).fit()
    # Take the second fitted parameter by position with `.iloc`; a plain
    # `[1]` on a label-indexed Series relies on a deprecated fallback.
    # NOTE(review): assumes `params` is a pandas Series (i.e. `train_set` is
    # a DataFrame). The printed value is a model coefficient rather than a
    # predicted salary - confirm it is the figure intended for this message.
    print('\nEstimated Salary given Conference and ', lever, ' : ',
          round(train_model_fit.params.iloc[1], 0))