import pandas as pd
# Load the coaches spreadsheet into a DataFrame.
data = pd.read_excel("coaches_modify.xlsx")
# Notebook-style inspection: column dtypes as loaded.
data.dtypes
# Convert the School, Conf and Coach columns to pandas' categorical dtype.
attrs = "School Conf Coach"
for attr in attrs.split():
    data[attr] = data[attr].astype("category")
# Inspect the dtypes again after the conversion.
data.dtypes
# Missing-value overview: first whether any cell is null, then the
# per-column null counts.
data.isnull().values.any()
data.isnull().sum()
# Rows with no `SchoolPay` are removed: that is the value we are trying to
# predict, so such rows cannot be used.
data.dropna(subset=["SchoolPay"], inplace=True)
# `BonusPaid` and `Bonus` are removed as well because they are too close to
# what we are trying to predict.
for bonus_column in ("BonusPaid", "Bonus"):
    data.drop(columns=bonus_column, inplace=True)
# Re-check the per-column null counts and the number of remaining rows.
data.isna().sum()
data.shape[0]
# Fill the remaining missing values with each column's median.
# The medians are kept for later, as a list holding one single-key
# {column: median} dict per column, in the order the columns are listed.
medians = []
fill_with_median = "W, L, Ratio, OffenceScore, Defense Score, Score, PointsPerGame"
for tofill in fill_with_median.split(', '):
    median = data[tofill].median()
    medians.append({tofill: median})
    # Assign the filled column back instead of calling
    # `data[tofill].fillna(..., inplace=True)`: the in-place call acts on
    # the selected column object, which pandas warns about and which does
    # not update `data` when Copy-on-Write is enabled.
    data[tofill] = data[tofill].fillna(median)
# Show the stored medians, then confirm the null counts after filling.
medians
data.isnull().sum()
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram overview of the DataFrame: 50 bins per plot on a 20x15 figure.
data.hist(figsize=(20, 15), bins=50)
plt.show()
from pandas.plotting import scatter_matrix
# Pairwise scatter plots restricted to the four columns listed below.
attributes = ["TotalPay", "W", "L", "TrueRank"]
scatter_matrix(data.loc[:, attributes], figsize=(12, 8))
plt.show()
import seaborn as sns
# Single box plot of `TotalPay` over all rows.
sns.boxplot(data=data, y="TotalPay")
plt.show()
# One gray box of `TotalPay` per `Conf` value; the x tick labels are drawn
# vertically.
sns.boxplot(data=data, x="Conf", y="TotalPay", color="gray")
plt.xticks(rotation="vertical")
plt.show()
# Scatter plot of `TotalPay` against `W`, with points coloured by `Conf`.
# The plot reads from `data`, the DataFrame this script loads and cleans;
# no other DataFrame name is defined here.
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
# Anchor the legend's centre-left at x=1.25 in axes coordinates, i.e. to
# the right of the plotting area, in a single column.
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.show()