import pandas as pd
# Load the coaches spreadsheet into a DataFrame and look at the inferred dtypes.
data = pd.read_excel("coaches_modify.xlsx")
data.dtypes

# Convert the School, Conf and Coach columns to the pandas category dtype.
attrs = "School Conf Coach"
data = data.astype({column: "category" for column in attrs.split()})
data.dtypes

# Missing-value overview: a single any-missing flag, then a per-column count.
data.isnull().values.any()
data.isnull().sum()
# Drop schools without the values we are trying to predict
# or the attributes needed to predict them.
# A row is removed when any one of these columns is missing.
required_columns = ['SchoolPay', 'W', 'L', 'Ratio']
data.dropna(subset=required_columns, inplace=True)

# Re-check the remaining gaps and how many rows survived.
data.isnull().sum()
len(data)
# Eventually we might need to drop columns/ deal with this missing data
# Attempt 1:
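# Drop the variables `BonusPaid` and `Bonus` as they are too close to the pay
# values we are trying to predict (NOTE: the original sentence was cut off here; confirm the intent).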
# data.drop('BonusPaid', axis =1, inplace=True)
# data.drop('Bonus', axis =1, inplace=True)
# Fill the remaining missing values with medians.
# Keep the medians for later: `medians` collects one {column: median} dict
# per filled column, in the order the columns are listed below.
medians = []
fill_with_median = "W, L, Ratio, OffenceScore, Defense Score, Score, PointsPerGame"
for tofill in fill_with_median.split(', '):
    median = data[tofill].median()
    medians.append({tofill: median})
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on data[tofill] is a chained in-place operation:
    # it raises a FutureWarning on pandas 2.x and leaves `data` unchanged
    # once Copy-on-Write is enabled.
    data[tofill] = data[tofill].fillna(median)
# medians
data.isnull().sum()
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the DataFrame's columns, 50 bins each.
data.hist(figsize=(20, 15), bins=50)
plt.show()

from pandas.plotting import scatter_matrix

# Pairwise scatter plots of pay, wins, losses and rank.
attributes = ["TotalPay", 'W', 'L', 'TrueRank']
pairwise_view = data[attributes]
scatter_matrix(pairwise_view, figsize=(12, 8))
plt.show()

import seaborn as sns

# Distribution of TotalPay over the whole data set.
sns.boxplot(data=data, y="TotalPay")
plt.show()

# Distribution of TotalPay per conference; tick labels are rotated so the
# conference names do not overlap.
sns.boxplot(data=data, x="Conf", y="TotalPay", color="gray")
plt.xticks(rotation="vertical")
plt.show()

# Wins against TotalPay, coloured by conference, with the legend moved
# outside the axes on the right.
p = sns.scatterplot(data=data, x="W", y="TotalPay", hue="Conf")
p.legend(ncol=1, loc='center left', bbox_to_anchor=(1.25, 0.5))
plt.show()
# Wins against TotalPay, coloured by conference, with every point annotated
# with its school name.
p = sns.scatterplot(x="W", y="TotalPay", hue="Conf", data=data)
p.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
schools = data['School']
x_coords = data['W']
y_coords = data['TotalPay']
# Walk the columns in lockstep rather than indexing with the enumerate
# counter: `x_coords[i]` is a label lookup, and once rows have been dropped
# the index labels no longer match positions, so a school could be drawn at
# another row's coordinates or silently skipped.
labelled = data.dropna(subset=['W', 'TotalPay'])  # a label needs both coordinates
for school, x, y in zip(labelled['School'], labelled['W'], labelled['TotalPay']):
    # Small offset so the text does not sit on top of the marker.
    plt.text(x + 0.5, y + 0.5, school, fontsize=9)
plt.show()
def make_scatterplot(conf_data, conf):
    """Plot wins against total pay and label every point with its school.

    Args:
        conf_data: DataFrame with the rows to plot; it must provide the
            'School', 'W' and 'TotalPay' columns.
        conf: Value shown as the plot title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    sns.scatterplot(x="W", y="TotalPay", data=conf_data)
    # No legend call here: without a `hue` there are no labelled artists, so
    # requesting a legend only produces a matplotlib warning.

    # Only rows with both coordinates can carry a label. Skipping them
    # explicitly replaces the former bare `except: pass`, which hid every
    # error raised while labelling.
    labelled = conf_data.dropna(subset=['W', 'TotalPay'])
    for school, x, y in zip(labelled['School'], labelled['W'], labelled['TotalPay']):
        # Small offset so the text does not sit on top of the marker.
        plt.text(x + 0.5, y + 0.5, school, fontsize=9)

    plt.title(conf)
    plt.show()
# universal = movies_original_df[movies_original_df.universal == True]
data.Conf.value_counts()

# One labelled scatter plot per conference. groupby gives a stable, sorted
# order and skips missing or unused conference values; iterating a set gave
# an order that could change between runs and could include NaN, which
# produced an empty plot.
for conf, conf_data in data.groupby('Conf', observed=True):
    make_scatterplot(conf_data, conf)

# Define big_ten before it is referenced: the assignment had been commented
# out, so the bare `big_ten` below raised NameError.
big_ten = data[data.Conf == 'Big Ten']
# make_scatterplot(big_ten, 'Big Ten')
big_ten

# Pair plot of the main numeric columns, coloured by conference.
columns = ['TrueRank', 'TotalPay', 'W', 'L', 'Ratio', 'Conf']
df1 = pd.DataFrame(data, columns=columns)
sns.pairplot(df1, hue="Conf", height=2.5)
data.head()