01-23-20
import pandas as pd

# Load the coaches' salary data and preview it.
data = pd.read_csv("Coaches9.csv")
data.head()

# Columns from index 3 onward hold money values stored as strings
# (e.g. "$1,200,000", with "--" as the missing marker).
# data.columns[3:] already is the column list — no need to index the
# frame and read .columns back off it.
attrs = list(data.columns[3:])
# Strip "$" and "," (raw string so \$ is a literal regex escape, not an
# invalid Python string escape), map "--" to NaN, then cast to float.
data[attrs] = data[attrs].replace({r'\$': '', ',': '', '--': None}, regex=True).astype('float')
data.dtypes
data.isnull().sum()
There are four standard options for handling missing values: (1) drop the affected rows, (2) drop the affected columns, (3) fill with a summary statistic, (4) fill with a value derived from related rows.

For V1, we are going to: drop rows missing SchoolPay (the target), drop the BonusPaid column entirely, and fill Bonus & Buyout with the attribute median (see the code below).

*For V2, we are going to use #4 and a more advanced version of #3* — taking the within-conference mean instead of a global statistic.
# V1 cleaning: rows without the target (SchoolPay) are unusable, and
# BonusPaid is too sparse to keep as a feature.
data.dropna(subset=['SchoolPay'], inplace=True)
data.drop('BonusPaid', axis=1, inplace=True)

# Impute Bonus and Buyout with their column medians.  Assigning the
# result back replaces the chained `Series.fillna(..., inplace=True)`
# pattern, which is deprecated and becomes a silent no-op under pandas
# copy-on-write.
median_bonus = data['Bonus'].median()
median_buyout = data['Buyout'].median()
data["Bonus"] = data["Bonus"].fillna(median_bonus)
data["Buyout"] = data["Buyout"].fillna(median_buyout)
data.isnull().sum()
# NOTE: IPython/Jupyter magic — valid only inside a notebook, not in a
# plain Python script.
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of every numeric column to eyeball
# correlations and degenerate features before modeling.
scatter_matrix(data, figsize=(12,8))
plt.show()
From these two visualizations, it's clear we need to: keep only one of SchoolPay or TotalPay (they are almost perfectly correlated, so the other is redundant), and drop AssistantPay (it carries almost no information).
# TotalPay duplicates SchoolPay and AssistantPay is nearly constant,
# so remove both, then re-plot and preview the slimmed-down frame.
data.drop(columns=['TotalPay', 'AssistantPay'], inplace=True)
scatter_matrix(data, figsize=(12, 8))
plt.show()
data.head()
We've lost a lot of data at this point and we need to add more. To do so we join in win/loss records keyed on a normalized school name. The corresponding key construction for the win/loss side occurred in an attached auxiliary file.
# Build an 8-character join key from the school name: lower-case,
# spaces removed, truncated.  The vectorized .str chain replaces the
# row-wise apply/lambda (''.join(s.split(' ')) is equivalent to
# s.replace(' ', '')) and also tolerates NaN, which the lambda did not.
data['school_key'] = data['School'].str.lower().str.replace(' ', '', regex=False).str[:8]
data

# Join in the win/loss records keyed the same way.  merge() returns a
# new frame and never mutates its inputs, so the defensive copy of
# `data` the original made before merging was unnecessary.
winloss = pd.read_csv('winloss_withkey.csv')
data_wl = data.merge(winloss, left_on="school_key", right_on="team_key")
data_wl

# Drop the join keys plus the redundant Team column, and give the
# remaining columns short snake_case-ish names for modeling.
data_for_m = data_wl.drop(['school_key', 'Team', 'team_key'], axis=1)
columns = ['school', 'conf', 'coach', 'school_pay', 'bonus', 'buyout', 'rank', 'W', 'L', 'T', 'ratio']
data_for_m.columns = columns
data_for_m
scatter_matrix(data_for_m, figsize=(12,8))
plt.show()
# Drop ties — presumably uninformative per the scatter matrix above
# (TODO confirm against the plotted data) — and make this the working
# dataset.
data_for_m.drop('T', axis=1, inplace=True)
data = data_for_m.copy()
from sklearn.model_selection import train_test_split

# Hold out 20% as a test set, seeded for reproducibility, then peel the
# target column (school_pay) off the training predictors.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
data_labels = train_set['school_pay'].copy()
data = train_set.drop('school_pay', axis=1)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric attributes are everything except the three categorical ones.
data_num = data.drop(['school', 'conf', 'coach'], axis=1)
num_attribs = list(data_num)
cat_attribs = ['school', 'conf', 'coach']

# Numeric branch: median-impute any remaining gaps, then standardize so
# scale-sensitive models treat the features evenly.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode.  handle_unknown='ignore' keeps
# transform() from raising on categories that appear only in the
# held-out test set — coach is effectively unique per row, so unseen
# categories are guaranteed there.  (Fit-time output on the training
# set is unchanged.)
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])
data_prepared = full_pipeline.fit_transform(data)
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares on the prepared feature matrix.
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
def display_scores(scores):
    """Print a cross-validation score array with its mean and standard deviation.

    `scores` is expected to be a NumPy array (uses .mean() / .std()).
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    # Fixed typo in the printed label: "Stardard" -> "Standard".
    print("Standard Deviation:", scores.std())
# 10-fold cross-validated RMSE for the linear baseline.  sklearn
# reports negated MSE (higher is better), so negate before the sqrt.
neg_mse = cross_val_score(lin_reg, data_prepared, data_labels,
                          scoring="neg_mean_squared_error", cv=10)
lin_reg_rmse_scores = np.sqrt(-neg_mse)
display_scores(lin_reg_rmse_scores)
from sklearn.tree import DecisionTreeRegressor

# Decision tree model.  Seeded with random_state=42 for reproducible
# results, consistent with the seeded train_test_split above — the
# original unseeded estimator gave different CV scores on every run.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(data_prepared, data_labels)
scores = cross_val_score(tree_reg, data_prepared, data_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)
display_scores(tree_reg_rmse_scores)
from sklearn.ensemble import RandomForestRegressor

# Random forest model.  Seeded with random_state=42 for reproducible
# results, consistent with the seeded train_test_split above — the
# original unseeded ensemble gave different CV scores on every run.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(data_prepared, data_labels)
scores = cross_val_score(forest_reg, data_prepared, data_labels,
                         scoring="neg_mean_squared_error", cv=10)
forest_reg_rmse_scores = np.sqrt(-scores)
display_scores(forest_reg_rmse_scores)