import pandas as pd

# Load the 2019 coaches salary table. An earlier variant of this analysis read
# "Coaches9.csv", which uses different column names; its equivalents are kept
# below as "Coaches9:" notes.
# Coaches9: data = pd.read_csv("Coaches9.csv")
data = pd.read_csv("coaches2019.csv")
data.head()
data.dtypes

# Store the identifier columns as pandas categoricals.
# Coaches9: attrs = "School Conference Coach".split()
attrs = "SCHOOL CONF COACH".split()
data[attrs] = data[attrs].astype('category')
data.dtypes

# Every column from the fifth one onward is cleaned and converted to float.
# Coaches9: attrs = "SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout".split()
attrs = list(data.columns[4:])
attrs

# Strip "$" and "," and map the "--" placeholder to 0 before the float cast.
# The "$" pattern is a raw string: in a normal string literal "\$" is an
# invalid escape sequence and triggers a warning on current Python versions.
data[attrs] = data[attrs].replace({r'\$': '', ',': '', '--': 0}, regex=True).astype('float')
data.dtypes

# Sanity checks: missing values per column and total row count.
data.isnull().sum()
len(data)
# Coaches9: data.drop('AssistantPay', axis=1, inplace=True)
# `%matplotlib inline` is IPython magic, not Python syntax, so it breaks the
# file when it is run as a plain script. Request the inline backend through
# the IPython API when a shell is present and skip it otherwise.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # not running under IPython; plt.show() uses the default backend

import matplotlib.pyplot as plt

# One histogram per numeric column.
data.hist(bins=50, figsize=(20, 15))
plt.show()

from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the columns.
scatter_matrix(data, figsize=(12, 8))
plt.show()
# Remove the 'RK' and 'TOTAL PAY' columns in place, then redraw the scatter
# matrix on what is left.
# Coaches9: data.drop('SchoolPay', axis=1, inplace=True)
data.drop(columns=['RK', 'TOTAL PAY'], inplace=True)
scatter_matrix(data, figsize=(12, 8))
plt.show()

# Write the cleaned table to disk.
# Coaches9: data.to_csv('coaches9_clean.csv')
data.to_csv('coaches2019_clean.csv')
# Coaches9 used 'TotalPay' as the target instead:
#   data = train_set.drop('TotalPay', axis=1)
#   data_labels = train_set['TotalPay'].copy()

# Rename the eight columns by position to short snake_case names.
data.columns = "school conf coach school_pay max_bonus bonus_paid asst_pay buyout".split()
data

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; random_state pins the shuffle.
train_set, test_set = train_test_split(data, random_state=42, test_size=0.2)

# From here on `data` holds the training features and `data_labels` the
# training target ('school_pay').
data_labels = train_set['school_pay'].copy()
data = train_set.drop(columns='school_pay')
data.head()
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Split the feature columns into categorical and numeric groups.
# Coaches9: cat_attribs = ['School', 'Conference', 'Coach']
cat_attribs = ['school', 'conf', 'coach']
data_num = data.drop(cat_attribs, axis=1)
num_attribs = list(data_num)

# Numeric columns: fill missing values with the median, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical columns are one-hot encoded. handle_unknown="ignore" encodes a
# category that was not present at fit time as all zeros; with the default
# ("error") transforming rows outside the training split (e.g. test_set)
# raises as soon as an unseen school or coach appears.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

# Fit the preprocessing on the training features and transform them.
data_prepared = full_pipeline.fit_transform(data)
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
data_labels

# Spot-check: compare predictions and labels for the first ten training rows.
some_data = data.head(10)
some_labels = data_labels.head(10)
some_data_prepared = full_pipeline.transform(some_data)
print("PREDICTIONS:", lin_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, such as the
            NumPy array passed in by the cross-validation code in this file.
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())
# 10-fold cross-validation of lin_reg, reported as RMSE. The scorer returns
# negated MSE, hence the sign flip before the square root.
# NOTE: the result name says "forest", but the estimator scored here is lin_reg.
scores = cross_val_score(
    lin_reg,
    data_prepared,
    data_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)

# Repeat the cross-validation with R^2 as the metric.
scores = cross_val_score(lin_reg, data_prepared, data_labels, cv=10, scoring="r2")
scores

from sklearn.metrics import r2_score

# R^2 of the model's predictions for the spot-check rows in some_labels.
y_pred = lin_reg.predict(some_data_prepared)
y_true = list(some_labels)
r2_score(y_true, y_pred)