import pandas as pd

# Load the 2019 coach-salary data (commented lines are the older
# Coaches9 variant of the same workflow, kept for reference).
# data = pd.read_csv("Coaches9.csv")
data = pd.read_csv("coaches2019.csv")
data.head()
data.dtypes

# The leading columns are categorical identifiers.
# attrs = "School Conference Coach".split() # Coaches9
attrs = "SCHOOL CONF COACH".split()
data[attrs] = data[attrs].astype('category')
data.dtypes

# Everything from column 4 onward is a money column: strip '$' and
# thousands separators, map the '--' placeholder to 0, cast to float.
# r'\$' is a raw string — '\$' is an invalid Python string escape
# (SyntaxWarning on modern Python); the regex needs a literal backslash.
list(data[data.columns[4:]].columns)
# attrs = "SchoolPay TotalPay Bonus BonusPaid AssistantPay Buyout".split() # Coaches9
attrs = list(data[data.columns[4:]].columns)
data[attrs] = data[attrs].replace({r'\$': '', ',': '', '--': 0}, regex=True).astype('float')
data.dtypes

# Quick sanity checks: missing values per column and row count.
data.isnull().sum()
len(data)
# data.drop('AssistantPay', axis=1, inplace=True) # Coaches9
# data.drop('AssistantPay', axis=1, inplace=True)
# Notebook magic: render figures inline.  This file is an exported
# notebook — the %-line is not valid plain Python.
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram every numeric column to eyeball distributions and outliers.
data.hist(bins = 50, figsize=(20,15))
plt.show()
# data.drop('SchoolPay', axis=1, inplace=True) # Coaches9
# Drop the rank column and TOTAL PAY; SCHOOL PAY is kept as the target.
data.drop(['RK','TOTAL PAY'], axis=1, inplace=True)

# scatter_matrix lives in pandas.plotting — without this import the
# call below raised NameError.
from pandas.plotting import scatter_matrix

scatter_matrix(data, figsize=(12,8))
plt.show()

# Persist the cleaned 2019 table.
# data.to_csv('coaches9_clean.csv') #Coaches9
data.to_csv('coaches2019_clean.csv')
# Older Coaches9 variant selected the target before renaming:
# data = train_set.drop('TotalPay', axis=1) #Coaches9
# data_labels = train_set['TotalPay'].copy() #Coaches9
# data = train_set.drop('SCHOOL PAY', axis=1)
# data_labels = train_set['SCHOOL PAY'].copy()
# Normalize the raw headers to the snake_case names used by the
# modeling steps below.
snake_case_names = [
    'school', 'conf', 'coach',
    'school_pay', 'max_bonus', 'bonus_paid', 'asst_pay', 'buyout',
]
data.columns = snake_case_names
data
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for final evaluation; fixed seed so the
# split is reproducible.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
# data = train_set.drop('school_pay', axis=1)
# NOTE(review): `data` is rebound here — from the full cleaned table to
# the training *features* only; `data_labels` holds the target column.
data = train_set.drop('school_pay', axis=1)
data_labels = train_set['school_pay'].copy()
data.head()
from sklearn.compose import ColumnTransformer

# Split features into numeric and categorical groups.
# data_num = data.drop(['School', 'Conference', 'Coach'],axis =1) # Coaches9
data_num = data.drop(['school', 'conf', 'coach'],axis =1)
num_attribs = list(data_num)
# cat_attribs = ['School','Conference','Coach'] # Coaches9
cat_attribs = ['school','conf','coach']

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric branch: impute missing values with the column median, then
# standardize to zero mean / unit variance.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # ('attribs_addr', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# handle_unknown='ignore' keeps transform() from raising on categories
# not seen during fit.  Each coach appears once in the data, so the
# held-out test set is guaranteed to contain unseen 'coach' values and
# the default (error) behavior would crash at transform time.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

# data_prepared = full_pipeline.fit_transform(data)
# data_prepared
data_prepared = full_pipeline.fit_transform(data)
from sklearn.linear_model import LinearRegression
# Fit a baseline ordinary-least-squares model on the prepared matrix.
lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)
data_labels
# Sanity check: predictions vs. labels for the first ten rows.  These
# rows come from the *training* set, so this is not an unbiased
# estimate of generalization — just a smoke test of the pipeline.
some_data = data.iloc[:10]
some_labels = data_labels.iloc[:10]
some_data_prepared = full_pipeline.transform(some_data)
print("PREDICTIONS:", lin_reg.predict(some_data_prepared))
print('LABELS:', list(some_labels))
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
def display_scores(scores):
    """Print a cross-validation score array with its mean and standard deviation.

    Parameters
    ----------
    scores : numpy.ndarray
        Per-fold scores (anything exposing .mean() and .std()).
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    # Fixed label typo: was "Stardard Deviation".
    print("Standard Deviation:", scores.std())
# 10-fold cross-validation of the linear model.  sklearn reports
# *negative* MSE to fit its "higher is better" scoring convention, so
# negate before taking the square root to get RMSE per fold.
scores = cross_val_score(lin_reg, data_prepared, data_labels, scoring="neg_mean_squared_error", cv=10)
# Renamed from forest_rmse_scores: these are linear-regression scores,
# not random-forest scores.
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)

# R^2 under the same 10-fold CV.
scores = cross_val_score(lin_reg, data_prepared, data_labels, scoring="r2", cv=10)
scores

# R^2 on the ten sanity-check rows (training rows — optimistic).
from sklearn.metrics import r2_score
y_true = list(some_labels)
y_pred = lin_reg.predict(some_data_prepared)
r2_score(y_true, y_pred)
import pandas as pd

# Scrape the historical FBS champions table from ncaa.com and persist
# it locally for later joining with the salary data.
html_tables = pd.read_html('https://www.ncaa.com/history/football/fbs')
champions_df = pd.DataFrame(html_tables[0])
champions_df.to_csv('coaches_champions.csv', index=False)
# url = 'https://stats.ncaa.org/team/365.0/14900'
# test = pd.read_html(url)
# import requests
# r = requests.get(url)
# newhtml = r.url
# r.url
# test2 = pd.read_html('https://stats.ncaa.org/team/365.0/14900')
# def get_html(num):
# url = "https://stats.ncaa.org/teams/478037"
# html = urllib.request.urlopen(url).read()
# soup = BeautifulSoup(html, 'html.parser')
# print(soup)
# # text = soup.findAll("div", {"class": "imdb-user-review"})
# get_html(5)
import re
import urllib
from bs4 import BeautifulSoup
import requests
# url = 'https://stats.ncaa.org/teams/478037'
def get_ncaa_data(url, timeout=30):
    """Fetch an NCAA stats page and return every HTML table found on it.

    A browser User-Agent header is sent with the request (the default
    python-requests agent was not used by the original either).

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float, optional
        Seconds to wait for the response; without a timeout,
        requests.get can block forever.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per <table> element, as parsed by pandas.read_html.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    result = requests.get(url, headers=headers, timeout=timeout)
    # The original also built a BeautifulSoup parse of the page but
    # never used it; read_html does its own parsing, so it was dropped.
    return pd.read_html(result.content)
# Fetch one team's 2019 stats page and split out its tables.
all_tables = get_ncaa_data('https://stats.ncaa.org/team/365.0/14900')
all_tables_as_df = [pd.DataFrame(table) for table in all_tables]
# NOTE(review): the indices below assume a fixed table order on the
# page (0 = schedule/history, 2 = team stats, 3 = player stats) —
# confirm against the live page; this breaks silently if the site
# layout changes.
history_2019 = all_tables_as_df[0].dropna()
teamstats_2019 = all_tables_as_df[2]
playerstats_2019 = all_tables_as_df[3]
# Scrape every conference's standings table and normalize the headers.
cleaned_tables = []
all_tables = pd.read_html('https://www.ncaa.com/standings/football/fbs/all-conferences')
newdf = pd.DataFrame()
# Target header — hoisted out of the loop since it never changes.
# The duplicated W/L pairs are presumably conference vs. overall
# records; confirm against the site.
cols = "SCHOOL W L W L PF PA HOME AWAY STREAK".split()
for table in all_tables:
    try:
        table.columns = cols
        cleaned_tables.append(table)
    except ValueError:
        # Column-count mismatch: this table isn't a standings table.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; assigning a wrong-length header raises
        # ValueError, so that is all we need to catch.)
        print('pass')
len(cleaned_tables)
tables_as_dfs = [pd.DataFrame(table) for table in cleaned_tables]
all_tables_as_dfs = pd.concat(tables_as_dfs)
# Drop repeated header rows that read_html captured as data rows.
all_tables_as_dfs = all_tables_as_dfs[all_tables_as_dfs.STREAK != 'STREAK']
all_tables_as_dfs.to_csv('coaches_2019_winloss_clean.csv', index=False)
all_tables_as_dfs[:20]