IST 718 | LAB 3 (V2) MODELS

In [37]:
import pandas as pd
# Read the modelling table once; `data_initial` stays untouched so later cells
# can copy from it, while `data` is the working frame.
data_initial = pd.read_csv('data_for_m.csv')
data = data_initial.copy()

PIPELINE

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

Create pipeline for numeric variables

In [39]:
# Numeric preprocessing: median-impute missing values, then apply StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

Create pipeline for categorical variables

(We are simply using OneHotEncoder here)

Create Full Pipeline

In [40]:
def get_full_pipeline(data, num_attribs, cat_attribs, return_pipeline=False):
    """Fit the preprocessing transformer on ``data`` and return the prepared matrix.

    Columns in ``num_attribs`` are passed through the module-level
    ``num_pipeline``; columns in ``cat_attribs`` are one-hot encoded. Columns
    of ``data`` that appear in neither list are not part of the output
    (ColumnTransformer's default ``remainder='drop'``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named in both lists.
    num_attribs : list of str
        Names of the numeric feature columns.
    cat_attribs : list of str
        Names of the categorical feature columns.
    return_pipeline : bool, default False
        When True, also return the fitted ColumnTransformer so that the same
        fitted preprocessing can be applied to other rows with ``.transform``.

    Returns
    -------
    data_prepared
        Output of ``fit_transform`` on ``data``.
    (data_prepared, full_pipeline)
        Only when ``return_pipeline`` is True.
    """
    full_pipeline = ColumnTransformer([
        ('num', num_pipeline, num_attribs),
        # 'ignore' does not change the fit_transform result; it only lets the
        # returned transformer encode categories it did not see during fit.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
    ])
    data_prepared = full_pipeline.fit_transform(data)
    if return_pipeline:
        return data_prepared, full_pipeline
    return data_prepared
In [41]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Split the training rows into the target series and the remaining columns.
data_train_labels = train_set['school_pay'].copy()
data_train = train_set.drop(columns='school_pay')
In [42]:
# Every training column other than school, conf and coach is a numeric feature.
data_num = data_train.drop(columns=['school', 'conf', 'coach'])
num_attribs = data_num.columns.tolist()
# Only the conference is one-hot encoded; school and coach are in neither list.
cat_attribs = ['conf']

data_prepared = get_full_pipeline(data_train, num_attribs, cat_attribs)
In [43]:
from sklearn.linear_model import LinearRegression
# Linear-regression baseline fitted on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=data_prepared, y=data_train_labels)
Out[43]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [44]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def display_scores(scores):
    """Print a collection of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of numbers
        For example the array returned by ``cross_val_score`` or a plain list.
        The input is converted with ``np.asarray`` so that ``.mean()`` and
        ``.std()`` are available even for lists and tuples.
    """
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())

Cross-validated performance on the training set (10-fold RMSE; the held-out test set is not used here)

In [45]:
# Cross-validate preprocessing and regression together on the raw training
# frame. Passing the already-prepared matrix would let the imputer, scaler and
# encoder see every validation fold before scoring (data leakage); wrapping
# them in one Pipeline makes each fold fit its own preprocessing.
cv_model = Pipeline([
    ('prep', ColumnTransformer([
        ('num', num_pipeline, num_attribs),
        # A fold's training part may lack a category that appears in its
        # validation part; 'ignore' encodes such rows as all zeros.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
    ])),
    ('lin_reg', LinearRegression()),
])
scores = cross_val_score(cv_model, data_train, data_train_labels,
                         scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE, so flip the sign before taking the root.
lin_reg_rmse_scores = np.sqrt(-scores)
display_scores(lin_reg_rmse_scores)
Scores: [ 627138.42590247  528461.92031606  740615.40460434  772653.8213664
  421934.01610918  189714.87363032  913332.69201488 1110949.79108439
  559555.59486329 1070420.75902697]
Mean: 693477.7298918306
Stardard Deviation: 273911.5276976352

Fit on the whole data set and generate in-sample predictions

In [46]:
# Start from a fresh copy of the table as it was loaded.
data_whole = data_initial.copy()
# Target series, and the feature frame with school_pay, school and coach removed.
data_whole_y = data_whole['school_pay']
data_whole_X = data_whole.drop(columns=['school_pay', 'school', 'coach'])
In [50]:
# Re-fit the preprocessing on every row. The feature frame without the target
# (data_whole_X) is passed so school_pay can never reach the model inputs,
# even if the attribute lists change later.
data_whole_prepared = get_full_pipeline(data_whole_X, num_attribs, cat_attribs)
# Re-fit the same estimator object on the full data set.
lin_reg.fit(data_whole_prepared, data_whole_y)
Out[50]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [51]:
# New frame: the originally loaded table plus a column of model predictions
# for the same rows the model was just fitted on.
post_prediction = data_initial.assign(
    prediction=lin_reg.predict(data_whole_prepared)
)
In [53]:
# Human-readable predictions: fixed-point strings with one decimal place.
# Mapping over the single column avoids building a row object per record and
# still yields a Series when the frame is empty (row-wise apply does not).
post_prediction['prediction_clear'] = post_prediction['prediction'].map('{:.1f}'.format)
In [54]:
# Bare last expression: renders the frame as this cell's output.
post_prediction
Out[54]:
school conf coach school_pay bonus buyout rank W L ratio prediction prediction_clear
0 Air Force Mt. West Troy Calhoun 885000.0 247000.0 4000000.0 10 11 2 0.846 1.609983e+06 1609982.5
1 Akron MAC Terry Bowden 411000.0 225000.0 688500.0 130 0 12 0.000 -1.853449e+05 -185344.9
2 Alabama SEC Nick Saban 8307000.0 1100000.0 33600000.0 10 11 2 0.846 6.532092e+06 6532092.3
3 Appalachian State Sun Belt Scott Satterfield 712500.0 295000.0 2160417.0 3 13 1 0.929 1.166417e+06 1166416.5
4 Arizona Pac-12 Kevin Sumlin 1600000.0 2025000.0 10000000.0 95 4 8 0.333 2.630493e+06 2630493.2
... ... ... ... ... ... ... ... ... ... ... ... ...
125 West Virginia Big 12 Dana Holgorsen 3605000.0 1000000.0 7150000.0 81 5 7 0.417 3.507403e+06 3507402.9
126 Western Kentucky C-USA Mike Sanford Jr. 800000.0 400000.0 1200000.0 28 9 4 0.692 8.082516e+05 808251.6
127 Western Michigan MAC Tim Lester 800000.0 346500.0 800000.0 57 7 6 0.538 6.619932e+05 661993.2
128 Wisconsin Big Ten Paul Chryst 3750000.0 775000.0 6000000.0 27 10 4 0.714 3.893803e+06 3893803.1
129 Wyoming Mt. West Craig Bohl 1412000.0 450000.0 8016667.0 34 8 5 0.615 1.554951e+06 1554951.1

130 rows × 12 columns

In [ ]: