import pandas as pd
data = pd.read_csv('data_for_m.csv')
data_initial = data.copy()
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
# Preprocessing applied to numeric columns: fill missing values with the
# column median, then standardize with StandardScaler.
# NOTE: a custom CombinedAttributesAdder step was once sketched between the
# two steps below but is disabled.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)
# (We are simply using OneHotEncoder for the categorical columns here.)
def get_full_pipeline(data, num_attribs, cat_attribs):
    """Fit the combined preprocessing step on ``data`` and return the result.

    The columns named in ``num_attribs`` go through the module-level
    ``num_pipeline``; the columns named in ``cat_attribs`` are one-hot
    encoded with ``OneHotEncoder``.

    Args:
        data: Table holding at least the columns named in ``num_attribs``
            and ``cat_attribs``.
        num_attribs: Names of the columns routed to ``num_pipeline``.
        cat_attribs: Names of the columns routed to ``OneHotEncoder``.

    Returns:
        The output of ``ColumnTransformer.fit_transform`` for ``data``.

    Note:
        The fitted transformer is not returned, so every call fits a new
        one on whatever ``data`` it receives.
    """
    transformers = [
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
    preprocessor = ColumnTransformer(transformers)
    return preprocessor.fit_transform(data)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Separate the regression target ('school_pay') from the features.
data_train_labels = train_set['school_pay'].copy()
data_train = train_set.drop(columns='school_pay')

# Every remaining column except 'school', 'conf' and 'coach' is handed to the
# numeric pipeline (those columns are presumably all numeric; verify against
# the CSV).
data_num = data_train.drop(columns=['school', 'conf', 'coach'])
num_attribs = data_num.columns.tolist()

# Only 'conf' is one-hot encoded; 'school' and 'coach' appear in neither list.
# Alternative that was tried: ['school', 'conf', 'coach'].
cat_attribs = ['conf']

data_prepared = get_full_pipeline(data_train, num_attribs, cat_attribs)
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model fitted on the prepared training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lin_reg = LinearRegression().fit(data_prepared, data_train_labels)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (e.g. the NumPy
            array of per-fold scores produced by cross-validation).

    Returns:
        None. The three summary lines are written to standard output.
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    # Label previously read "Stardard Deviation" (typo).
    print("Standard Deviation:", scores.std())
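# Test Set (NOTE: the scores below come from 10-fold cross-validation on the training split, not from test_set)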
# 10-fold cross-validation of the linear model on the prepared training data.
# The scorer yields negated MSE values, so the sign is flipped before taking
# the square root to obtain one RMSE per fold.
# NOTE(review): data_prepared was imputed/scaled on the full training split
# before the folds are cut, so each fold's preprocessing has seen its own
# validation rows; confirm this is acceptable.
scores = cross_val_score(
    lin_reg,
    data_prepared,
    data_train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_reg_rmse_scores = np.sqrt(-scores)
display_scores(lin_reg_rmse_scores)
# Whole Set: refit the model on every row and predict for every row
# Refit the model on every row of the originally loaded data and attach the
# resulting (in-sample) predictions to a copy of that data.
data_whole = data_initial.copy()
data_whole_y = data_whole['school_pay']

# Remove the target together with 'school' and 'coach' before preprocessing,
# so the frame handed to the transformer cannot expose the target as a feature.
data_whole_X = data_whole.drop(['school_pay', 'school', 'coach'], axis=1)
data_whole_prepared = get_full_pipeline(data_whole_X, num_attribs, cat_attribs)
lin_reg.fit(data_whole_prepared, data_whole_y)

post_prediction = data_initial.copy()
post_prediction['prediction'] = lin_reg.predict(data_whole_prepared)
# Text copy of each prediction, formatted to one decimal place.
post_prediction['prediction_clear'] = post_prediction['prediction'].map(
    lambda value: '%.1f' % value
)
# Bare expression: shows the frame when run as a notebook cell; it has no
# effect when the file is executed as a script.
post_prediction