import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw training and test application tables; the relative paths are
# resolved against the current working directory.
app_train, app_test = (
    pd.read_csv(csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv')
)
def quick_preprocess(app_train, app_test, target_col):
    """One-hot encode both frames and restrict them to their shared columns.

    Args:
        app_train: Training DataFrame; must contain ``target_col``.
        app_test: Test DataFrame.
        target_col: Name of the label column in ``app_train``.

    Returns:
        A ``(train, test, labels)`` tuple: the encoded and column-aligned
        training frame with ``target_col`` re-attached, the encoded and
        column-aligned test frame, and the label Series taken from the
        original training frame.
    """
    # Grab the labels first: the inner alignment below keeps only columns
    # present in both frames, which drops the target when the test frame
    # does not carry it.
    labels = app_train[target_col]

    encoded_train = pd.get_dummies(app_train)
    encoded_test = pd.get_dummies(app_test)

    aligned_train, aligned_test = encoded_train.align(
        encoded_test, join='inner', axis=1
    )
    aligned_train[target_col] = labels

    for caption, frame in (
        ('Training Features shape: ', aligned_train),
        ('Testing Features shape: ', aligned_test),
    ):
        print(caption, frame.shape)

    return aligned_train, aligned_test, labels
###################################
## MODELS
###################################
## LOGISTIC REGRESSION
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression
def run_logistic_regression(app_train, app_test, target_col, id_col, train_labels,
                            C=0.0001, output_path='log_reg_baseline_v2.csv'):
    """Fit a logistic-regression baseline and write test predictions to CSV.

    Missing values are filled with the per-column median of the training
    data, features are min-max scaled to ``[0, 1]`` using the training data,
    and a ``LogisticRegression`` model is fitted on the result.

    Args:
        app_train: Training feature DataFrame. ``target_col`` is dropped
            from the features if present.
        app_test: Test DataFrame. Must contain ``id_col`` and every feature
            column of ``app_train``.
        target_col: Name of the label column; also used as the name of the
            prediction column in the output.
        id_col: Name of the identifier column copied into the output.
        train_labels: Labels passed to ``LogisticRegression.fit``.
        C: Value forwarded to ``LogisticRegression(C=...)``.
        output_path: Destination of the submission CSV.

    Returns:
        The submission DataFrame (``id_col`` plus ``target_col`` holding
        column 1 of ``predict_proba``) that was written to ``output_path``.

    Raises:
        KeyError: If ``id_col`` is not a column of ``app_test``.
        ValueError: If ``app_test`` lacks any training feature column.
    """
    # ``Imputer`` was removed from ``sklearn.preprocessing`` in scikit-learn
    # 0.22; prefer its replacement and fall back only on very old releases.
    try:
        from sklearn.impute import SimpleImputer as median_imputer_cls
    except ImportError:  # scikit-learn < 0.20
        from sklearn.preprocessing import Imputer as median_imputer_cls

    # Take an explicit copy so adding the prediction column later is not a
    # chained assignment; doing it first also surfaces a bad ``id_col``
    # before any model fitting happens.
    submit = app_test[[id_col]].copy()

    if target_col in app_train:
        train = app_train.drop(columns=target_col)
    else:
        train = app_train
    features = list(train.columns)

    missing = [name for name in features if name not in app_test.columns]
    if missing:
        raise ValueError(
            'app_test is missing feature columns: {}'.format(missing)
        )
    # Select by name so test columns match the training features in both
    # membership and order, whatever else ``app_test`` carries.
    test = app_test[features]

    imputer = median_imputer_cls(strategy='median')
    scaler = MinMaxScaler(feature_range=(0, 1))

    # Both transformers are fitted on the training data only and then
    # applied unchanged to the test data.
    train = scaler.fit_transform(imputer.fit_transform(train))
    test = scaler.transform(imputer.transform(test))

    print('Training data shape: ', train.shape)
    print('Testing data shape: ', test.shape)

    log_reg = LogisticRegression(C=C)
    log_reg.fit(train, train_labels)

    # Column 1 is the probability of the model's second class
    # (``log_reg.classes_[1]``).
    submit[target_col] = log_reg.predict_proba(test)[:, 1]
    submit.to_csv(output_path, index=False)
    return submit
# Encode and align the raw frames, then train the baseline model and write
# its predictions for the test set.
app_train, app_test, train_labels = quick_preprocess(
    app_train,
    app_test,
    target_col='TARGET',
)
run_logistic_regression(
    app_train,
    app_test,
    target_col='TARGET',
    id_col='SK_ID_CURR',
    train_labels=train_labels,
)