In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder 
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Read the raw application train/test tables from the working directory.
app_train, app_test = (
    pd.read_csv(csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv')
)
In [38]:
def quick_preprocess(app_train, app_test, target_col):
    """One-hot encode both frames and keep only the columns they have in common.

    Parameters
    ----------
    app_train : pandas.DataFrame
        Training frame; must contain ``target_col``.
    app_test : pandas.DataFrame
        Test frame.
    target_col : str
        Name of the label column in ``app_train``.

    Returns
    -------
    tuple
        ``(train, test, labels)``: the encoded/aligned training frame with
        ``target_col`` re-attached, the encoded/aligned test frame, and the
        label Series taken from the original training frame.

    The shapes of both resulting frames are printed as a side effect.
    """
    # Grab the labels up front: the inner column alignment below discards any
    # column that is missing from the test frame.
    labels = app_train[target_col]

    encoded_train, encoded_test = pd.get_dummies(app_train).align(
        pd.get_dummies(app_test), join = 'inner', axis = 1
    )

    # Put the labels back on the aligned training frame.
    encoded_train[target_col] = labels

    print('Training Features shape: ', encoded_train.shape)
    print('Testing Features shape: ', encoded_test.shape)
    return encoded_train, encoded_test, labels

###################################
## MODELS
###################################
## LOGISTIC REGRESSION

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is its replacement. The fallback keeps
# the notebook runnable on pre-0.20 installs that only ship the old class.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

def run_logistic_regression(app_train, app_test, target_col, id_col, train_labels,
                            output_path = 'log_reg_baseline_master.csv'):
    """Fit a logistic-regression baseline and write its predictions to a CSV.

    Every column of ``app_train`` except ``target_col`` is used as a feature.
    Missing values are replaced by the per-column training median, features are
    min-max scaled to [0, 1] (both fitted on the training data only), and a
    ``LogisticRegression(C = 0.0001)`` is fitted on the result.

    Parameters
    ----------
    app_train : pandas.DataFrame
        Training features; may or may not still contain ``target_col``.
    app_test : pandas.DataFrame
        Test features; must contain every training feature column and ``id_col``.
    target_col : str
        Label column name; also the name of the prediction column in the output.
    id_col : str
        Identifier column copied from ``app_test`` into the output file.
    train_labels : array-like
        Labels, one per row of ``app_train``.
    output_path : str, optional
        Destination of the submission CSV. Defaults to the file name this
        function has always written.

    Returns
    -------
    None
        The function prints the transformed shapes and writes ``output_path``.

    Raises
    ------
    KeyError
        If ``app_test`` lacks ``id_col`` or any training feature column.
    """
    if target_col in app_train.columns:
        train = app_train.drop(columns = target_col)
    else:
        train = app_train

    # NOTE(review): `id_col` is not removed from the features, so if it is a
    # column of `app_train` the model is trained on it. Left as-is so previously
    # recorded scores stay reproducible -- confirm whether that is intended.
    features = list(train.columns)

    # Select the test columns by name so the test matrix has exactly the training
    # features, in the training order.
    test = app_test[features]

    imputer = SimpleImputer(strategy = 'median')
    scaler = MinMaxScaler(feature_range = (0, 1))

    # Fit on the training data only, then apply the same transform to the test data.
    train = imputer.fit_transform(train)
    test = imputer.transform(test)

    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    print('Training data shape: ', train.shape)
    print('Testing data shape: ', test.shape)

    log_reg = LogisticRegression(C = 0.0001)
    log_reg.fit(train, train_labels)
    # Column 1 of predict_proba is the probability of the second entry in
    # `log_reg.classes_` (the positive class for 0/1 labels).
    log_reg_pred = log_reg.predict_proba(test)[:, 1]

    # Explicit copy: adding a column to a bare `app_test[[id_col]]` selection
    # is the chained-assignment pattern behind SettingWithCopyWarning.
    submit = app_test[[id_col]].copy()
    submit[target_col] = log_reg_pred
    submit.to_csv(output_path, index = False)
In [20]:
# app_train, app_test, train_labels = quick_preprocess(app_train, app_test, 'TARGET')
# run_logistic_regression(app_train, app_test, 'TARGET', 'SK_ID_CURR', train_labels)
Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 242)
Training data shape:  (307511, 242)
Testing data shape:  (48744, 242)

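Baseline result: this alone (one-hot encoding + logistic regression on the raw `application_train.csv` / `application_test.csv` tables) gave us 67.9%.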

V2

In [46]:
# Load the V2 ("master") train/test feature tables and clean them.
# Order matters: missing values are zero-filled first and only afterwards are
# +/-inf turned into NaN, so cells that held an infinity are still NaN when this
# cell finishes (run_logistic_regression later fills NaN with the column median).
app_train = (
    pd.read_csv('master_train_df.csv')
    .fillna(0)
    .replace([np.inf, -np.inf], np.nan)
)
app_test = (
    pd.read_csv('master_test_df.csv')
    .fillna(0)
    .replace([np.inf, -np.inf], np.nan)
)
In [47]:
# Encode and align the V2 tables, then fit the logistic-regression baseline,
# which writes its submission CSV.
app_train, app_test, train_labels = quick_preprocess(
    app_train, app_test, target_col = 'target'
)
run_logistic_regression(
    app_train, app_test,
    target_col = 'target', id_col = 'sk_id_curr', train_labels = train_labels
)
Training Features shape:  (307511, 624)
Testing Features shape:  (48744, 623)
Training data shape:  (307511, 623)
Testing data shape:  (48744, 623)

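V2 result: the same pipeline on the master feature tables (`master_train_df.csv` / `master_test_df.csv`) gave us 69.7%.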

In [ ]: