%tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
import os
import tempfile
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
raw_df = pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')
raw_df.head()
from google.colab import drive
drive.mount('/content/drive')
os.chdir("drive/My Drive/data")
raw_df_B = pd.read_csv('home-credit-default-risk/application_train.csv')
### A
neg, pos = np.bincount(raw_df['Class'])
total = neg + pos
print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
### B
repaid, defaulted = np.bincount(raw_df_B['TARGET'])
total = repaid + defaulted
print('Examples:\n Total: {}\n Defaulted: {} ({:.2f}% of total)\n'.format(
    total, defaulted, 100 * defaulted / total))
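# Both datasets are heavily skewed toward the negative class. A quick sanity
# check from the TensorFlow imbalanced-data tutorial: the output-layer bias
# that makes an untrained sigmoid model predict the base rate is log(pos/neg).
# This is a sketch only; `make_model` below accepts such a value via `output_bias`.
print('Initial output bias A:', np.log(pos / neg))
print('Initial output bias B:', np.log(defaulted / repaid))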
### A
# Drop the `Time` column, which isn't useful here.
# The `Amount` column covers a huge range; convert it to log-space.
cleaned_df = raw_df.copy()
cleaned_df.pop('Time')
eps = 0.001  # 0 => 0.1¢
cleaned_df['Log Amount'] = np.log(cleaned_df.pop('Amount') + eps)
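# Quick check on the transform (a small addition, not in the original cells):
# thanks to `eps`, a zero amount maps to log(0.001) ≈ -6.9 instead of -inf.
print(cleaned_df['Log Amount'].describe())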
### B
# Temporary cleaning for dataset B: keep the target plus every continuous
# column, where "continuous" means more than 100 unique values, and require
# fewer than 100,000 missing entries.
continuous_cols = ['TARGET']
for col in raw_df_B.columns:
    column = raw_df_B[col]
    if len(column.unique()) > 100:
        if column.isna().sum() < 100000:
            print(column.isna().sum())  # number of values the median fill below will replace
            continuous_cols.append(col)
cont_df = raw_df_B[continuous_cols].copy()
for col in cont_df.columns:
    median = cont_df[col].median()
    cont_df[col] = cont_df[col].fillna(median)  # impute missing values with the column median
cont_df = cont_df.drop('SK_ID_CURR', axis=1)  # the row ID is not a feature
cleaned_df_B = cont_df.copy()
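# An alternative way to shortlist numeric candidates is pandas' dtype-based
# selection. This is only a sketch of the same idea, not a drop-in
# replacement: it keys on dtype rather than unique-value counts, so it also
# picks up low-cardinality integer codes that the loop above skips.
numeric_candidates = raw_df_B.select_dtypes(include='number').columns.drop('SK_ID_CURR')
print(len(numeric_candidates), 'numeric candidate columns')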
print(cleaned_df.shape)
print(cleaned_df_B.shape)
cleaned_df_B
def get_split_data(df, target_label):
    # Use a utility from sklearn to split and shuffle the dataset:
    # roughly 64% train, 16% validation, 20% test.
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)

    # Form np arrays of labels and features.
    train_labels = np.array(train_df.pop(target_label))
    bool_train_labels = train_labels != 0
    val_labels = np.array(val_df.pop(target_label))
    test_labels = np.array(test_df.pop(target_label))

    train_features = np.array(train_df)
    val_features = np.array(val_df)
    test_features = np.array(test_df)

    return (train_df, train_labels, bool_train_labels,
            val_labels, test_labels,
            train_features, val_features, test_features)

train_df, train_labels, bool_train_labels, val_labels, test_labels, train_features, val_features, test_features = get_split_data(cleaned_df, 'Class')
train_df_B, train_labels_B, bool_train_labels_B, val_labels_B, test_labels_B, train_features_B, val_features_B, test_features_B = get_split_data(cleaned_df_B, 'TARGET')
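# With classes this imbalanced, a plain random split can leave the small
# validation set with very few positives. A minimal sketch of the same split
# using sklearn's `stratify` option, which preserves the class ratio in every
# partition (the function name here is ours, not from the cells above):
def get_stratified_split(df, target_label):
    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df[target_label])
    train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df[target_label])
    return train_df, val_df, test_df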
def scale_and_plot(train_df, train_labels, val_labels, test_labels,
                   train_features, val_features, test_features,
                   bool_train_labels, col1, col2):
    # Normalize the input features with sklearn's StandardScaler:
    # mean 0, standard deviation 1. Fit on the training set only.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    val_features = scaler.transform(val_features)
    test_features = scaler.transform(test_features)

    # Clip to [-5, 5] to limit the influence of extreme outliers.
    train_features = np.clip(train_features, -5, 5)
    val_features = np.clip(val_features, -5, 5)
    test_features = np.clip(test_features, -5, 5)

    print('Training labels shape:', train_labels.shape)
    print('Validation labels shape:', val_labels.shape)
    print('Test labels shape:', test_labels.shape)
    print('Training features shape:', train_features.shape)
    print('Validation features shape:', val_features.shape)
    print('Test features shape:', test_features.shape)

    # Compare the joint distribution of two features across the classes.
    pos_df = pd.DataFrame(train_features[bool_train_labels], columns=train_df.columns)
    neg_df = pd.DataFrame(train_features[~bool_train_labels], columns=train_df.columns)

    sns.jointplot(x=pos_df[col1], y=pos_df[col2],
                  kind='hex', xlim=(-5, 5), ylim=(-5, 5))
    plt.suptitle("Positive distribution")
    sns.jointplot(x=neg_df[col1], y=neg_df[col2],
                  kind='hex', xlim=(-5, 5), ylim=(-5, 5))
    _ = plt.suptitle("Negative distribution")

    # Return the scaled arrays so later cells train and predict on
    # normalized data rather than the raw features.
    return train_features, val_features, test_features

train_features, val_features, test_features = scale_and_plot(
    train_df, train_labels, val_labels, test_labels,
    train_features, val_features, test_features,
    bool_train_labels, 'V5', 'V6')
train_features_B, val_features_B, test_features_B = scale_and_plot(
    train_df_B, train_labels_B, val_labels_B, test_labels_B,
    train_features_B, val_features_B, test_features_B,
    bool_train_labels_B, 'AMT_INCOME_TOTAL', 'DAYS_BIRTH')
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]
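# Area under the precision-recall curve is often more informative than ROC AUC
# on data this imbalanced; Keras' AUC metric supports it via curve='PR'.
# This line is an addition of ours; the early-stopping callback below still
# monitors 'val_auc'.
METRICS.append(keras.metrics.AUC(name='prc', curve='PR'))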
def make_model(metrics=METRICS, output_bias=None, features=train_features):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    model = keras.Sequential([
        keras.layers.Dense(
            16, activation='tanh',
            input_shape=(features.shape[-1],)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias),
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
EPOCHS = 100
BATCH_SIZE = 2048

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)
model = make_model()
model.summary()
model.predict(train_features[:10])

model = make_model(features=train_features_B)
model.summary()
model.predict(train_features_B[:100])
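# A minimal sketch of how the pieces above fit together for dataset A,
# following the TensorFlow imbalanced-data tutorial: seed the output bias with
# the log-odds of the positive class, and weight the loss so both classes
# contribute equally. The weighting scheme and variable names are assumptions,
# not fixed by the cells above (`total` was reused for dataset B, so it is
# recomputed here).
initial_bias = np.log([pos / neg])
total_A = neg + pos
weight_for_0 = (1 / neg) * (total_A / 2.0)
weight_for_1 = (1 / pos) * (total_A / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}

model = make_model(output_bias=initial_bias)
history = model.fit(
    train_features, train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels),
    class_weight=class_weight)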