import numpy as np
import pandas as pd
## dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
# Load the training and test application tables (paths are relative to the
# current working directory).
app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')

# Quick inspection of the training set: its dimensions and the number of rows
# per TARGET value. These are bare expressions, so they only render in an
# interactive/notebook session.
app_train.shape
app_train['TARGET'].value_counts()
# Missing-value summary: column 0 holds the NaN count for each feature of
# app_train, 'percent' holds that count as a percentage of all rows.
n_rows = len(app_train)
df_na = pd.DataFrame(app_train.isna().sum())
df_na['percent'] = df_na[0].div(n_rows).mul(100)

# sort_values returns a new frame (most-missing first); df_na itself keeps its
# original row order because the result is not assigned back.
df_na.sort_values(by='percent', ascending=False)
# Correlation (pandas default: Pearson) of every numeric feature with TARGET,
# sorted ascending so the most negative values come first and the most
# positive values come last.
#
# Only numeric/bool columns are passed to .corr(): since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# string columns instead. Selecting the columns explicitly works on both old
# and new pandas versions.
numeric_features = app_train.select_dtypes(include=['number', 'bool'])
correlations = numeric_features.corr()['TARGET'].sort_values()

# Bare expressions: displayed only in an interactive/notebook session.
correlations.tail(20)  # strongest positive correlations
correlations.head(20)  # strongest negative correlations
# Overwrite DAYS_BIRTH with its absolute value (presumably it is recorded as a
# negative day offset -- confirm against the data dictionary), then plot the
# distribution of client age in years.
app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'].abs()
age_in_years = app_train['DAYS_BIRTH'] / 365

plt.style.use('fivethirtyeight')
plt.hist(age_in_years, bins=25, edgecolor='k')
plt.title('Age of Client')
plt.xlabel('Age (years)')
plt.ylabel('Count')
# Overwrite DAYS_LAST_PHONE_CHANGE with its absolute value (presumably stored
# as a negative day offset -- confirm against the data dictionary).
app_train['DAYS_LAST_PHONE_CHANGE'] = abs(app_train['DAYS_LAST_PHONE_CHANGE'])

# Start a fresh figure so this histogram is not drawn on top of whatever was
# plotted before it when the file is run as a plain script.
plt.figure()
plt.style.use('fivethirtyeight')

# The values are converted from days to years, so the title and x-axis label
# say "Years" to match what is actually plotted.
plt.hist(app_train['DAYS_LAST_PHONE_CHANGE'] / 365, edgecolor='k', bins=25)
plt.title('Years Since Last Phone Change')
plt.xlabel('Years')
plt.ylabel('Count')
plt.show()
# The last 20 entries of the ascending-sorted correlation series, i.e. the
# features with the largest correlation values.
top_correlated = correlations.tail(20)
type(top_correlated)  # inspection only; shows the object is a pandas Series

# Keep just the feature names (the Series index) as a plain Python list.
high_corr = top_correlated.index.tolist()
high_corr
# Features whose distributions are plotted below (looks hand-picked from the
# highly correlated features -- confirm the selection criteria with the author).
high_corr_for_graphs = [
    'CNT_FAM_MEMBERS',
    'CNT_CHILDREN',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
    'OWN_CAR_AGE',
    'DAYS_REGISTRATION',
    'DAYS_ID_PUBLISH',
    'DAYS_LAST_PHONE_CHANGE',
    'DAYS_BIRTH',
]

plt.style.use('fivethirtyeight')  # style is loop-invariant, so set it once

# One histogram per selected feature.
for col in high_corr_for_graphs:
    # Series.abs() returns a new Series, so app_train is left untouched and no
    # copy of the whole DataFrame is needed. Missing values are dropped so
    # they never reach the histogram binning.
    values = app_train[col].abs().dropna()

    # Only the DAYS_* columns hold day counts; convert just those to years.
    # Count/amount columns (and OWN_CAR_AGE) are plotted on their own scale.
    if col.startswith('DAYS_'):
        values = values / 365
        x_label = 'Years'
    else:
        x_label = col

    plt.hist(values, edgecolor='k', bins=25)
    plt.title(col)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.show()