Linear Regression

GOAL: Predict the price of a house from area-level features (average income, house age, number of rooms, number of bedrooms, and population).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the housing data; the path is relative to the notebook's working directory.
df = pd.read_csv(
    "Refactored_Py_DS_ML_Bootcamp-master/11-Linear-Regression/USA_Housing.csv"
)
/Users/kendraryan/.pyenv/versions/3.7.3/lib/python3.7/site-packages/pandas/compat/__init__.py:117: UserWarning: Could not import the lzma module. Your installed Python is incomplete. Attempting to use lzma compression will result in a RuntimeError.
  warnings.warn(msg)
In [3]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
df.head()
Out[3]:
Avg. Area Income Avg. Area House Age Avg. Area Number of Rooms Avg. Area Number of Bedrooms Area Population Price Address
0 79545.458574 5.682861 7.009188 4.09 23086.800503 1.059034e+06 208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1 79248.642455 6.002900 6.730821 3.09 40173.072174 1.505891e+06 188 Johnson Views Suite 079\nLake Kathleen, CA...
2 61287.067179 5.865890 8.512727 5.13 36882.159400 1.058988e+06 9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3 63345.240046 7.188236 5.586729 3.26 34310.242831 1.260617e+06 USS Barnett\nFPO AP 44820
4 59982.197226 5.040555 7.839388 4.23 26354.109472 6.309435e+05 USNS Raymond\nFPO AE 09386
In [4]:
# Column dtypes and non-null counts: a quick check for missing values and
# for columns that are not numeric.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB
In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[5]:
Avg. Area Income Avg. Area House Age Avg. Area Number of Rooms Avg. Area Number of Bedrooms Area Population Price
count 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5.000000e+03
mean 68583.108984 5.977222 6.987792 3.981330 36163.516039 1.232073e+06
std 10657.991214 0.991456 1.005833 1.234137 9925.650114 3.531176e+05
min 17796.631190 2.644304 3.236194 2.000000 172.610686 1.593866e+04
25% 61480.562388 5.322283 6.299250 3.140000 29403.928702 9.975771e+05
50% 68804.286404 5.970429 7.002902 4.050000 36199.406689 1.232669e+06
75% 75783.338666 6.650808 7.665871 4.490000 42861.290769 1.471210e+06
max 107701.748378 9.519088 10.759588 6.500000 69621.713378 2.469066e+06
In [6]:
# Grid of pairwise plots for the numeric columns, with each variable's own
# distribution on the diagonal.
sns.pairplot(df)
Out[6]:
<seaborn.axisgrid.PairGrid at 0x11a667710>
In [7]:
# Distribution of the target variable.
# NOTE(review): `distplot` is deprecated in seaborn >= 0.11 in favour of
# `histplot`/`displot`; switch once the installed seaborn version is confirmed.
sns.distplot(df['Price'])
Out[7]:
<AxesSubplot:xlabel='Price'>
In [8]:
# Correlate only the numeric columns. `Address` is a free-text column, and
# pandas >= 2.0 raises on non-numeric data in `DataFrame.corr()` instead of
# silently dropping it, so the numeric selection is made explicit here.
df.select_dtypes(include="number").corr()
Out[8]:
Avg. Area Income Avg. Area House Age Avg. Area Number of Rooms Avg. Area Number of Bedrooms Area Population Price
Avg. Area Income 1.000000 -0.002007 -0.011032 0.019788 -0.016234 0.639734
Avg. Area House Age -0.002007 1.000000 -0.009428 0.006149 -0.018743 0.452543
Avg. Area Number of Rooms -0.011032 -0.009428 1.000000 0.462695 0.002040 0.335664
Avg. Area Number of Bedrooms 0.019788 0.006149 0.462695 1.000000 -0.022168 0.171071
Area Population -0.016234 -0.018743 0.002040 -0.022168 1.000000 0.408556
Price 0.639734 0.452543 0.335664 0.171071 0.408556 1.000000
In [9]:
# Heatmap of the pairwise correlations between the numeric columns.
# The text `Address` column is excluded explicitly because pandas >= 2.0
# raises when `DataFrame.corr()` meets non-numeric data.
sns.heatmap(df.select_dtypes(include="number").corr())
Out[9]:
<AxesSubplot:>
In [10]:
# Same correlation heatmap, with each coefficient written in its cell.
# The text `Address` column is excluded explicitly because pandas >= 2.0
# raises when `DataFrame.corr()` meets non-numeric data.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
Out[10]:
<AxesSubplot:>
In [11]:
from sklearn.model_selection import train_test_split

Tip: place the cursor on `train_test_split` and press Shift + Tab to view its signature and docstring.

In [12]:
# Evaluating the bare name displays the function's repr, which includes its signature.
train_test_split
Out[12]:
<function sklearn.model_selection._split.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)>
In [14]:
# List the exact column labels (useful for copying feature names without typos).
df.columns
Out[14]:
Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')
In [16]:
# Feature matrix: the five numeric predictors ('Address' is text and is left out).
X = df[[
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]]

# Target vector: the value the model is trained to predict.
y = df['Price']
In [20]:
# Hold out 40% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)
In [21]:
from sklearn.linear_model import LinearRegression
In [22]:
# Create an unfitted linear regression model with scikit-learn's default settings.
lm = LinearRegression()
In [23]:
# Fit on the training split only, so the test split stays unseen for evaluation.
lm.fit(X_train, y_train)
Out[23]:
LinearRegression()
In [24]:
# Fitted coefficient for each feature, in the same order as the columns of X.
lm.coef_
Out[24]:
array([2.15282755e+01, 1.64883282e+05, 1.22368678e+05, 2.23380186e+03,
       1.51504200e+01])
In [25]:
# Fitted intercept: the model's prediction when every feature equals zero.
lm.intercept_
Out[25]:
-2640159.796853739
In [27]:
# Label each fitted coefficient with its feature name so the table reads as
# "change in predicted price per one-unit increase in that feature".
cdf = pd.DataFrame(lm.coef_, index=X.columns, columns=['Coeff'])
cdf
Out[27]:
Coeff
Avg. Area Income 21.528276
Avg. Area House Age 164883.282027
Avg. Area Number of Rooms 122368.678027
Avg. Area Number of Bedrooms 2233.801864
Area Population 15.150420
In [ ]: