import pandas as pd
from fbprophet import Prophet
from google.colab import files
# Load the Zillow ZHVI single-family-residence time series (one row per zip,
# one column per month). 'latin' encoding avoids decode errors in place names.
og_df = pd.read_csv('https://raw.githubusercontent.com/danielcaraway/data/master/Zip_Zhvi_SingleFamilyResidence.csv', encoding='latin')
# ----- USE CALIFORNIA AS AN EXAMPLE BC CALIFORNIA IS BEST STATE DUH
# CA = og_df[og_df['State'] == 'CA']
# ca_df = pd.DataFrame(CA.describe())
# ca_df
from IPython.display import display

# Show every row and column when rendering DataFrames in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Work on a copy so og_df stays pristine; count missing values per column.
df = og_df.copy()
df.isna().sum()
df = og_df.copy()
# Build a per-state summary: zip count, metro count, and price statistics
# at the start (1997-01) and end (2019-12) of the ZHVI series.
zips_per_state = []
for state in set(df['State']):
    state_df = df[df['State'] == state]
    state_df_desc = pd.DataFrame(state_df.describe())
    # Number of distinct metro areas in this state.
    metro = len(set(state_df['Metro']))
    avg_price_97 = state_df_desc['1997-01']['mean']
    max_price_97 = state_df_desc['1997-01']['max']
    # BUG FIX: previously read ['max'] — min_price was a copy of max_price.
    min_price_97 = state_df_desc['1997-01']['min']
    std_price_97 = state_df_desc['1997-01']['std']
    avg_price_19 = state_df_desc['2019-12']['mean']
    max_price_19 = state_df_desc['2019-12']['max']
    # BUG FIX: same copy/paste error for the 2019 minimum.
    min_price_19 = state_df_desc['2019-12']['min']
    std_price_19 = state_df_desc['2019-12']['std']
    zips_per_state.append({"state": state,
                           "zips": len(state_df),
                           "metros": metro,
                           "avg_price_97": avg_price_97,
                           "max_price_97": max_price_97,
                           "min_price_97": min_price_97,
                           "std_price_97": std_price_97,
                           "avg_price_19": avg_price_19,
                           "max_price_19": max_price_19,
                           "min_price_19": min_price_19,
                           "std_price_19": std_price_19
                           })
    # print(state, len(state_df), metro)
# ----- UNCOMMENT TO SAVE TO FILE
# zps = pd.DataFrame(zips_per_state)
# zps.to_csv('zips_per_state.csv')
# files.download('zips_per_state.csv')
df = og_df.copy()
# Same summary as above but grouped by metro area instead of state.
zips_per_metro = []
for metro in set(df['Metro']):
    metro_df = df[df['Metro'] == metro]
    metro_df_desc = pd.DataFrame(metro_df.describe())
    # metro = len(set(metro_df['Metro']))
    avg_price_97 = metro_df_desc['1997-01']['mean']
    max_price_97 = metro_df_desc['1997-01']['max']
    # BUG FIX: previously read ['max'] — min_price was a copy of max_price.
    min_price_97 = metro_df_desc['1997-01']['min']
    std_price_97 = metro_df_desc['1997-01']['std']
    avg_price_19 = metro_df_desc['2019-12']['mean']
    max_price_19 = metro_df_desc['2019-12']['max']
    # BUG FIX: same copy/paste error for the 2019 minimum.
    min_price_19 = metro_df_desc['2019-12']['min']
    std_price_19 = metro_df_desc['2019-12']['std']
    zips_per_metro.append({"metro": metro,
                           "zips": len(metro_df),
                           # "metros": metro,
                           "avg_price_97": avg_price_97,
                           "max_price_97": max_price_97,
                           "min_price_97": min_price_97,
                           "std_price_97": std_price_97,
                           "avg_price_19": avg_price_19,
                           "max_price_19": max_price_19,
                           "min_price_19": min_price_19,
                           "std_price_19": std_price_19
                           })
# ----- UNCOMMENT TO SAVE TO FILE
# zps = pd.DataFrame(zips_per_metro)
# zps.to_csv('zips_per_metro.csv')
# files.download('zips_per_metro.csv')
#
df = og_df.copy()
# Date headers start at column index 7; the first 7 columns are metadata.
# NOTE(review): assumes column 7 is "1996-04" so ds aligns with the price
# slice below — confirm against the CSV header.
date_labels = df.columns[7:]
# Forecast a single example zip code (90039) with Prophet.
region_row = df[df['RegionName'] == 90039]
price_history = region_row.loc[:, "1996-04"::].values[0]
history_df = pd.DataFrame({'ds': date_labels, 'y': price_history})
model = Prophet()
model.fit(history_df)
# Extend the series 60 months past the last observation.
future = model.make_future_dataframe(periods=60, freq='M')
forecast = model.predict(future)
fig1 = model.plot(forecast)
# The per-zip forecasting process was run in a separate file; its results were
# exported, saved to GitHub, and are imported below.
# The forecast batches were produced in a separate run and pushed to GitHub;
# fetch every batch file and stitch them back together column-wise.
BASE_URL = 'https://raw.githubusercontent.com/danielcaraway/data/master/'
batch_names = ['bigger_df_1000', 'bigger_df_2000', 'bigger_df_4000',
               'bigger_df_6000', 'bigger_df_8000', 'bigger_df_13000a',
               'bigger_df_13000b', 'bigger_df_15000', 'bigger_df_17000',
               'bigger_df_19000', 'bigger_df_21000', 'bigger_df_22000',
               'bigger_df_24000', 'bigger_df_26000', 'bigger_df_28000',
               'bigger_df_30000', 'bigger_df_-1']
batches = [pd.read_csv(BASE_URL + name + '.csv') for name in batch_names]
# combine the data
df_all = pd.concat(batches, axis=1)
df = df_all.copy()
df = df.loc[:,~df.columns.duplicated()]
# confirm we have the correct number of zip codes
len(df.T)
# shoot, looks like we're one short. Let's compare with our original list of zips
# oh well, we can just run these manually
og_RegionNames = list(og_df['RegionName'].values)
og_RegionNames_str = [str(region) for region in og_RegionNames]
df_RegionNames = list(df.columns)
diff = list(set(og_RegionNames_str) - set(df_RegionNames))
diff
# also, looks like our column names don't actually look like the zip codes they
# are representing: leading zeros were lost, so pad every name back to the
# standard 5 digits. str.zfill(5) covers the 3- and 4-character cases the
# original if/elif handled (and any shorter ones) and leaves longer names
# (e.g. 'Unnamed: 0') untouched.
new_cols = [str(column).zfill(5) for column in df.columns]
df.columns = new_cols
# and we didn't import our date column
# Borrow the date column from the Prophet forecast built earlier; it spans the
# same monthly range plus the 60 forecast months.
# NOTE(review): assumes len(forecast) equals len(df) — confirm, otherwise this
# assignment introduces NaNs or fails to align.
df['Unnamed: 0'] = forecast['ds']
# Rename the first column in place by mutating the ndarray returned by
# .columns.values, then reassigning it.
cols = df.columns.values
cols[0] = 'forecast'
df.columns = cols
# Transpose so each row is a zip code, then export and download via Colab.
df_t = df.T
df_t.to_csv('30k_zips_with_prophet.csv')
files.download('30k_zips_with_prophet.csv')
# Also export in two halves (first 150 columns / the rest) for smaller files.
df_t_a = df_t[df_t.columns[0:150]]
df_t_b = df_t[df_t.columns[150:]]
df_t_a.to_csv('30k_zips_with_prophet_A.csv')
df_t_b.to_csv('30k_zips_with_prophet_B.csv')
files.download('30k_zips_with_prophet_A.csv')
files.download('30k_zips_with_prophet_B.csv')
# Load the DMA-to-zip mapping and display it alongside the working frame.
zips = pd.read_csv('https://raw.githubusercontent.com/danielcaraway/data/master/DMA-zip.csv')
zips
df
# (Repeat of the earlier fix: the frame's zip-code column names again need
# their leading zeros restored.) str.zfill(5) replaces the manual if/elif
# padding and leaves names of 5+ characters unchanged.
new_cols = [str(column).zfill(5) for column in df.columns]
df.columns = new_cols
# Display the padded column names and the forecast dates for a sanity check.
df.columns
forecast['ds']
# Mount Google Drive and load a previously saved export from it.
# NOTE: %pwd and %ls below are IPython magics — this cell only runs in a
# notebook, not as plain Python.
from google.colab import drive # import drive from google colab
ROOT = "/content/drive" # default location for the drive
print(ROOT) # print content of ROOT (Optional)
drive.mount(ROOT) # we mount the google drive at /content/drive
%pwd
%ls
# Read the forecast export saved to Drive earlier and preview it.
path = "/content/drive/My Drive/data/30k_zips_with_prophet.csv"
df_bonus = pd.read_csv(path)
df_bonus.head()