# DO NOT REMOVE THESE
%load_ext autoreload
%autoreload 2
#%aimport src.base
# DO NOT REMOVE THIS
%reload_ext autoreload
## DO NOT REMOVE
## Import the local src module.
## src in this project contains all your local code:
## clean_data.py, model.py, visualize.py, custom.py
from src import clean_data as cln
from src import visualize as viz
from src import model as md
from src.clean_data import *
import io
import sys
def test_src():
    """Smoke test that the local src modules imported correctly."""
    #cln.test_clean_data()
    #viz.test_viz()
    #md.test_model()
    return 1
test_src()
# Dataframe manipulation
import numpy as np
import pandas as pd
# Visualization libraries
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
%matplotlib inline
import seaborn as sns
# Modules needed for the timeseries
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA  # NB: deprecated; statsmodels >= 0.13 moves this to statsmodels.tsa.arima.model.ARIMA
from sklearn import metrics
from sklearn.metrics import mean_squared_error
# Converting index to datetime format
from datetime import datetime
from dateutil.relativedelta import relativedelta
# Hide the warning messages we get
import warnings
warnings.filterwarnings("ignore")
Steps:
- Load the raw Zillow data and filter down to the Chicago rows.
- Keep only the monthly price columns, transpose so each date is a row, and convert the dates to datetime.
- Use the Chicago zipcodes as column headings and save the processed data.
- Compute the citywide mean of the median prices and plot it over time.
zillow_df = pd.read_csv('../data/raw/zillow_data.csv')
chicago_df = zillow_df[zillow_df['City']=='Chicago']
chicago_dates = chicago_df[chicago_df.columns[7:]] # Take the dates from the 8th column onwards, drop previous columns
chicago_df = chicago_dates.T # Transpose dataframe to put the column headings as row headings and give each date its own row.
chicago_df.reset_index(inplace=True) # Bring the dates out of the index, so we can change their type
chicago_df['index'] = pd.to_datetime(chicago_df['index']) # change dates to datetime
chicago_df.set_index('index', inplace=True) # Place the dates back in as the index to dataframe
chicago_zips = zillow_df['RegionName'][zillow_df['City']=='Chicago'] # Grab the Chicago zipcodes from original dataframe
chicago_df.columns = list(chicago_zips) # use Chicago's zipcodes as column headings
chicago_df.to_csv('../data/processed/chicago_jan1996_to_april2018')
# Build a DataFrame with the date alongside the mean of all the median chicago house prices
chicago_df_mean = chicago_df.mean(axis=1)
# chicago_dates_list = pd.to_datetime(list(chicago_dates.columns)) # date column
# chicago_dates_mean = list(chicago_df.mean(axis=1)) # mean column
# chicago_df_mean = pd.DataFrame({'date': chicago_dates_list, 'mean': chicago_dates_mean})
# chicago_df_mean.head(3)
chicago_df_mean = chicago_df_mean.reset_index()
chicago_df_mean.columns = ['date', 'mean']
chicago_df_mean['date'] = pd.to_datetime(chicago_df_mean['date'])
# Set figuresize and style
sns.set(rc={'figure.figsize': (12, 6)}, style="white", context="talk")
# Plot
# Data
ax = sns.lineplot(x='date', y='mean', data=chicago_df_mean, color='g', lw=5);
# Financial crisis vertical line (pass a datetime so it lands correctly on the date axis)
ax.axvline(x=pd.to_datetime('2007-04'), ymin=0, ymax=0.95, ls="--", lw=3, color='black', label='Financial Crisis')
# Upward trend vertical line
ax.axvline(x=pd.to_datetime('2012-06'), ymin=0, ymax=0.95, ls="--", lw=3, color='b', label='Prices Bounce Back')
# Formatting
# Title
title = 'Median House Price for Chicago\n Between January 1996 & April 2018'
ax.set_title(title)
# Axis labels
xlabel = 'Year'
ylabel = "Median House Price (USD)"
ax.set(xlabel=xlabel, ylabel=ylabel)
# Take off the border
sns.despine()
# Show legend
plt.legend();
# Save the figure as a png file
plt.savefig('../reports/figures/chicago_price_over_time_with_cutoffs.png', bbox_inches='tight')
Steps:
- Keep only the months from 2012-06 onwards, when prices start recovering.
- Build a table of CPI deflation factors (base year 2017), expand it to monthly frequency, and align it with the price data.
- Divide each zipcode's prices by the matching deflation factor and save the deflated data.
# Focus on dates after 2012-06
chicago_df = chicago_df['2012-06':]
chicago_df.to_csv('../data/processed/chicago_june2012_to_april2018')
# Deflate Chicago Data
# New Dataframe for the deflation factors
deflation_factors_2012_to_2019 = pd.DataFrame({'year': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
'factor': [0.937, 0.950, 0.966, 0.967, 0.979, 1.000, 1.020, 1.042]})
# Change the year column to datetime format
deflation_factors_2012_to_2019['year'] = pd.to_datetime(deflation_factors_2012_to_2019['year'], format='%Y')
# Set the year column to be the index
deflation_factors_2012_to_2019.set_index('year', inplace=True)
# Expand dataframe to have each month of the years alongside its deflation factor
deflation_factors_2012_to_2019 = deflation_factors_2012_to_2019.resample('M').ffill()
# Keep only the months we need for now (2012-06 to 2018-04)
deflation_factors_2012_to_2018 = deflation_factors_2012_to_2019[5:-9].copy() # .copy() avoids SettingWithCopy issues below
# Move the month-end dates to the first of each month, to match the price data's month-start index
deflation_factors_2012_to_2018.index = [datetime(x.year, x.month, 1) for x in deflation_factors_2012_to_2018.index.tolist()] # pd.datetime was removed in pandas 1.0
print(deflation_factors_2012_to_2018.head(3))
print(deflation_factors_2012_to_2018.tail(3))
# Concatenate 2 dataframes
chicago_df_defl = pd.concat([chicago_df, deflation_factors_2012_to_2018], axis=1)
# Divide each price value by the corresponding CPI deflation value
for column in chicago_df_defl.columns[:-1]:
    chicago_df_defl[column] = chicago_df_defl[column].div(chicago_df_defl['factor'])
# Drop factor column
chicago_df_defl.drop('factor', axis=1, inplace=True)
chicago_df_defl.to_csv('../data/processed/chicago_june2012_to_april2018_deflated')
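To make the deflation concrete: 2017 is the base year (factor 1.000), so a hypothetical nominal June 2012 price of $200,000 is restated as 200,000 / 0.937 ≈ $213,447 in constant 2017 dollars, while a nominal 2018 price is scaled down (factor 1.020) to strip out inflation.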
chicago_df = pd.read_csv('../data/processed/chicago_jan1996_to_april2018', index_col='index', parse_dates=True) # parse_dates restores the datetime index
Steps:
- Split the deflated data into train (2012-06 to 2017-10) and test (2017-11 to 2018-04) sets.
- Check each zipcode's series for stationarity and detrend with a first difference (or a log first difference where needed).
- Grid search ARIMA (p, d, q) orders for each zipcode, scoring on test-set RMSE.
# chicago_df_defl = pd.read_csv('../data/processed/chicago_june2012_to_april2018_deflated', index_col = ['Unnamed: 0'])
chicago_df_defl_train = chicago_df_defl.loc['2012-06':'2017-10',:] # 5 years 5 months (65 months) for train data
chicago_df_defl_test = chicago_df_defl.loc['2017-11':'2018-04',:] # 6 months for test data
viz.stationarity_check(chicago_df_defl_train,60654, plot_std=False)
viz.stationarity_check(chicago_df_defl_train,60659, plot_std=False)
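src/visualize.py is not included in this notebook, so as a rough sketch only (the real implementation may differ), stationarity_check presumably overlays the rolling statistics on one zipcode's series and reports the Augmented Dickey-Fuller p-value:
# Hypothetical sketch of viz.stationarity_check -- the actual src implementation may differ
def stationarity_check_sketch(df, zipcode, window=12, plot_std=True):
    series = df[zipcode].dropna()
    fig, ax = plt.subplots()
    ax.plot(series, label='Original')
    ax.plot(series.rolling(window=window).mean(), label='Rolling mean')
    if plot_std:
        ax.plot(series.rolling(window=window).std(), label='Rolling std')
    ax.set_title(f'Rolling statistics for zipcode {zipcode}')
    ax.legend()
    # ADF null hypothesis is a unit root; p-value < 0.05 suggests stationarity
    print(f'ADF p-value for {zipcode}: {adfuller(series)[1]:.4f}')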
chicago_df_defl_train_dt, log_diff_list = detrend_test(chicago_df_defl_train)
chicago_df_defl_train_examples = chicago_df_defl_train[[60654, 60659]]
detrend_test(chicago_df_defl_train_examples, suppress_output=True)
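detrend_test comes in via the wildcard import from src.clean_data above, and its source isn't shown here. A minimal sketch of the idea, assuming it tries a plain first difference, falls back to a log first difference when that is still non-stationary, and returns the detrended frame plus the list of zipcodes that needed the log transform:
# Hypothetical sketch of detrend_test -- the actual src implementation may differ
def detrend_test_sketch(df, alpha=0.05, suppress_output=False):
    detrended = pd.DataFrame(index=df.index[1:])
    log_diff_list = []
    for zipcode in df.columns:
        diffed = df[zipcode].diff().dropna()
        if adfuller(diffed)[1] < alpha:
            detrended[zipcode] = diffed  # a first difference is enough
        else:
            log_diff_list.append(zipcode)  # needs a log first difference
            detrended[zipcode] = np.log(df[zipcode]).diff().dropna()
        if not suppress_output:
            print(zipcode, 'log first difference' if zipcode in log_diff_list else 'first difference')
    return detrended, log_diff_list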
# Remove log-first-difference columns, to focus on first-difference columns only
# NB: log_diff_list contains the zipcodes that require a log first difference to become stationary
chicago_df_defl_train_fd = chicago_df_defl_train.drop(log_diff_list, axis=1)
chicago_df_defl_test_fd = chicago_df_defl_test.drop(log_diff_list, axis=1)
p_list = [0,1,2,3]
d_list = [1]
q_list = [0,1,2,3]
arima_order_rmse_values = [] # best ARIMA (p, d, q) order per zipcode, by test RMSE
for column in list(chicago_df_defl_train_fd.columns):
    print(column)
    order_value = md.evaluate_models(chicago_df_defl_train_fd[column], test=chicago_df_defl_test_fd[column],
                                     p_values=p_list, d_values=d_list, q_values=q_list)
    arima_order_rmse_values.append(order_value)
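md.evaluate_models lives in src/model.py and isn't shown; a sketch of what a grid search like this typically does (written against the newer statsmodels.tsa.arima.model API, since the old arima_model one is gone): fit each (p, d, q) on the training series, forecast over the test window, and keep the order with the lowest RMSE.
# Hypothetical sketch of md.evaluate_models -- the actual src implementation may differ
from statsmodels.tsa.arima.model import ARIMA as ARIMAnew

def evaluate_models_sketch(train, test, p_values, d_values, q_values):
    best_rmse, best_order = np.inf, None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                try:
                    fit = ARIMAnew(train, order=(p, d, q)).fit()
                    preds = fit.forecast(steps=len(test))
                    rmse = np.sqrt(mean_squared_error(test, preds))
                    if rmse < best_rmse:
                        best_rmse, best_order = rmse, (p, d, q)
                except Exception:
                    continue  # skip orders that fail to fit or converge
    return best_order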
Steps:
- Fit an ARIMA model per zipcode with its best order and forecast 12 months ahead.
- Save the forecast plots for the three shortlisted zipcodes.
- Build a rolling-mean baseline over the 6 test months and compare its RMSE with the ARIMA RMSE.
for i in range(len(chicago_df_defl_train_fd.columns)):
    md.predict_arima_model(chicago_df_defl_train_fd, chicago_df_defl_train_fd.columns[i],
                           arima_order=arima_order_rmse_values[i], periods=12)
final_zips = ['60638','60654', '60659']
for zipcode, order in zip(chicago_df_defl_train_fd.columns, arima_order_rmse_values):
    if str(zipcode) in final_zips:
        md.predict_arima_model(chicago_df_defl_train_fd, zipcode, order, 12, save=True)
        print('\n')
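predict_arima_model is also defined in src/model.py. A rough sketch under the same assumptions: fit the chosen order, forecast `periods` months ahead, plot history plus forecast, and optionally save the figure (the filename below is hypothetical):
# Hypothetical sketch of md.predict_arima_model -- the actual src implementation may differ
from statsmodels.tsa.arima.model import ARIMA as ARIMAnew

def predict_arima_model_sketch(df, zipcode, arima_order, periods=12, save=False):
    fit = ARIMAnew(df[zipcode], order=arima_order).fit()
    forecast = fit.forecast(steps=periods)
    fig, ax = plt.subplots()
    ax.plot(df[zipcode], label='History')
    ax.plot(forecast, label='Forecast')
    ax.set_title(f'{periods}-month ARIMA{arima_order} forecast for zipcode {zipcode}')
    ax.legend()
    if save:
        fig.savefig(f'../reports/figures/arima_forecast_{zipcode}.png', bbox_inches='tight')  # hypothetical path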
# Recombine train and test so we can compare baseline forecasts with the actual values
chicago_defl_fd = pd.concat([chicago_df_defl_train, chicago_df_defl_test], axis=0)
# Baseline model: a recursive 3-month rolling-mean forecast over the 6 test months
for code in [60638, 60654, 60659]:
    update = chicago_df_defl_train[code]
    for t in range(len(chicago_df_defl_test)):
        # Forecast next month as the mean of the last 3 (observed or forecast) values
        rolmean = update.rolling(window=3).mean()
        yhat = rolmean.iloc[-1]
        date = chicago_df_defl_test.index[t]
        update = pd.concat([update, pd.Series([yhat], index=[date])])  # Series.append was removed in pandas 2.0
    diff = update.iloc[-6:] - chicago_defl_fd[code].iloc[-6:]
    RMSE = np.sqrt(sum(diff**2) / len(diff))
    print(code, "RMSE: ", RMSE)
The table below shows that the ARIMA model achieves a much lower RMSE than the rolling-mean baseline for all three zipcodes.
| Zipcode | ARIMA RMSE (2017 USD) | Baseline RMSE (2017 USD) |
|---|---|---|
| 60638 | 1687.0 | 4095.4 |
| 60654 | 7102.8 | 12760.6 |
| 60659 | 1964.7 | 3053.4 |