# Run once:
# install.packages(c("reticulate",
# "dplyr",
# "ggplot2",
# "RSocrata",
# "reshape2",
# "lubridate"))
library(dplyr)
library(ggplot2)
library(RSocrata)
library(reshape2)
library(lubridate)
library(xts)
library(here)
# Run once:
# reticulate::py_install("pandas")
# reticulate::py_install("statsmodels")
# Import custom functions from 'functions.R' file
source(
here::here(
"functions.R"
)
)
library(reticulate)
reticulate::use_virtualenv(virtualenv = "r-reticulate")
reticulate::py_available(TRUE)
## [1] TRUE
# Python
import pandas as pd
from statsmodels.tsa.vector_ar.var_model import VAR
In this analysis, we’ll attempt to build two models (one in R, and one in Python) to predict the total monthly prison population for every month in the year 2020, based off of training the models on historical data.
The prison population data can be found in the CT Open Data Repository.
We’ll also use some data on monthly CT non-farm employment statistics to use as a regressor variable to hopefully help us predict the prison population. This data is also accessible via the CT Open Data Repository
my_data <- GetData()
Now we have an R dataframe named my_data in memory. Let’s take a look at the first few rows of data, some summary statistics, and the observations with the lowest total state prison population.
But… let’s use Python to do so:
# Python
# We can execute Python against R dataframe objects by prepending the R dataframe name with 'r.'
r.my_data.head()
## date total.state.prison.pop total.nonfarm.employed
## 0 2010-01-01 18053 1568.7
## 1 2010-02-01 18383 1576.8
## 2 2010-03-01 18330 1582.6
## 3 2010-04-01 18331 1607.8
## 4 2010-05-01 18259 1625.9
# Python
# We can execute Python against R dataframe objects by prepending the R dataframe name with 'r.'
r.my_data.describe()
## total.state.prison.pop total.nonfarm.employed
## count 120.000000 120.000000
## mean 15867.900000 1663.920000
## std 1689.618526 33.632939
## min 12530.000000 1568.700000
## 25% 14405.000000 1639.725000
## 50% 16324.500000 1667.450000
## 75% 17032.750000 1692.775000
## max 18593.000000 1719.100000
# Python
# We can execute Python against R dataframe objects by prepending the R dataframe name with 'r.'
r.my_data.sort_values(by = 'total.state.prison.pop').head()
## date total.state.prison.pop total.nonfarm.employed
## 119 2019-12-01 12530 1717.5
## 118 2019-11-01 12793 1719.1
## 117 2019-10-01 12935 1709.8
## 116 2019-09-01 12941 1704.2
## 113 2019-06-01 13001 1712.9
First, we have to split the
# Split the data into "train" and "test" sets
# We'll train the model on the "train" data, and evaluate how accurate the model makes predictions using the "test data
train_df <- my_data %>%
dplyr::filter(date < as.Date("2019-01-01"))
test_df <- my_data %>%
dplyr::filter(date >= as.Date("2019-01-01"))
# Visualize the train/test data split
ggplot2::ggplot(
my_data %>%
dplyr::mutate(train.test = ifelse(date < as.Date("2019-01-01"),
"train",
"test")),
ggplot2::aes(x = date,
y = total.state.prison.pop,
col = train.test)) +
ggplot2::geom_line()
In time series forecasting, we have to remove trend from the data in order to ensure that “time” itself isn’t a factor in your final model equation. One of the simplest ways to do this is to transform the dataset into the between-period differences. Removing trend is referred to as making the data “stationary”.
train_df_stry <- train_df %>%
dplyr::mutate(total.state.prison.pop.diffs = c(NA, diff(total.state.prison.pop)),
total.nonfarm.employed.diffs = c(NA, diff(total.nonfarm.employed))) %>%
dplyr::select(date,
total.state.prison.pop.diffs,
total.nonfarm.employed.diffs)
train_df_stry <- xts::xts(train_df_stry[,2:3],
order.by = train_df_stry$date)
# create stationary test dataframe
test_df_stry <- train_df %>%
dplyr::bind_rows(test_df) %>%
dplyr::mutate(total.state.prison.pop.diffs = c(NA, diff(total.state.prison.pop)),
total.nonfarm.employed.diffs = c(NA, diff(total.nonfarm.employed))) %>%
dplyr::select(date,
total.state.prison.pop.diffs,
total.nonfarm.employed.diffs) %>%
dplyr::filter(date >= as.Date("2019-01-01"))
test_df_stry <- xts::xts(test_df_stry[,2:3],
order.by = test_df_stry$date)
We can take a look at what the new train and test datasets look like. Don’t worry, we’ll reverse engineer this at the end to give the end user the true predicted prison population values.
ggplot2::ggplot(
data.frame(
date = zoo::index(train_df_stry),
zoo::coredata(train_df_stry)
) %>%
dplyr::bind_rows(
data.frame(
date = zoo::index(test_df_stry),
zoo::coredata(test_df_stry)
)
) %>%
dplyr::mutate(train.test = ifelse(date < as.Date("2019-01-01"),
"train",
"test")),
ggplot2::aes(
x = date,
y = total.state.prison.pop.diffs,
col = train.test
)) +
ggplot2::geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).
train_df_stry_rdf <- data.frame(
date = zoo::index(train_df_stry),
zoo::coredata(train_df_stry)
) %>%
tidyr::drop_na()
test_df_stry_rdf <- data.frame(
date = zoo::index(test_df_stry),
zoo::coredata(test_df_stry)
)
str(train_df_stry_rdf)
## 'data.frame': 107 obs. of 3 variables:
## $ date : Date, format: "2010-02-01" "2010-03-01" ...
## $ total.state.prison.pop.diffs: num 330 -53 1 -72 105 67 59 103 -112 -161 ...
## $ total.nonfarm.employed.diffs: num 8.1 5.8 25.2 18.1 0 ...
# Python
#Training model on training set
py_train_data_stry = r.train_df_stry_rdf
py_train_data_stry = py_train_data_stry.set_index('date')
py_test_data_stry = r.test_df_stry_rdf
py_test_data_stry = py_test_data_stry.set_index('date')
py_train_data_stry.head()
## total.state.prison.pop.diffs total.nonfarm.employed.diffs
## date
## 2010-02-01 330.0 8.1
## 2010-03-01 -53.0 5.8
## 2010-04-01 1.0 25.2
## 2010-05-01 -72.0 18.1
## 2010-06-01 105.0 0.0
# Python
#Training model on training set
model = VAR(endog=py_train_data_stry.values)
model_fit = model.fit(2)
# make prediction on "test" set
prediction = model_fit.forecast(model_fit.endog, steps=len(py_test_data_stry))
prediction
## array([[-21.67981649, -1.20385862],
## [-35.84853668, 2.22209531],
## [-54.21567396, 2.68524858],
## [-52.87491408, 1.13084042],
## [-44.42194739, 0.49861351],
## [-42.97573098, 1.06493982],
## [-46.5925841 , 1.49810547],
## [-48.16160301, 1.34092588],
## [-46.86395116, 1.09975398],
## [-45.81048428, 1.11344767],
## [-46.1556112 , 1.22937696],
## [-46.73260729, 1.25166808]])
diffd_preds <- py$prediction
diffd_preds
## [,1] [,2]
## [1,] -21.67982 -1.2038586
## [2,] -35.84854 2.2220953
## [3,] -54.21567 2.6852486
## [4,] -52.87491 1.1308404
## [5,] -44.42195 0.4986135
## [6,] -42.97573 1.0649398
## [7,] -46.59258 1.4981055
## [8,] -48.16160 1.3409259
## [9,] -46.86395 1.0997540
## [10,] -45.81048 1.1134477
## [11,] -46.15561 1.2293770
## [12,] -46.73261 1.2516681
# Calculate the actual predicted values after reconstructing the "differenced" predictions
pop_preds <- c(train_df$total.state.prison.pop[train_df$date == max(train_df$date)],
rep(NA, 12))
for (i in 1:12) {
pop_preds[i+1] <- pop_preds[i] + diffd_preds[i]
}
pop_preds <- pop_preds[-1]
pop_preds
## [1] 13373.32 13337.47 13283.26 13230.38 13185.96 13142.98 13096.39
## [8] 13048.23 13001.37 12955.55 12909.40 12862.67
test_df %>%
dplyr::select(-total.nonfarm.employed) %>%
dplyr::mutate(pred = pop_preds) %>%
tidyr::pivot_longer(-date, names_to = "Group", values_to = "total.state.prison.pop") %>%
ggplot2::ggplot(
ggplot2::aes(x = date,
y = total.state.prison.pop,
col = Group)
) +
ggplot2::geom_line(size = 2)