This vignette shows comparisons in terms of computation time with other packages. Specifically, we will make comparisons with the roll
package and zoo
package. It should be stressed though that the other solutions do additional things than this package does. E.g., there is not performed any rank test in the function in this package. We start by showing the comparisons of computation times and then we show different options. Some function definitions are shown at the end.
We start by simulating the data
set.seed(32981054)
<- 10000
n <- 6
p = 50
wdth <- matrix(rnorm(p * n), n, p)
X <- drop(X %*% runif(p)) + rnorm(n)
y <- data.frame(y, X)
df <- eval(parse(text = paste0(
frm "formula(y ~ -1 + ", paste0("X", 1:p, collapse = " + "), ")")))
Then we check that the functions give the same (the function definitions are at the end of this document)
library(rollRegres)
<- roll_regress_R_for_loop(X, y, wdth)
base_res all.equal(base_res, roll_regres.fit(X, y, wdth)$coefs,
check.attributes = FALSE)
#R [1] TRUE
all.equal(base_res, roll_regres(frm, df, wdth)$coefs,
check.attributes = FALSE)
#R [1] TRUE
all.equal(base_res, roll_regress_zoo(X, y, wdth),
check.attributes = FALSE)
#R [1] TRUE
all.equal(base_res, roll_lm_func(X, y, wdth),
check.attributes = FALSE)
#R [1] TRUE
and here we compare the computation time
::microbenchmark(
microbenchmarkroll_regress = roll_regres.fit(X, y, wdth),
# this will be slower due to call to `model.matrix` and `model.frame`
roll_regress_df = roll_regres(frm, df, wdth),
roll_regress_zoo = roll_regress_zoo(X, y, wdth),
roll_regress_R_for_loop = roll_regress_R_for_loop(X, y, wdth),
roll_lm = roll_lm_func(X, y, wdth),
times = 5)
#R Unit: milliseconds
#R expr min lq mean median uq
#R roll_regress 4.999500 5.053804 5.07759 5.070857 5.091074
#R roll_regress_df 5.759151 5.817656 5.84265 5.856961 5.889038
#R roll_regress_zoo 571.484783 588.036867 595.73757 605.818587 606.632240
#R roll_regress_R_for_loop 345.055169 349.455274 367.43727 377.524748 378.513305
#R roll_lm 56.424120 56.457575 58.57639 56.724943 60.622997
#R max neval
#R 5.172715 5
#R 5.890446 5
#R 606.715380 5
#R 386.637874 5
#R 62.652334 5
# here is the formula used above
frm#R y ~ -1 + X1 + X2 + X3 + X4 + X5 + X6
This section will cover some additional features.
Here are expanding window regressions with additional output
#####
# simulate data
set.seed(65731482)
<- 100
n <- 2
p <- matrix(rnorm(p * n), n, p)
X <- drop(X %*% runif(p)) + rnorm(n)
y
#####
# use package function
<- roll_regres.fit(
pck_out width = 50L, do_downdates = FALSE,
X, y, do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))
#####
# assign R-version
<- function(X, y, width){
R_func <- nrow(X)
n <- ncol(X)
p <- matrix(NA_real_, n, p)
out <- rep(NA_real_, n)
sigmas <- rep(NA_real_, n)
r.squared <- rep(NA_real_, n)
one_step_forecasts
for(i in width:n){
<- 1:i
idx <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
fit <- fit$coefficients
out[i, ]
<- summary(fit)
su <- su$sigma
sigmas[i]
<- sum((y[idx] - mean(y[idx]))^2)
ss1 <- sum(fit$residuals^2)
ss2 <- 1 - ss2 / ss1
r.squared[i]
if(i < n){
<- i + 1L
next_i <- fit$coefficients %*% X[next_i, ]
one_step_forecasts[next_i]
}
}
list(coef = out, sigmas = sigmas, r.squared = r.squared,
one_step_forecasts = one_step_forecasts)
}
<- R_func(X, y, 50L)
R_out
#####
# gives the same
stopifnot(
isTRUE(all.equal(R_out$coef , pck_out$coefs)),
isTRUE(all.equal(R_out$sigmas , pck_out$sigmas)),
isTRUE(all.equal(R_out$r.squared , pck_out$r.squared)),
isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))
You can use the grp
argument to make updates in blocks. E.g., here is an example with weekly data
#####
# simulate data
set.seed(68799379)
<- as.integer(gl(25, 7))
week head(week[1:10])
#R [1] 1 1 1 1 1 1
<- length(week)
n <- 2
p <- matrix(rnorm(p * n), n, p)
X <- drop(X %*% runif(p)) + rnorm(n)
y
#####
# use package function
<- roll_regres.fit(
pck_out width = 10L, grp = week,
X, y, do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))
#####
# assign R-version
<- function(X, y, width, grp){
R_func <- grp + 1L - min(grp)
grp = unique(grp)
u_grp <- nrow(X)
n <- ncol(X)
p <- matrix(NA_real_, n, p)
out <- rep(NA_real_, n)
sigmas <- rep(NA_real_, n)
r.squared <- rep(NA_real_, n)
one_step_forecasts
<- min(which(u_grp >= width))
start_val for(g in u_grp[start_val:length(u_grp)]){
<- which(grp %in% (g - width + 1L):g)
idx <- which(grp == g)
i <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
fit <- sapply(fit$coefficients, rep, times = length(i))
out[i, ]
<- summary(fit)
su <- su$sigma
sigmas[i]
<- sum((y[idx] - mean(y[idx]))^2)
ss1 <- sum(fit$residuals^2)
ss2 <- 1 - ss2 / ss1
r.squared[i]
if(g != max(grp)){
<- grp[min(which(grp > g))]
next_g <- which(grp == next_g)
next_g <- X[next_g, ] %*% fit$coefficients
one_step_forecasts[next_g]
}
}
list(coef = out, sigmas = sigmas, r.squared = r.squared,
one_step_forecasts = one_step_forecasts)
}
<- R_func(X, y, 10L, grp = week)
R_out
#####
# gives the same
stopifnot(
isTRUE(all.equal(R_out$coef , pck_out$coefs)),
isTRUE(all.equal(R_out$sigmas , pck_out$sigmas)),
isTRUE(all.equal(R_out$r.squared , pck_out$r.squared)),
isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))
Suppose that we are performing rolling regression on stock data over a yearly window. Further, suppose that there are some gaps in the data where we do not have data and we require at least 6 months of data. This can be done as follows
#####
# simulate data
set.seed(96235555)
<- 10L * 12L * 21L # x years w/ 12 months of 21 trading days
n <- (seq_len(n) - 1L) %/% 21L + 1L # group by months
month set.seed(29478439)
<- matrix(rnorm(n * 4L), ncol = 4L)
X <- rnorm(n)
y
# randomly drop rows
<- seq_along(y) %in% sample.int(nrow(X), as.integer(n * .5))
keep <- X [keep, ]
X <- y [keep]
y <- month[keep]
month
#####
# use package function
<- roll_regres.fit(
pck_out width = 12L, grp = month, min_obs = 21L * 6L, do_downdates = TRUE,
X, y, do_compute = c("sigmas", "r.squareds"))
#####
# assign R-version
<- function(X, y, width, grp, downdate, min_obs){
R_func <- grp + 1L - min(grp)
grp = unique(grp)
u_grp <- nrow(X)
n <- ncol(X)
p <- matrix(NA_real_, n, p)
out <- rep(NA_real_, n)
sigmas <- rep(NA_real_, n)
r.squared
<- min(which(u_grp >= width))
start_val for(g in u_grp[start_val:length(u_grp)]){
<-
idx if(downdate)
which(grp %in% (g - width + 1L):g) else
which(grp %in% 1:g)
<- which(grp == g)
i if(length(idx) < min_obs)
next
<- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
fit <- sapply(fit$coefficients, rep, times = length(i))
out[i, ]
<- summary(fit)
su <- su$sigma
sigmas[i]
<- sum((y[idx] - mean(y[idx]))^2)
ss1 <- sum(fit$residuals^2)
ss2 <- 1 - ss2 / ss1
r.squared[i]
}
list(coef = out, sigmas = sigmas, r.squared = r.squared)
}
<- R_func(X, y, width = 12L, downdate = TRUE, grp = month,
R_out min_obs = 21L * 6L)
#####
# gives the same
stopifnot(
isTRUE(all.equal(R_out$coef , pck_out$coefs)),
isTRUE(all.equal(R_out$sigmas , pck_out$sigmas)),
isTRUE(all.equal(R_out$r.squared, pck_out$r.squared)))
sessionInfo()
#R R version 4.1.3 (2022-03-10)
#R Platform: x86_64-pc-linux-gnu (64-bit)
#R Running under: Ubuntu 20.04.4 LTS
#R
#R Matrix products: default
#R BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#R LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#R
#R locale:
#R [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#R [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#R [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#R [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#R [9] LC_ADDRESS=C LC_TELEPHONE=C
#R [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#R
#R attached base packages:
#R [1] compiler stats graphics grDevices utils datasets methods
#R [8] base
#R
#R other attached packages:
#R [1] rollRegres_0.1.4 roll_1.1.6 zoo_1.8-10
#R
#R loaded via a namespace (and not attached):
#R [1] Rcpp_1.0.8.3 rstudioapi_0.13 knitr_1.39
#R [4] magrittr_2.0.3 lattice_0.20-45 R6_2.5.1
#R [7] rlang_1.0.2 fastmap_1.1.0 stringr_1.4.0
#R [10] tools_4.1.3 grid_4.1.3 checkmate_2.1.0
#R [13] xfun_0.30 cli_3.3.0 jquerylib_0.1.4
#R [16] htmltools_0.5.2 yaml_2.3.5 RcppParallel_5.1.5
#R [19] digest_0.6.29 microbenchmark_1.4.9 sass_0.4.1
#R [22] evaluate_0.15 rmarkdown_2.14 stringi_1.7.6
#R [25] bslib_0.3.1 backports_1.4.1 jsonlite_1.8.0
Here are the function definitions
#####
# simple R version
<- function(X, y, width){
roll_regress_R_for_loop <- nrow(X)
n <- ncol(X)
p <- matrix(NA_real_, n, p)
out
for(i in width:n){
<- (i - width + 1L):i
idx <- lm.fit(X[idx, , drop = FALSE], y[idx])$coefficients
out[i, ]
}
out
}
#####
# zoo version
<- function(Z) {
.zoo_inner <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
fit $coefficients
fit
}library(zoo)
<- function(x, y, width){
roll_regress_zoo <- cbind(y, X)
Z rollapply(Z, width, FUN = .zoo_inner,
by.column = FALSE, align = "right", fill = NA_real_)
}
#####
# roll_lm
library(roll)
<- function(x, y ,width)
roll_lm_func roll_lm(X, matrix(y, ncol = 1), wdth, intercept = FALSE)$coefficients
# use one thread as other methods are easily to compute in parallel too
::setThreadOptions(numThreads = 1)
RcppParallel
# compile functions
library(compiler)
<- cmpfun(roll_regress_R_for_loop)
roll_regress_R_for_loop <- cmpfun(.zoo_inner)
.zoo_inner <- cmpfun(roll_regress_zoo)
roll_regress_zoo <- cmpfun(roll_lm_func) roll_lm_func