In this vignette we explore the differences in asymptotic time complexity between different implementations of binary segmentation.
The code below uses the following arguments: N is a numeric vector of data sizes, setup is an R expression to create the data, and each of the other named arguments is an R expression to time (each returns a sorted vector of estimated changepoints, so results can be compared between packages).

library(data.table)
atime.list <- atime::atime(
  N=2^seq(2, 20),
  setup={
    ## for each data size N, create a simple vector to segment.
    max.segs <- as.integer(N/2)
    max.changes <- max.segs-1L
    set.seed(1)
    data.vec <- 1:N
  },
  ## each expression below returns a sorted vector of estimated
  ## changepoints, so results can be compared between packages.
  "changepoint::cpt.mean"={
    cpt.fit <- changepoint::cpt.mean(data.vec, method="BinSeg", Q=max.changes)
    sort(c(N, cpt.fit@cpts.full[max.changes,]))
  },
  "binsegRcpp::binseg_normal"={
    binseg.fit <- binsegRcpp::binseg_normal(data.vec, max.segs)
    sort(binseg.fit$splits$end)
  },
  "fpop::multiBinSeg"={
    mbs.fit <- fpop::multiBinSeg(data.vec, max.changes)
    sort(c(mbs.fit$t.est, N))
  },
  "wbs::sbs"={
    wbs.fit <- wbs::sbs(data.vec)
    split.dt <- data.table(wbs.fit$res)[order(-min.th, scale)]
    sort(split.dt[, c(N, cpt)][1:max.segs])
  },
  binsegRcpp.list={
    binseg.fit <- binsegRcpp::binseg(
      "mean_norm", data.vec, max.segs, container.str="list")
    sort(binseg.fit$splits$end)
  },
  ##seconds.limit=0.1,
  times=5)
plot(atime.list)
The default plot method creates a log-log plot of median time vs data size, for each of the specified R expressions. You can use references_best to get a tall/long data table that can be plotted to show both empirical time and memory complexity:
best.list <- atime::references_best(atime.list)
if(require(ggplot2)){
  hline.df <- with(atime.list, data.frame(seconds.limit, unit="seconds"))
  gg.both <- ggplot()+
    theme_bw()+
    facet_grid(unit ~ ., scales="free")+
    geom_hline(aes(
      yintercept=seconds.limit),
      color="grey",
      data=hline.df)+
    geom_line(aes(
      N, empirical, color=expr.name),
      data=best.list$meas)+
    geom_ribbon(aes(
      N, ymin=min, ymax=max, fill=expr.name),
      data=best.list$meas[unit=="seconds"],
      alpha=0.5)+
    scale_x_log10()+
    scale_y_log10("median line, min/max band")
  if(require(directlabels)){
    gg.both+
      directlabels::geom_dl(aes(
        N, empirical, color=expr.name, label=expr.name),
        method="right.polygons",
        data=best.list$meas)+
      theme(legend.position="none")+
      coord_cartesian(xlim=c(1,2e7))
  }else{
    gg.both
  }
}
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis
The plots above show some speed differences between binary segmentation algorithms, but they would be even easier to see for larger data sizes (exercise for the reader: try modifying the N and seconds.limit arguments, as in the sketch below). You can also see that memory usage is much larger for changepoint than for the other packages.
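For example, here is a minimal sketch of such a modification (the object name atime.larger and the particular sizes are illustrative, not from the vignette): it times a single expression with larger sizes and an explicit time budget, since seconds.limit stops measuring an expression once its median time exceeds the limit.

## Sketch: larger sizes with a time budget; atime stops timing an
## expression after its median time exceeds seconds.limit.
atime.larger <- atime::atime(
  N=2^seq(2, 24),
  setup={
    max.segs <- as.integer(N/2)
    data.vec <- 1:N
  },
  "binsegRcpp::binseg_normal"=binsegRcpp::binseg_normal(data.vec, max.segs),
  seconds.limit=1)
plot(atime.larger)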
You can use code like below to compute asymptotic references which are best fit for each expression. The best fit is computed by adjusting each reference to the largest N, and then ranking each reference by distance to the measurement of the second to largest N. The code below uses each.sign.rank==1 to compute the closest reference above and below:
best.refs <- best.list$ref[each.sign.rank==1]
time.refs <- best.refs[unit=="seconds"]
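To make the fitting procedure concrete, here is a small standalone sketch of the same idea (an illustration under simplified assumptions, not atime's exact implementation; all object names here are made up):

## Pretend empirical timings that grow like N log N, on the log10 scale.
N.vec <- 2^seq(2, 10)
emp.log10 <- log10(N.vec)+log10(log(N.vec))
candidates <- list(
  "N"=function(N)log10(N),
  "N log N"=function(N)log10(N)+log10(log(N)),
  "N^2"=function(N)2*log10(N))
last <- length(N.vec)
dist.vec <- sapply(candidates, function(f){
  ref.log10 <- f(N.vec)
  ## adjust: shift reference to match the empirical value at the largest N.
  adjusted <- ref.log10-ref.log10[[last]]+emp.log10[[last]]
  ## rank: distance to the measurement at the second to largest N.
  abs(adjusted[[last-1]]-emp.log10[[last-1]])
})
names(dist.vec)[which.min(dist.vec)] # best fit: "N log N".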
Then you can plot these references along with the empirical data using the ggplot code below:
ref.color <- "red"
if(require(ggplot2)){
  gg <- ggplot()+
    geom_line(aes(
      N, reference, group=fun.name),
      color=ref.color,
      data=time.refs)+
    geom_line(aes(
      N, empirical),
      size=1,
      data=best.list$meas[unit=="seconds"])+
    scale_x_log10()+
    scale_y_log10("median line, min/max band")+
    facet_wrap("expr.name")+
    theme_bw()
  if(require(directlabels)){
    gg+
      directlabels::geom_dl(aes(
        N, reference, label=fun.name),
        data=time.refs,
        color=ref.color,
        method="bottom.polygons")
  }else{
    gg
  }
}
If you have one or more expected time complexity classes that you want to compare with your empirical measurements, you can use the fun.list argument:
my.refs <- list(
  "N \\log N"=function(N)log10(N) + log10(log(N)),
  "N^2"=function(N)2*log10(N),
  "N^3"=function(N)3*log10(N))
my.best <- atime::references_best(atime.list, fun.list=my.refs)
Note that in the code above, each R function should take as input the data size N, and output log base 10 of the reference function.
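For example (hypothetical extra references, not from the vignette), a linear reference returns log10(N), and an N^1.5 reference returns 1.5*log10(N):

## Hypothetical additional references, on the same log10 scale.
more.refs <- list(
  "N"=function(N)log10(N),
  "N^1.5"=function(N)1.5*log10(N))
more.best <- atime::references_best(atime.list, fun.list=more.refs)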
my.best.time.refs <- my.best$ref[unit=="seconds"]
if(require(ggplot2)){
  gg <- ggplot()+
    geom_line(aes(
      N, reference, group=fun.name),
      color=ref.color,
      data=my.best.time.refs)+
    geom_line(aes(
      N, empirical),
      size=1,
      data=my.best$meas[unit=="seconds"])+
    scale_x_log10()+
    scale_y_log10("median line, min/max band")+
    facet_wrap("expr.name")+
    theme_bw()
  if(require(directlabels)){
    gg+
      directlabels::geom_dl(aes(
        N, reference, label=fun.name),
        data=my.best.time.refs,
        color=ref.color,
        method="bottom.polygons")
  }else{
    gg
  }
}
From the plot above you should be able to see the asymptotic time complexity class of each algorithm. Exercise for the reader: try increasing N and/or seconds.limit to see the differences more clearly.