In this example we fit a piecewise constant function to example data.
Please see here for a discussion of the methodology.
library("RcppDynProg")
set.seed(2018)
<- 50
g <- data.frame(
d x = 1:(3*g)) # ordered in x
$y_ideal <- c(rep(0, g), rep(1, g), rep(-1, g))
d$y_observed <- d$y_ideal + rnorm(length(d$y_ideal))
d
# plot
plot(d$x, d$y_observed,
xlab = "x", ylab = "y",
main = "raw data\ncircles: observed values, dashed line: unobserved true values")
lines(d$x, d$y_ideal,
type = "l",
lty = "dashed")
As a heuristic, we set our regularization penalty to a value that treats permuted data (no relation between x and y) as a single partition.
<- d$y_ideal[sample.int(nrow(d), nrow(d), replace = FALSE)]
y_permuted
<- function(ycol, penalty) {
solve_with_penalty <- length(ycol)
n = seq_len(n)
indices <- const_costs(ycol, 1+numeric(n), 1, indices)
x <- x + penalty
x solve_interval_partition(x, n)
}
<- 1
lb <- 10
ub while(length(solve_with_penalty(y_permuted, ub))>2) {
<- ub*2
ub
}while(TRUE) {
<- ceiling((ub+lb)/2)
mid if(mid>=ub) {
break
}<- solve_with_penalty(y_permuted, mid)
si if(length(si)<=2) {
<- mid
ub else {
} <- mid
lb
}
}print(ub)
## [1] 2
We now use this penalty to segment the data. Notice we recover the actual problem structure.
<- solve_with_penalty(d$y_observed, ub)
soln print(soln)
## [1] 1 50 101 145 151
$group <- as.character(findInterval(d$x, soln))
d<- tapply(d$y_observed, d$group, mean)
group_means $group_mean <- group_means[d$group]
d$estimate <- d$group_mean
d
print(sum((d$y_observed - d$y_ideal)^2))
## [1] 151.876
print(sum((d$group_mean - d$y_ideal)^2))
## [1] 6.653456
# plot
$group <- as.character(d$group)
dplot(d$x, d$y_observed,
xlab = "x", ylab = "y",
main = "RcppDynProg piecewise linear estimate\ndots: observed values, segments: estimated shape")
points(d$x, d$y_ideal,
type = "l",
lty = "dashed")
<- c("#a6cee3",
cmap "#1f78b4",
"#b2df8a",
"#33a02c",
"#fb9a99",
"#e31a1c",
"#fdbf6f",
"#ff7f00",
"#cab2d6",
"#6a3d9a",
"#ffff99",
"#b15928")
names(cmap) <- as.character(seq_len(length(cmap)))
points(d$x, d$y_observed, col = cmap[d$group], pch=19)
<- sort(unique(d$group))
groups for(gi in groups) {
<- d[d$group==gi, , drop = FALSE]
di lines(di$x, di$estimate, col = cmap[di$group[[1]]], lwd=2)
}
The same solution through the more succinct solve_for_partitionc()
interface.
# x_cuts <- solve_for_partition(d$x, d$y_observed)
# sometimes a different penalty due to problem chunking
<- solve_for_partitionc(d$x, d$y_observed, penalty = ub)
x_cuts print(x_cuts)
## x pred group what
## 1 1 -0.1147501 1 left
## 2 49 -0.1147501 1 right
## 3 50 1.1321951 2 left
## 4 100 1.1321951 2 right
## 5 101 -1.0414092 3 left
## 6 144 -1.0414092 3 right
## 7 145 -0.2065736 4 left
## 8 150 -0.2065736 4 right
$estimate <- approx(x_cuts$x, x_cuts$pred, xout = d$x, method = "constant", rule = 2)$y
d$group <- as.character(findInterval(d$x, x_cuts[x_cuts$what=="left", "x"]))
d
print(sum((d$y_observed - d$y_ideal)^2))
## [1] 151.876
print(sum((d$estimate - d$y_ideal)^2))
## [1] 6.653456
print(sum((d$estimate - d$y_observed)^2))
## [1] 144.2765
# plot
$group <- as.character(d$group)
dplot(d$x, d$y_observed,
xlab = "x", ylab = "y",
main = "RcppDynProg piecewise constant estimate\ndots: observed values, segments: estimated shape")
points(d$x, d$y_ideal,
type = "l",
lty = "dashed")
<- c("#a6cee3",
cmap "#1f78b4",
"#b2df8a",
"#33a02c",
"#fb9a99",
"#e31a1c",
"#fdbf6f",
"#ff7f00",
"#cab2d6",
"#6a3d9a",
"#ffff99",
"#b15928")
names(cmap) <- as.character(seq_len(length(cmap)))
points(d$x, d$y_observed, col = cmap[d$group], pch=19)
<- sort(unique(d$group))
groups for(gi in groups) {
<- d[d$group==gi, , drop = FALSE]
di lines(di$x, di$estimate, col = cmap[di$group[[1]]], lwd=2)
}