In this example there are 5 time points with a treatment and time varying covariate. At each, treatment can be 0 or 1. There is a single outcome Y.
L_0 L_1 A_1 L_2 A_2 … L_5 A_5 Y
There are 2^5 = 32 regimes of interest. Some of these may have limited support because there are few patients who follow a particular regime. We pool over all regimes using a working marginal structural model. In this example we want to know the effect of time on treatment on Y. We include time.on.treatment and time.on.treatment^2.
rexpit <- function(x) rbinom(n=length(x), size=1, prob=plogis(x))
n <- 5000
time.points <- 5
prev.L <- rnorm(n)
prev.A <- rep(0, n)
sum.A <- rep(0, n)
data <- data.frame(L_0 = prev.L)
for (t in 1:time.points) {
L <- 0.1 * prev.L + 0.3 * prev.A + rnorm(n)
A <- rexpit(L)
data1 <- data.frame(L, A)
names(data1) <- paste0(c("L_", "A_"), t)
data <- cbind(data, data1)
prev.A <- A
prev.L <- L
sum.A <- sum.A + A
}
data$Y <- rexpit(sum.A / time.points + L)
head(data)
#> L_0 L_1 A_1 L_2 A_2 L_3 A_3 L_4 A_4
#> 1 -0.7480303 -0.1865826 0 0.4469474 1 0.12149207 1 -1.76985617 0
#> 2 0.6557836 -0.4355446 1 0.8469899 0 -1.07773708 0 -0.98969343 0
#> 3 1.2270065 -0.9697197 0 -1.6375742 0 0.03251294 1 0.07680317 0
#> 4 -1.6460771 -1.7278791 0 -0.5303850 0 -1.25567621 0 0.04886727 1
#> 5 0.2744846 -1.1419435 0 0.9679383 1 -1.49024538 0 0.15774825 1
#> 6 -2.0300750 -0.1996350 0 0.8142871 1 0.82830073 0 -0.87578580 0
#> L_5 A_5 Y
#> 1 0.7097768 1 1
#> 2 1.4101966 0 1
#> 3 -0.6878611 0 0
#> 4 -0.6242420 0 0
#> 5 1.5856941 0 1
#> 6 -0.2540231 0 0
regime.matrix <- as.matrix(expand.grid(rep(list(0:1), time.points)))
dim(regime.matrix)
#> [1] 32 5
head(regime.matrix, 20)
#> Var1 Var2 Var3 Var4 Var5
#> [1,] 0 0 0 0 0
#> [2,] 1 0 0 0 0
#> [3,] 0 1 0 0 0
#> [4,] 1 1 0 0 0
#> [5,] 0 0 1 0 0
#> [6,] 1 0 1 0 0
#> [7,] 0 1 1 0 0
#> [8,] 1 1 1 0 0
#> [9,] 0 0 0 1 0
#> [10,] 1 0 0 1 0
#> [11,] 0 1 0 1 0
#> [12,] 1 1 0 1 0
#> [13,] 0 0 1 1 0
#> [14,] 1 0 1 1 0
#> [15,] 0 1 1 1 0
#> [16,] 1 1 1 1 0
#> [17,] 0 0 0 0 1
#> [18,] 1 0 0 0 1
#> [19,] 0 1 0 0 1
#> [20,] 1 1 0 0 1
num.regimes <- 2^time.points
regimes <- array(dim = c(n, time.points, num.regimes)) #n x numAnodes x numRegimes = n x time.points x 2^time.points
summary.measures <- array(dim = c(num.regimes, 1, 1)) #numRegimes x num.summary.measures x num.final.Ynodes = 2^time.points x 1 x 1
for (i in 1:num.regimes) {
regimes[, , i] <- matrix(regime.matrix[i, ], byrow = TRUE, nrow = n, ncol = time.points)
summary.measures[i, 1, 1] <- sum(regime.matrix[i, ])
}
colnames(summary.measures) <- "time.on.treatment"
regimes[1:3, , 1:3]
#> , , 1
#>
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 0 0 0 0
#> [2,] 0 0 0 0 0
#> [3,] 0 0 0 0 0
#>
#> , , 2
#>
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 1 0 0 0 0
#> [2,] 1 0 0 0 0
#> [3,] 1 0 0 0 0
#>
#> , , 3
#>
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 1 0 0 0
#> [2,] 0 1 0 0 0
#> [3,] 0 1 0 0 0
summary.measures
#> , , 1
#>
#> time.on.treatment
#> [1,] 0
#> [2,] 1
#> [3,] 1
#> [4,] 2
#> [5,] 1
#> [6,] 2
#> [7,] 2
#> [8,] 3
#> [9,] 1
#> [10,] 2
#> [11,] 2
#> [12,] 3
#> [13,] 2
#> [14,] 3
#> [15,] 3
#> [16,] 4
#> [17,] 1
#> [18,] 2
#> [19,] 2
#> [20,] 3
#> [21,] 2
#> [22,] 3
#> [23,] 3
#> [24,] 4
#> [25,] 2
#> [26,] 3
#> [27,] 3
#> [28,] 4
#> [29,] 3
#> [30,] 4
#> [31,] 4
#> [32,] 5
For large numbers of regimes, variance.method = "ic"
is much faster, but may give anticonservative confidence intervals. You may want to use variance.method = "ic"
first to make sure the MSM coefficients look reasonable and then use variance.method = "tmle"
(the default) for your final estimates.
result1 <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points),
Lnodes = paste0("L_", 0:time.points), Ynodes = "Y",
regimes = regimes, summary.measures = summary.measures,
working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)",
variance.method = "ic")
#> Qform not specified, using defaults:
#> formula for L_2:
#> Q.kplus1 ~ L_0 + L_1 + A_1
#> formula for L_3:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2
#> formula for L_4:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3
#> formula for L_5:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4
#> formula for Y:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 + A_5
#>
#> gform not specified, using defaults:
#> formula for A_1:
#> A_1 ~ L_0 + L_1
#> formula for A_2:
#> A_2 ~ L_0 + L_1 + A_1 + L_2
#> formula for A_3:
#> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3
#> formula for A_4:
#> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4
#> formula for A_5:
#> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5
#>
#> Estimate of time to completion: 1 minute
#> Warning in CheckForVarianceWarning(inputs, g.ratio): Variance estimate is
#> based on influence curve only, which may be significantly anticonservative
#> because your data appears to contain positivity violations. It is recommended
#> to use variance.method='tmle' or variance.method='iptw' to obtain a more
#> robust variance estimate (but run time may be significantly longer). See
#> variance.method details in ?ltmle
result2 <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points),
Lnodes = paste0("L_", 0:time.points), Ynodes = "Y",
regimes = regimes, summary.measures = summary.measures,
working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)")
#> Qform not specified, using defaults:
#> formula for L_2:
#> Q.kplus1 ~ L_0 + L_1 + A_1
#> formula for L_3:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2
#> formula for L_4:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3
#> formula for L_5:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4
#> formula for Y:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 + A_5
#>
#> gform not specified, using defaults:
#> formula for A_1:
#> A_1 ~ L_0 + L_1
#> formula for A_2:
#> A_2 ~ L_0 + L_1 + A_1 + L_2
#> formula for A_3:
#> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3
#> formula for A_4:
#> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4
#> formula for A_5:
#> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5
#>
#> Estimate of time to completion: 4 to 16 minutes
summary(result1)
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) 0.11565 0.13593 -0.15078 0.382 0.395
#> time.on.treatment 0.08999 0.11442 -0.13428 0.314 0.432
#> I(time.on.treatment^2) 0.03304 0.02284 -0.01171 0.078 0.148
summary(result2)
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) 0.11565 0.18547 -0.24786 0.479 0.533
#> time.on.treatment 0.08999 0.14958 -0.20318 0.383 0.547
#> I(time.on.treatment^2) 0.03304 0.02862 -0.02304 0.089 0.248
Suppose we are only interested in the effect of time on treatment on Y considering regimes that include at least 3 periods on treatment.
at.least.3 <- summary.measures[, "time.on.treatment", 1] >= 3
regimes.3 <- regimes[, , at.least.3]
summary.measures.3 <- summary.measures[at.least.3, , , drop = F]
dim(regimes.3)
#> [1] 5000 5 16
summary.measures.3
#> , , 1
#>
#> time.on.treatment
#> [1,] 3
#> [2,] 3
#> [3,] 3
#> [4,] 3
#> [5,] 4
#> [6,] 3
#> [7,] 3
#> [8,] 3
#> [9,] 4
#> [10,] 3
#> [11,] 3
#> [12,] 4
#> [13,] 3
#> [14,] 4
#> [15,] 4
#> [16,] 5
result <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points),
Lnodes = paste0("L_", 0:time.points),
Ynodes = "Y", regimes = regimes.3,
summary.measures = summary.measures.3,
working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)")
#> Qform not specified, using defaults:
#> formula for L_2:
#> Q.kplus1 ~ L_0 + L_1 + A_1
#> formula for L_3:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2
#> formula for L_4:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3
#> formula for L_5:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4
#> formula for Y:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5 + A_5
#>
#> gform not specified, using defaults:
#> formula for A_1:
#> A_1 ~ L_0 + L_1
#> formula for A_2:
#> A_2 ~ L_0 + L_1 + A_1 + L_2
#> formula for A_3:
#> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3
#> formula for A_4:
#> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4
#> formula for A_5:
#> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5
#>
#> Estimate of time to completion: 3 to 8 minutes
summary(result)
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) -0.47416 2.20574 -4.79733 3.849 0.830
#> time.on.treatment 0.51495 1.20690 -1.85053 2.880 0.670
#> I(time.on.treatment^2) -0.03434 0.16045 -0.34882 0.280 0.831
Given data over 3 time points where A switches to 1 once and then stays 1. We want to know how death varies as a function of gender, time and an indicator of whether a patient’s intended regime was to switch before time. Note that working.msm
includes time
and switch.time
, which are columns of summary.measures; working.msm
also includes male
, which is ok because it is a baseline covariate (it comes before any A/C/L/Y nodes).
data(sampleDataForLtmleMSM)
head(sampleDataForLtmleMSM$data, 20)
#> male age CD4_0 A0 Y1 CD4_1 A1 Y2 CD4_2 A2 Y3
#> 1 1 33 347 0 0 349 0 0 315 0 0
#> 2 0 18 277 0 0 302 0 0 300 0 0
#> 3 1 33 419 0 0 423 0 0 462 0 0
#> 4 1 35 318 1 0 358 1 0 413 1 0
#> 5 0 27 145 0 0 134 1 1 NA NA 1
#> 6 1 27 320 0 0 332 0 0 347 1 0
#> 7 1 35 220 0 0 241 0 0 216 1 0
#> 8 0 31 184 0 0 202 1 0 230 1 0
#> 9 0 35 289 0 0 302 0 0 290 0 0
#> 10 0 30 295 1 0 312 1 0 340 1 0
#> 11 0 31 300 1 0 394 1 0 467 1 0
#> 12 0 30 300 0 0 331 0 0 320 0 0
#> 13 0 25 276 0 1 NA NA 1 NA NA 1
#> 14 1 26 242 1 0 280 1 0 307 1 0
#> 15 0 21 238 1 0 345 1 0 379 1 0
#> 16 1 30 304 0 0 258 0 0 287 0 0
#> 17 0 30 271 1 0 297 1 0 324 1 0
#> 18 1 27 296 0 0 305 0 0 306 0 0
#> 19 1 33 217 1 0 242 1 0 267 1 0
#> 20 1 25 337 0 0 360 0 0 390 0 0
dim(sampleDataForLtmleMSM$regimes)
#> [1] 200 3 4
sampleDataForLtmleMSM$regimes[1:5, , ]
#> , , 1
#>
#> [,1] [,2] [,3]
#> [1,] 1 1 1
#> [2,] 1 1 1
#> [3,] 1 1 1
#> [4,] 1 1 1
#> [5,] 1 1 1
#>
#> , , 2
#>
#> [,1] [,2] [,3]
#> [1,] 0 1 1
#> [2,] 0 1 1
#> [3,] 0 1 1
#> [4,] 0 1 1
#> [5,] 0 1 1
#>
#> , , 3
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 1
#> [2,] 0 0 1
#> [3,] 0 0 1
#> [4,] 0 0 1
#> [5,] 0 0 1
#>
#> , , 4
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
#> [3,] 0 0 0
#> [4,] 0 0 0
#> [5,] 0 0 0
sampleDataForLtmleMSM$summary.measures
#> , , 1
#>
#> switch.time time
#> [1,] 0 1
#> [2,] 1 1
#> [3,] 2 1
#> [4,] 3 1
#>
#> , , 2
#>
#> switch.time time
#> [1,] 0 2
#> [2,] 1 2
#> [3,] 2 2
#> [4,] 3 2
#>
#> , , 3
#>
#> switch.time time
#> [1,] 0 3
#> [2,] 1 3
#> [3,] 2 3
#> [4,] 3 3
Here msm.weights
is just an example. It could also be a 200x3x4 array or NULL (for no weights), or "empirical"
(the default).
result.regimes <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes,
Lnodes=Lnodes, Ynodes=Ynodes,
survivalOutcome=TRUE,
regimes=sampleDataForLtmleMSM$regimes,
summary.measures=sampleDataForLtmleMSM$summary.measures,
final.Ynodes=Ynodes,
working.msm="Y ~ male + time + pmax(time - switch.time, 0)",
msm.weights=msm.weights, estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#>
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#>
print(summary(result.regimes))
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) -3.4059 0.6545 -4.6886 -2.123 1.95e-07 ***
#> male -0.3802 0.5305 -1.4198 0.660 0.47359
#> time 0.7241 0.2602 0.2141 1.234 0.00539 **
#> pmax(time - switch.time, 0) -0.4717 0.2019 -0.8675 -0.076 0.01948 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
regimes
can also be specified as a list of rule functions where each rule is a function applied to each row of data
which returns a numeric vector of the same length as Anodes
.
regimesList <- list(function(row) c(1,1,1),
function(row) c(0,1,1),
function(row) c(0,0,1),
function(row) c(0,0,0))
result.regList <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes,
Lnodes=Lnodes, Ynodes=Ynodes,
survivalOutcome=TRUE, regimes=regimesList,
summary.measures=sampleDataForLtmleMSM$summary.measures,
final.Ynodes=Ynodes,
working.msm="Y ~ male + time + pmax(time - switch.time, 0)",
msm.weights=msm.weights, estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#>
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#>
This should be the same as result.regimes
:
print(summary(result.regList))
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) -3.4059 0.6545 -4.6886 -2.123 1.95e-07 ***
#> male -0.3802 0.5305 -1.4198 0.660 0.47359
#> time 0.7241 0.2602 0.2141 1.234 0.00539 **
#> pmax(time - switch.time, 0) -0.4717 0.2019 -0.8675 -0.076 0.01948 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Suppose we are only interested in pooling over the result at Y1 and Y3.
result <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes,
Lnodes=Lnodes, Ynodes=Ynodes,
survivalOutcome=TRUE,
regimes=sampleDataForLtmleMSM$regimes,
summary.measures=sampleDataForLtmleMSM$summary.measures[, , c(1, 3)],
final.Ynodes=c("Y1", "Y3"),
working.msm="Y ~ male + time + pmax(time - switch.time, 0)",
estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#>
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#>
#> speedglm failed, using glm instead. If you see a lot of this message and you have large absolute values in data[, Lnodes], you may get a speed performance improvement by rescaling these values.
summary(result)
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) -3.8891 0.5533 -4.9736 -2.805 2.08e-12 ***
#> male 0.2256 0.5152 -0.7842 1.235 0.661472
#> time 0.7826 0.2318 0.3282 1.237 0.000737 ***
#> pmax(time - switch.time, 0) -0.4331 0.2023 -0.8296 -0.037 0.032262 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We could be only interested in the result at Y3. time
is now a constant in working.msm
, so let’s remove it.
result <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes,
Lnodes=Lnodes, Ynodes=Ynodes,
survivalOutcome=TRUE,
regimes=sampleDataForLtmleMSM$regimes,
summary.measures=sampleDataForLtmleMSM$summary.measures[, , 3],
final.Ynodes="Y3",
working.msm="Y ~ male + switch.time",
estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#>
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#>
#> speedglm failed, using glm instead. If you see a lot of this message and you have large absolute values in data[, Lnodes], you may get a speed performance improvement by rescaling these values.
summary(result)
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) -2.81063 0.46551 -3.72301 -1.898 1.56e-09 ***
#> male 0.07357 0.53775 -0.98039 1.128 0.8912
#> switch.time 0.45362 0.19436 0.07268 0.835 0.0196 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In this example there are two treatment nodes and one outcome: W A1 L A2 Y W is normally distributed and L is continuous in (0, 1). We are interested in treatments where A1 is set to either 0 or 1 and A2 is set dynamically. The treatment for A2 is indexed by theta between 0 and 1. If \(L > theta\), set A2 to 1, otherwise set A2 to 0.
Here is a function that can be used to generate observed data (if abar = NULL
) or generate counterfactual truth (if abar
is a list with a1 and theta):
GenerateData <- function(n, abar = NULL) {
W <- rnorm(n)
if (is.null(abar)) {
A1 <- rexpit(W)
} else {
A1 <- abar$a1
}
L <- plogis(rnorm(n) + 0.3 * W + 0.5 * A1)
if (is.null(abar)) {
A2 <- rexpit(-0.5 * W + A1 - 0.6 * L)
} else {
A2 <- as.integer(L > abar$theta)
}
Y <- rexpit(-2 + W + A1 + L + 2 * A2)
if (is.null(abar)) {
return(data.frame(W, A1, L, A2, Y))
} else {
return(mean(Y))
}
}
Set up regimes
and summary.measures
:
set.seed(11)
n <- 10000
data <- GenerateData(n)
regimes <- array(dim = c(n, 2, 10)) #n x num.Anodes x num.regimes
theta.set <- seq(0, 1, length.out = 5)
summary.measures <- array(theta.set, dim = c(10, 2, 1))
colnames(summary.measures) <- c("a1", "theta")
cnt <- 0
for (a1 in 0:1) {
for (theta.index in 1:5) {
cnt <- cnt + 1
regimes[, 1, cnt] <- a1
regimes[, 2, cnt] <- data$L > theta.set[theta.index]
summary.measures[cnt, , 1] <- c(a1, theta.set[theta.index])
}
}
summary.measures
#> , , 1
#>
#> a1 theta
#> [1,] 0 0.00
#> [2,] 0 0.25
#> [3,] 0 0.50
#> [4,] 0 0.75
#> [5,] 0 1.00
#> [6,] 1 0.00
#> [7,] 1 0.25
#> [8,] 1 0.50
#> [9,] 1 0.75
#> [10,] 1 1.00
head(data, 3)
#> W A1 L A2 Y
#> 1 -0.59103110 1 0.77137613 1 1
#> 2 0.02659437 0 0.47561600 1 0
#> 3 -1.51655310 0 0.08392365 1 0
regimes[1:3, , ]
#> , , 1
#>
#> [,1] [,2]
#> [1,] 0 1
#> [2,] 0 1
#> [3,] 0 1
#>
#> , , 2
#>
#> [,1] [,2]
#> [1,] 0 1
#> [2,] 0 1
#> [3,] 0 0
#>
#> , , 3
#>
#> [,1] [,2]
#> [1,] 0 1
#> [2,] 0 0
#> [3,] 0 0
#>
#> , , 4
#>
#> [,1] [,2]
#> [1,] 0 1
#> [2,] 0 0
#> [3,] 0 0
#>
#> , , 5
#>
#> [,1] [,2]
#> [1,] 0 0
#> [2,] 0 0
#> [3,] 0 0
#>
#> , , 6
#>
#> [,1] [,2]
#> [1,] 1 1
#> [2,] 1 1
#> [3,] 1 1
#>
#> , , 7
#>
#> [,1] [,2]
#> [1,] 1 1
#> [2,] 1 1
#> [3,] 1 0
#>
#> , , 8
#>
#> [,1] [,2]
#> [1,] 1 1
#> [2,] 1 0
#> [3,] 1 0
#>
#> , , 9
#>
#> [,1] [,2]
#> [1,] 1 1
#> [2,] 1 0
#> [3,] 1 0
#>
#> , , 10
#>
#> [,1] [,2]
#> [1,] 1 0
#> [2,] 1 0
#> [3,] 1 0
working.msm <- "Y ~ a1*theta"
summary(ltmleMSM(data, Anodes = c("A1", "A2"), Lnodes = "L", Ynodes = "Y",
regimes = regimes, summary.measures = summary.measures,
working.msm = working.msm))
#> Qform not specified, using defaults:
#> formula for L:
#> Q.kplus1 ~ W + A1
#> formula for Y:
#> Q.kplus1 ~ W + A1 + L + A2
#>
#> gform not specified, using defaults:
#> formula for A1:
#> A1 ~ W
#> formula for A2:
#> A2 ~ W + A1 + L
#>
#> Estimate of time to completion: 2 to 3 minutes
#> Estimator: tmle
#> Estimate Std. Error CI 2.5% CI 97.5% p-value
#> (Intercept) 0.531670 0.050294 0.433095 0.630 <2e-16 ***
#> a1 1.039499 0.074409 0.893660 1.185 <2e-16 ***
#> theta -1.817514 0.071766 -1.958172 -1.677 <2e-16 ***
#> a1:theta -0.006772 0.110068 -0.222502 0.209 0.951
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Let’s compare to the true coefficients of the MSM. First we find the true value of \(E[Y_{a1, theta}]\) for 5 values of theta.
truth <- rep(NA_real_, 10)
cnt <- 0
for (a1 in 0:1) {
for (theta.index in 1:5) {
cnt <- cnt + 1
truth[cnt] <- GenerateData(n = 1e6,
abar = list(a1 = a1, theta = theta.set[theta.index]))
}
}
Fit a working MSM to the true values of \(E[Y_{a1, theta}]\).
m.true <- glm(working.msm,
data = data.frame(Y = truth, summary.measures[, , 1]),
family = "quasibinomial")
m.true
#>
#> Call: glm(formula = working.msm, family = "quasibinomial", data = data.frame(Y = truth,
#> summary.measures[, , 1]))
#>
#> Coefficients:
#> (Intercept) a1 theta a1:theta
#> 0.51497 0.95904 -1.76125 -0.02284
#>
#> Degrees of Freedom: 9 Total (i.e. Null); 6 Residual
#> Null Deviance: 1.341
#> Residual Deviance: 0.02342 AIC: NA
The estimated MSM coefficients are close to the true MSM coefficients.