Marginal Structural Models - ltmleMSM()

Joshua Schwab

2020-03-13

Marginal Structural Models (MSMs) - Multiple regimes with a single outcome

In this example there are 5 time points with a treatment and time varying covariate. At each, treatment can be 0 or 1. There is a single outcome Y.

L_0 L_1 A_1 L_2 A_2 … L_5 A_5 Y

There are 2^5 = 32 regimes of interest. Some of these may have limited support because there are few patients who follow a particular regime. We pool over all regimes using a working marginal structural model. In this example we want to know the effect of time on treatment on Y. We include time.on.treatment and time.on.treatment^2.

rexpit <- function(x) rbinom(n=length(x), size=1, prob=plogis(x))
n <- 5000
time.points <- 5
prev.L <- rnorm(n)
prev.A <- rep(0, n)
sum.A <- rep(0, n)
data <- data.frame(L_0 = prev.L)
for (t in 1:time.points) {
  L <- 0.1 * prev.L + 0.3 * prev.A + rnorm(n)
  A <- rexpit(L)
  
  data1 <- data.frame(L, A)
  names(data1) <- paste0(c("L_", "A_"), t)
  data <- cbind(data, data1)
  
  prev.A <- A
  prev.L <- L
  
  sum.A <- sum.A + A
}
data$Y <- rexpit(sum.A / time.points + L)
head(data)
#>          L_0        L_1 A_1        L_2 A_2         L_3 A_3         L_4 A_4
#> 1 -0.7480303 -0.1865826   0  0.4469474   1  0.12149207   1 -1.76985617   0
#> 2  0.6557836 -0.4355446   1  0.8469899   0 -1.07773708   0 -0.98969343   0
#> 3  1.2270065 -0.9697197   0 -1.6375742   0  0.03251294   1  0.07680317   0
#> 4 -1.6460771 -1.7278791   0 -0.5303850   0 -1.25567621   0  0.04886727   1
#> 5  0.2744846 -1.1419435   0  0.9679383   1 -1.49024538   0  0.15774825   1
#> 6 -2.0300750 -0.1996350   0  0.8142871   1  0.82830073   0 -0.87578580   0
#>          L_5 A_5 Y
#> 1  0.7097768   1 1
#> 2  1.4101966   0 1
#> 3 -0.6878611   0 0
#> 4 -0.6242420   0 0
#> 5  1.5856941   0 1
#> 6 -0.2540231   0 0
regime.matrix <- as.matrix(expand.grid(rep(list(0:1), time.points)))
dim(regime.matrix)
#> [1] 32  5
head(regime.matrix, 20)
#>       Var1 Var2 Var3 Var4 Var5
#>  [1,]    0    0    0    0    0
#>  [2,]    1    0    0    0    0
#>  [3,]    0    1    0    0    0
#>  [4,]    1    1    0    0    0
#>  [5,]    0    0    1    0    0
#>  [6,]    1    0    1    0    0
#>  [7,]    0    1    1    0    0
#>  [8,]    1    1    1    0    0
#>  [9,]    0    0    0    1    0
#> [10,]    1    0    0    1    0
#> [11,]    0    1    0    1    0
#> [12,]    1    1    0    1    0
#> [13,]    0    0    1    1    0
#> [14,]    1    0    1    1    0
#> [15,]    0    1    1    1    0
#> [16,]    1    1    1    1    0
#> [17,]    0    0    0    0    1
#> [18,]    1    0    0    0    1
#> [19,]    0    1    0    0    1
#> [20,]    1    1    0    0    1
num.regimes <- 2^time.points
regimes <- array(dim = c(n, time.points, num.regimes)) #n x numAnodes x numRegimes = n x time.points x 2^time.points
summary.measures <- array(dim = c(num.regimes, 1, 1)) #numRegimes x num.summary.measures x num.final.Ynodes = 2^time.points x 1 x 1
for (i in 1:num.regimes) {
  regimes[, , i] <- matrix(regime.matrix[i, ], byrow = TRUE, nrow = n, ncol = time.points)
  summary.measures[i, 1, 1] <- sum(regime.matrix[i, ])
}
colnames(summary.measures) <- "time.on.treatment"
regimes[1:3, , 1:3]
#> , , 1
#> 
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    0    0    0    0    0
#> [2,]    0    0    0    0    0
#> [3,]    0    0    0    0    0
#> 
#> , , 2
#> 
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    1    0    0    0    0
#> [2,]    1    0    0    0    0
#> [3,]    1    0    0    0    0
#> 
#> , , 3
#> 
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    0    1    0    0    0
#> [2,]    0    1    0    0    0
#> [3,]    0    1    0    0    0
summary.measures
#> , , 1
#> 
#>       time.on.treatment
#>  [1,]                 0
#>  [2,]                 1
#>  [3,]                 1
#>  [4,]                 2
#>  [5,]                 1
#>  [6,]                 2
#>  [7,]                 2
#>  [8,]                 3
#>  [9,]                 1
#> [10,]                 2
#> [11,]                 2
#> [12,]                 3
#> [13,]                 2
#> [14,]                 3
#> [15,]                 3
#> [16,]                 4
#> [17,]                 1
#> [18,]                 2
#> [19,]                 2
#> [20,]                 3
#> [21,]                 2
#> [22,]                 3
#> [23,]                 3
#> [24,]                 4
#> [25,]                 2
#> [26,]                 3
#> [27,]                 3
#> [28,]                 4
#> [29,]                 3
#> [30,]                 4
#> [31,]                 4
#> [32,]                 5

For large numbers of regimes, variance.method = "ic" is much faster, but may give anticonservative confidence intervals. You may want to use variance.method = "ic" first to make sure the MSM coefficients look reasonable and then use variance.method = "tmle" (the default) for your final estimates.

result1 <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points), 
                    Lnodes = paste0("L_", 0:time.points), Ynodes = "Y", 
                    regimes = regimes, summary.measures = summary.measures, 
                    working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)",
                    variance.method = "ic")
#> Qform not specified, using defaults:
#> formula for L_2:
#> Q.kplus1 ~ L_0 + L_1 + A_1
#> formula for L_3:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2
#> formula for L_4:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3
#> formula for L_5:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4
#> formula for Y:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 +     L_5 + A_5
#> 
#> gform not specified, using defaults:
#> formula for A_1:
#> A_1 ~ L_0 + L_1
#> formula for A_2:
#> A_2 ~ L_0 + L_1 + A_1 + L_2
#> formula for A_3:
#> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3
#> formula for A_4:
#> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4
#> formula for A_5:
#> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5
#> 
#> Estimate of time to completion: 1 minute
#> Warning in CheckForVarianceWarning(inputs, g.ratio): Variance estimate is
#> based on influence curve only, which may be significantly anticonservative
#> because your data appears to contain positivity violations. It is recommended
#> to use variance.method='tmle' or variance.method='iptw' to obtain a more
#> robust variance estimate (but run time may be significantly longer). See
#> variance.method details in ?ltmle
result2 <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points), 
                    Lnodes = paste0("L_", 0:time.points), Ynodes = "Y", 
                    regimes = regimes, summary.measures = summary.measures, 
                    working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)")
#> Qform not specified, using defaults:
#> formula for L_2:
#> Q.kplus1 ~ L_0 + L_1 + A_1
#> formula for L_3:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2
#> formula for L_4:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3
#> formula for L_5:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4
#> formula for Y:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 +     L_5 + A_5
#> 
#> gform not specified, using defaults:
#> formula for A_1:
#> A_1 ~ L_0 + L_1
#> formula for A_2:
#> A_2 ~ L_0 + L_1 + A_1 + L_2
#> formula for A_3:
#> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3
#> formula for A_4:
#> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4
#> formula for A_5:
#> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5
#> 
#> Estimate of time to completion: 4 to 16 minutes
summary(result1)
#> Estimator:  tmle 
#>                        Estimate Std. Error  CI 2.5% CI 97.5% p-value
#> (Intercept)             0.11565    0.13593 -0.15078    0.382   0.395
#> time.on.treatment       0.08999    0.11442 -0.13428    0.314   0.432
#> I(time.on.treatment^2)  0.03304    0.02284 -0.01171    0.078   0.148
summary(result2)
#> Estimator:  tmle 
#>                        Estimate Std. Error  CI 2.5% CI 97.5% p-value
#> (Intercept)             0.11565    0.18547 -0.24786    0.479   0.533
#> time.on.treatment       0.08999    0.14958 -0.20318    0.383   0.547
#> I(time.on.treatment^2)  0.03304    0.02862 -0.02304    0.089   0.248

Suppose we are only interested in the effect of time on treatment on Y considering regimes that include at least 3 periods on treatment.

at.least.3 <- summary.measures[, "time.on.treatment", 1] >= 3
regimes.3 <- regimes[, , at.least.3]
summary.measures.3 <- summary.measures[at.least.3, , , drop = F]
dim(regimes.3)
#> [1] 5000    5   16
summary.measures.3
#> , , 1
#> 
#>       time.on.treatment
#>  [1,]                 3
#>  [2,]                 3
#>  [3,]                 3
#>  [4,]                 3
#>  [5,]                 4
#>  [6,]                 3
#>  [7,]                 3
#>  [8,]                 3
#>  [9,]                 4
#> [10,]                 3
#> [11,]                 3
#> [12,]                 4
#> [13,]                 3
#> [14,]                 4
#> [15,]                 4
#> [16,]                 5

result <- ltmleMSM(data, Anodes = paste0("A_", 1:time.points), 
                   Lnodes = paste0("L_", 0:time.points), 
                   Ynodes = "Y", regimes = regimes.3, 
                   summary.measures = summary.measures.3, 
                   working.msm = "Y ~ time.on.treatment + I(time.on.treatment^2)")
#> Qform not specified, using defaults:
#> formula for L_2:
#> Q.kplus1 ~ L_0 + L_1 + A_1
#> formula for L_3:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2
#> formula for L_4:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3
#> formula for L_5:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4
#> formula for Y:
#> Q.kplus1 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 +     L_5 + A_5
#> 
#> gform not specified, using defaults:
#> formula for A_1:
#> A_1 ~ L_0 + L_1
#> formula for A_2:
#> A_2 ~ L_0 + L_1 + A_1 + L_2
#> formula for A_3:
#> A_3 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3
#> formula for A_4:
#> A_4 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4
#> formula for A_5:
#> A_5 ~ L_0 + L_1 + A_1 + L_2 + A_2 + L_3 + A_3 + L_4 + A_4 + L_5
#> 
#> Estimate of time to completion: 3 to 8 minutes
summary(result)
#> Estimator:  tmle 
#>                        Estimate Std. Error  CI 2.5% CI 97.5% p-value
#> (Intercept)            -0.47416    2.20574 -4.79733    3.849   0.830
#> time.on.treatment       0.51495    1.20690 -1.85053    2.880   0.670
#> I(time.on.treatment^2) -0.03434    0.16045 -0.34882    0.280   0.831

Marginal Structural Models (MSMs) - Switching time example - Multiple regimes and outcomes

Given data over 3 time points where A switches to 1 once and then stays 1. We want to know how death varies as a function of gender, time and an indicator of whether a patient’s intended regime was to switch before time. Note that working.msm includes time and switch.time, which are columns of summary.measures; working.msm also includes male, which is ok because it is a baseline covariate (it comes before any A/C/L/Y nodes).

data(sampleDataForLtmleMSM)
head(sampleDataForLtmleMSM$data, 20)
#>    male age CD4_0 A0 Y1 CD4_1 A1 Y2 CD4_2 A2 Y3
#> 1     1  33   347  0  0   349  0  0   315  0  0
#> 2     0  18   277  0  0   302  0  0   300  0  0
#> 3     1  33   419  0  0   423  0  0   462  0  0
#> 4     1  35   318  1  0   358  1  0   413  1  0
#> 5     0  27   145  0  0   134  1  1    NA NA  1
#> 6     1  27   320  0  0   332  0  0   347  1  0
#> 7     1  35   220  0  0   241  0  0   216  1  0
#> 8     0  31   184  0  0   202  1  0   230  1  0
#> 9     0  35   289  0  0   302  0  0   290  0  0
#> 10    0  30   295  1  0   312  1  0   340  1  0
#> 11    0  31   300  1  0   394  1  0   467  1  0
#> 12    0  30   300  0  0   331  0  0   320  0  0
#> 13    0  25   276  0  1    NA NA  1    NA NA  1
#> 14    1  26   242  1  0   280  1  0   307  1  0
#> 15    0  21   238  1  0   345  1  0   379  1  0
#> 16    1  30   304  0  0   258  0  0   287  0  0
#> 17    0  30   271  1  0   297  1  0   324  1  0
#> 18    1  27   296  0  0   305  0  0   306  0  0
#> 19    1  33   217  1  0   242  1  0   267  1  0
#> 20    1  25   337  0  0   360  0  0   390  0  0
dim(sampleDataForLtmleMSM$regimes)
#> [1] 200   3   4
sampleDataForLtmleMSM$regimes[1:5, , ]
#> , , 1
#> 
#>      [,1] [,2] [,3]
#> [1,]    1    1    1
#> [2,]    1    1    1
#> [3,]    1    1    1
#> [4,]    1    1    1
#> [5,]    1    1    1
#> 
#> , , 2
#> 
#>      [,1] [,2] [,3]
#> [1,]    0    1    1
#> [2,]    0    1    1
#> [3,]    0    1    1
#> [4,]    0    1    1
#> [5,]    0    1    1
#> 
#> , , 3
#> 
#>      [,1] [,2] [,3]
#> [1,]    0    0    1
#> [2,]    0    0    1
#> [3,]    0    0    1
#> [4,]    0    0    1
#> [5,]    0    0    1
#> 
#> , , 4
#> 
#>      [,1] [,2] [,3]
#> [1,]    0    0    0
#> [2,]    0    0    0
#> [3,]    0    0    0
#> [4,]    0    0    0
#> [5,]    0    0    0
sampleDataForLtmleMSM$summary.measures
#> , , 1
#> 
#>      switch.time time
#> [1,]           0    1
#> [2,]           1    1
#> [3,]           2    1
#> [4,]           3    1
#> 
#> , , 2
#> 
#>      switch.time time
#> [1,]           0    2
#> [2,]           1    2
#> [3,]           2    2
#> [4,]           3    2
#> 
#> , , 3
#> 
#>      switch.time time
#> [1,]           0    3
#> [2,]           1    3
#> [3,]           2    3
#> [4,]           3    3
Anodes <- c("A0", "A1", "A2")
Lnodes <- c("CD4_1", "CD4_2")
Ynodes <- c("Y1", "Y2", "Y3")

Here msm.weights is just an example. It could also be a 200x3x4 array or NULL (for no weights), or "empirical" (the default).

msm.weights <- matrix(1:12, nrow=4, ncol=3)  
result.regimes <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes, 
                   Lnodes=Lnodes, Ynodes=Ynodes, 
                   survivalOutcome=TRUE,
                   regimes=sampleDataForLtmleMSM$regimes, 
                   summary.measures=sampleDataForLtmleMSM$summary.measures,
                   final.Ynodes=Ynodes, 
                   working.msm="Y ~ male + time + pmax(time - switch.time, 0)", 
                   msm.weights=msm.weights, estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#> 
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#> 
print(summary(result.regimes))
#> Estimator:  tmle 
#>                             Estimate Std. Error CI 2.5% CI 97.5%  p-value    
#> (Intercept)                  -3.4059     0.6545 -4.6886   -2.123 1.95e-07 ***
#> male                         -0.3802     0.5305 -1.4198    0.660  0.47359    
#> time                          0.7241     0.2602  0.2141    1.234  0.00539 ** 
#> pmax(time - switch.time, 0)  -0.4717     0.2019 -0.8675   -0.076  0.01948 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

regimes can also be specified as a list of rule functions where each rule is a function applied to each row of data which returns a numeric vector of the same length as Anodes.

regimesList <- list(function(row) c(1,1,1),
                     function(row) c(0,1,1),
                     function(row) c(0,0,1),
                     function(row) c(0,0,0))
result.regList <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes, 
                   Lnodes=Lnodes, Ynodes=Ynodes, 
                   survivalOutcome=TRUE, regimes=regimesList, 
                   summary.measures=sampleDataForLtmleMSM$summary.measures,
                   final.Ynodes=Ynodes, 
                   working.msm="Y ~ male + time + pmax(time - switch.time, 0)", 
                   msm.weights=msm.weights, estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#> 
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#> 

This should be the same as result.regimes:

print(summary(result.regList))         
#> Estimator:  tmle 
#>                             Estimate Std. Error CI 2.5% CI 97.5%  p-value    
#> (Intercept)                  -3.4059     0.6545 -4.6886   -2.123 1.95e-07 ***
#> male                         -0.3802     0.5305 -1.4198    0.660  0.47359    
#> time                          0.7241     0.2602  0.2141    1.234  0.00539 ** 
#> pmax(time - switch.time, 0)  -0.4717     0.2019 -0.8675   -0.076  0.01948 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Suppose we are only interested in pooling over the result at Y1 and Y3.

result <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes, 
                   Lnodes=Lnodes, Ynodes=Ynodes, 
                   survivalOutcome=TRUE,
                   regimes=sampleDataForLtmleMSM$regimes, 
                   summary.measures=sampleDataForLtmleMSM$summary.measures[, , c(1, 3)],
                   final.Ynodes=c("Y1", "Y3"), 
                   working.msm="Y ~ male + time + pmax(time - switch.time, 0)", 
                   estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#> 
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#> 
#> speedglm failed, using glm instead. If you see a lot of this message and you have large absolute values in data[, Lnodes], you may get a speed performance improvement by rescaling these values.
summary(result)
#> Estimator:  tmle 
#>                             Estimate Std. Error CI 2.5% CI 97.5%  p-value    
#> (Intercept)                  -3.8891     0.5533 -4.9736   -2.805 2.08e-12 ***
#> male                          0.2256     0.5152 -0.7842    1.235 0.661472    
#> time                          0.7826     0.2318  0.3282    1.237 0.000737 ***
#> pmax(time - switch.time, 0)  -0.4331     0.2023 -0.8296   -0.037 0.032262 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We could be only interested in the result at Y3. time is now a constant in working.msm, so let’s remove it.

result <- ltmleMSM(sampleDataForLtmleMSM$data, Anodes=Anodes, 
                    Lnodes=Lnodes, Ynodes=Ynodes, 
                    survivalOutcome=TRUE,
                    regimes=sampleDataForLtmleMSM$regimes, 
                    summary.measures=sampleDataForLtmleMSM$summary.measures[, , 3],
                    final.Ynodes="Y3", 
                    working.msm="Y ~ male + switch.time", 
                    estimate.time=FALSE)
#> Qform not specified, using defaults:
#> formula for Y1:
#> Q.kplus1 ~ male + age + CD4_0 + A0
#> formula for Y2:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1
#> formula for Y3:
#> Q.kplus1 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2 + A2
#> 
#> gform not specified, using defaults:
#> formula for A0:
#> A0 ~ male + age + CD4_0
#> formula for A1:
#> A1 ~ male + age + CD4_0 + A0 + CD4_1
#> formula for A2:
#> A2 ~ male + age + CD4_0 + A0 + CD4_1 + A1 + CD4_2
#> 
#> speedglm failed, using glm instead. If you see a lot of this message and you have large absolute values in data[, Lnodes], you may get a speed performance improvement by rescaling these values.
summary(result)
#> Estimator:  tmle 
#>             Estimate Std. Error  CI 2.5% CI 97.5%  p-value    
#> (Intercept) -2.81063    0.46551 -3.72301   -1.898 1.56e-09 ***
#> male         0.07357    0.53775 -0.98039    1.128   0.8912    
#> switch.time  0.45362    0.19436  0.07268    0.835   0.0196 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Marginal Structural Models (MSMs) - Dynamic Treatment

In this example there are two treatment nodes and one outcome: W A1 L A2 Y W is normally distributed and L is continuous in (0, 1). We are interested in treatments where A1 is set to either 0 or 1 and A2 is set dynamically. The treatment for A2 is indexed by theta between 0 and 1. If \(L > theta\), set A2 to 1, otherwise set A2 to 0.
Here is a function that can be used to generate observed data (if abar = NULL) or generate counterfactual truth (if abar is a list with a1 and theta):

GenerateData <- function(n, abar = NULL) {
  W <- rnorm(n)
  if (is.null(abar)) {
    A1 <- rexpit(W)
  } else {
    A1 <- abar$a1
  }
  L <- plogis(rnorm(n) + 0.3 * W + 0.5 * A1)
  if (is.null(abar)) {
    A2 <- rexpit(-0.5 * W + A1 - 0.6 * L)
  } else {
    A2 <- as.integer(L > abar$theta)
  }
  Y <- rexpit(-2 + W + A1 + L + 2 * A2)
  if (is.null(abar)) {
    return(data.frame(W, A1, L, A2, Y))
  } else {
    return(mean(Y))
  }
}

Set up regimes and summary.measures:

set.seed(11)
n <- 10000
data <- GenerateData(n)
regimes <- array(dim = c(n, 2, 10)) #n x num.Anodes x num.regimes
theta.set <- seq(0, 1, length.out = 5)
summary.measures <- array(theta.set, dim = c(10, 2, 1))
colnames(summary.measures) <- c("a1", "theta")
cnt <- 0
for (a1 in 0:1) {
  for (theta.index in 1:5) {
    cnt <- cnt + 1
    regimes[, 1, cnt] <- a1
    regimes[, 2, cnt] <- data$L > theta.set[theta.index]
    summary.measures[cnt, , 1] <- c(a1, theta.set[theta.index])
  }
}
summary.measures
#> , , 1
#> 
#>       a1 theta
#>  [1,]  0  0.00
#>  [2,]  0  0.25
#>  [3,]  0  0.50
#>  [4,]  0  0.75
#>  [5,]  0  1.00
#>  [6,]  1  0.00
#>  [7,]  1  0.25
#>  [8,]  1  0.50
#>  [9,]  1  0.75
#> [10,]  1  1.00
head(data, 3)
#>             W A1          L A2 Y
#> 1 -0.59103110  1 0.77137613  1 1
#> 2  0.02659437  0 0.47561600  1 0
#> 3 -1.51655310  0 0.08392365  1 0
regimes[1:3, , ]
#> , , 1
#> 
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    1
#> [3,]    0    1
#> 
#> , , 2
#> 
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    1
#> [3,]    0    0
#> 
#> , , 3
#> 
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    0
#> [3,]    0    0
#> 
#> , , 4
#> 
#>      [,1] [,2]
#> [1,]    0    1
#> [2,]    0    0
#> [3,]    0    0
#> 
#> , , 5
#> 
#>      [,1] [,2]
#> [1,]    0    0
#> [2,]    0    0
#> [3,]    0    0
#> 
#> , , 6
#> 
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    1
#> [3,]    1    1
#> 
#> , , 7
#> 
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    1
#> [3,]    1    0
#> 
#> , , 8
#> 
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    0
#> [3,]    1    0
#> 
#> , , 9
#> 
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    0
#> [3,]    1    0
#> 
#> , , 10
#> 
#>      [,1] [,2]
#> [1,]    1    0
#> [2,]    1    0
#> [3,]    1    0
working.msm <- "Y ~ a1*theta"
summary(ltmleMSM(data, Anodes = c("A1", "A2"), Lnodes = "L", Ynodes = "Y", 
              regimes = regimes, summary.measures = summary.measures, 
              working.msm = working.msm))
#> Qform not specified, using defaults:
#> formula for L:
#> Q.kplus1 ~ W + A1
#> formula for Y:
#> Q.kplus1 ~ W + A1 + L + A2
#> 
#> gform not specified, using defaults:
#> formula for A1:
#> A1 ~ W
#> formula for A2:
#> A2 ~ W + A1 + L
#> 
#> Estimate of time to completion: 2 to 3 minutes
#> Estimator:  tmle 
#>              Estimate Std. Error   CI 2.5% CI 97.5% p-value    
#> (Intercept)  0.531670   0.050294  0.433095    0.630  <2e-16 ***
#> a1           1.039499   0.074409  0.893660    1.185  <2e-16 ***
#> theta       -1.817514   0.071766 -1.958172   -1.677  <2e-16 ***
#> a1:theta    -0.006772   0.110068 -0.222502    0.209   0.951    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s compare to the true coefficients of the MSM. First we find the true value of \(E[Y_{a1, theta}]\) for 5 values of theta.

truth <- rep(NA_real_, 10)
cnt <- 0
for (a1 in 0:1) {
  for (theta.index in 1:5) {
    cnt <- cnt + 1
    truth[cnt] <- GenerateData(n = 1e6, 
                    abar = list(a1 = a1, theta = theta.set[theta.index]))
  }
}

Fit a working MSM to the true values of \(E[Y_{a1, theta}]\).

m.true <- glm(working.msm, 
              data = data.frame(Y = truth, summary.measures[, , 1]), 
              family = "quasibinomial")
m.true
#> 
#> Call:  glm(formula = working.msm, family = "quasibinomial", data = data.frame(Y = truth, 
#>     summary.measures[, , 1]))
#> 
#> Coefficients:
#> (Intercept)           a1        theta     a1:theta  
#>     0.51497      0.95904     -1.76125     -0.02284  
#> 
#> Degrees of Freedom: 9 Total (i.e. Null);  6 Residual
#> Null Deviance:       1.341 
#> Residual Deviance: 0.02342   AIC: NA

The estimated MSM coefficients are close to the true MSM coefficients.