Install the package
## on CRAN
## install.packages("Compack")
library(Compack)
Regression with functional compositional predictors
Generate data
library(Compack)
df_beta = 5   # degrees of freedom of the spline basis for the coefficient curves
p = 30        # number of compositional components
beta_C_true = matrix(0, nrow = p, ncol = df_beta)  # true spline coefficients, p x df_beta
beta_C_true[1, ] <- c(-0.5, -0.5, -0.5, -1, -1)
beta_C_true[2, ] <- c(0.8, 0.8, 0.7, 0.6, 0.6)
beta_C_true[3, ] <- c(-0.8, -0.8, 0.4, 1, 1)
beta_C_true[4, ] <- c(0.5, 0.5, -0.6, -0.6, -0.6)
n_train = 50  # training sample size
n_test = 30   # test sample size
Data <- Fcomp_Model(n = n_train, p = p, m = 0, intercept = TRUE,
SNR = 4, sigma = 3, rho_X = 0.6, rho_T = 0.2,
df_beta = df_beta, n_T = 20, obs_spar = 1, theta.add = FALSE,
beta_C = as.vector(t(beta_C_true)))
arg_list <- as.list(Data$call)[-1]
arg_list$n <- n_test
Test <- do.call(Fcomp_Model, arg_list)
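As a quick sanity check, the components of the simulated object used below can be inspected; this is a minimal sketch assuming only that Data$data is the list (y, Comp, Zc, intercept) accessed throughout this section.
## peek at the simulated training data
str(Data$data, max.level = 1)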
Sparse log-contrast regression with functional compositional predictors
m1 <- FuncompCGL(y = Data$data$y, X = Data$data$Comp,
Zc = Data$data$Zc, intercept = Data$data$intercept,
k = df_beta)
plot(m1, xlab = "-log")
betamat <- coef(m1)
predmat <- predict(m1, Data$data$Comp, Data$data$Zc)
Cross-Validation tuned model
Linearly constrained group lasso criterion: CGL
k_list = c(4,5,6)
nfolds = 10
foldid <- sample(rep(seq(nfolds), length = n_train))
## cv_cgl: Constrained group lasso
cv_cgl <- cv.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
Zc = Data$data$Zc, intercept = Data$data$intercept,
k = k_list, foldid = foldid,
keep = TRUE)
plot(cv_cgl, k = k_list)
cv_cgl$Ftrim[c("lam.min", "lam.1se")]
## $lam.min
## lam df
## 0.005003941 4.000000000
##
## $lam.1se
## lam df
## 0.01203433 4.00000000
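Coefficients at the more conservative lam.1se choice can be extracted in the same way; the sketch below assumes coef() accepts s = "lam.1se" analogously to s = "lam.min".
## coefficients and number of selected groups under the 1se rule (assumed s = "lam.1se" option)
beta_1se <- coef(cv_cgl, trim = FALSE, s = "lam.1se")
k_1se <- cv_cgl$Ftrim$lam.1se['df']
sum(apply(matrix(beta_1se[1:(p * k_1se)], byrow = TRUE, nrow = p),
          1, function(x) max(abs(x)) > 0))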
beta <- coef(cv_cgl, trim = FALSE, s = "lam.min")
k_opt <- cv_cgl$Ftrim$lam.min['df']
plot(cv_cgl$FuncompCGL.fit[[as.character(k_opt)]])
## m1 is reused here as the number of unpenalized covariates: columns of Zc plus the intercept
m1 <- ifelse(is.null(ncol(Data$data$Zc)), 0, ncol(Data$data$Zc))
m1 <- m1 + Data$data$intercept
if(k_opt == df_beta) {
plot(Data$beta, col = "red", pch = 19,
ylim = range(c(range(Data$beta), range(beta))))
abline(v= seq(from = 0, to = (p*df_beta), by = df_beta ))
abline(h = 0)
points(beta)
if(m1 > 0) points(p*df_beta + 1:m1, tail(Data$beta, m1),
col = "blue", pch = 19)
} else {
plot(beta, ylim = range(c(range(Data$beta), range(beta))) )
abline(v= seq(from = 0, to = (p*k_opt), by = k_opt ))
abline(h = 0, col = "red")
if(m1 > 0) points(p*k_opt + 1:m1, tail(Data$beta, m1),
col = "blue", pch = 19)
}
beta_C <- matrix(beta[1:(p*k_opt)], byrow = TRUE, nrow = p)
## satisfies zero-sum constraints
cat("colSums:", colSums(beta_C))
## colSums: 4.167479e-09 -5.502439e-09 -9.87453e-09 -5.177147e-09
Nonzero <- (1:p)[apply(beta_C, 1, function(x) max(abs(x)) >0)]
cat("selected groups:", Nonzero)
## selected groups: 1 2 3 4 6 7 8 9 10 12 14 15 21 25 27 28 30
sseq <- Data$basis.info[, 1]
beta_curve_true <- Data$basis.info[, -1] %*% t(beta_C_true)
Nonzero_true <- (1:p)[apply(beta_C_true, 1, function(x) max(abs(x)) >0)]
matplot(sseq, beta_curve_true, type = "l", ylim = range(beta_curve_true),
ylab = "True coeffcients curves", xlab = "TIME")
abline(a = 0, b = 0, col = "grey", lwd = 2)
text(0, beta_curve_true[1, Nonzero_true], labels = Nonzero_true)
beta_curve <- splines::bs(sseq, df = k_opt, intercept = TRUE) %*% t(beta_C)
matplot(sseq, beta_curve, type = "l", ylim = range(beta_curve_true),
ylab = "Estimated coefficient curves", xlab = "TIME")
abline(a = 0, b = 0, col = "grey", lwd = 2)
text(0, beta_curve[1, Nonzero], labels = Nonzero)
## apply a threshold for variable selection based on the cross-validation model
## example: cut by the average L2-norm of the estimated coefficient curves
## (the L2-norms below are approximated by the trapezoidal rule on the observation grid)
Curve_L2 <- colSums(beta_curve^2)
Curve_L2 <- Curve_L2 - colSums(beta_curve[c(1, nrow(beta_curve)), ]^2) / 2
Curve_L2 <- Curve_L2 * (Data$basis.info[2,1] - Data$basis.info[1,1])
Curve_L2 <- sqrt(Curve_L2)
plot(Curve_L2, xlab = "Component index", ylab = "L2-norm for coefficient curves")
cutoff <- sum(Curve_L2) / p
Nonzero_cut <- (1:p)[which(Curve_L2 >= cutoff)]
cat("selected groups after thresholding cut-off:", Nonzero_cut)
## selected groups after thresholding cut-off: 1 2 3 4
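Because the data are simulated, the thresholded selection can be compared with the true active set (the nonzero rows of beta_C_true, i.e. components 1-4); a small check:
## false positives and false negatives after thresholding
setdiff(Nonzero_cut, Nonzero_true)
setdiff(Nonzero_true, Nonzero_cut)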
y_hat <- predict(cv_cgl, Data$data$Comp, Data$data$Zc, s = "lam.min")
MSE <- sum((drop(Data$data$y) - y_hat)^2) / n_train
y_hat <- predict(cv_cgl, Test$data$Comp, Test$data$Zc, s = "lam.min")
PRE <- sum((drop(Test$data$y) - y_hat)^2) / n_test
cgl_result <- list(cv.result = cv_cgl, beta = beta,
Nonzero = c("Original" = Nonzero, "Cut" = Nonzero_cut),
MSE = MSE, PRE = PRE)
Ignoring the unit-sum constraint of compositional data: naive
## set mu_ratio = 0 to fit without the linear constraint,
## i.e., no outer loop for the augmented Lagrangian multiplier
cv_naive <- cv.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
Zc = Data$data$Zc, intercept = Data$data$intercept,
k = k_list, foldid = foldid, keep = TRUE,
mu_ratio = 0)
plot(cv_naive, k = k_list)
beta <- coef(cv_naive, trim = FALSE, s = "lam.min")
k_opt <- cv_naive$Ftrim$lam.min['df']
beta_C <- matrix(beta[1:(p*k_opt)], byrow = TRUE, nrow = p)
## does NOT satisfy zero-sum constraints
cat("colSums:", colSums(beta_C))
## colSums: -0.2672194 -0.1827702 0.1566777 -0.2521267
Nonzero <- (1:p)[apply(beta_C, 1, function(x) max(abs(x)) >0)]
beta_curve <- splines::bs(sseq, df = k_opt, intercept = TRUE) %*% t(beta_C)
Curve_L2 <- colSums(beta_curve^2) - colSums(beta_curve[c(1, nrow(beta_curve)), ]^2) / 2
Curve_L2 <- sqrt(Curve_L2 * (Data$basis.info[2,1] - Data$basis.info[1,1]))
cutoff <- sum(Curve_L2) / p
Nonzero_cut <- (1:p)[which(Curve_L2 >= cutoff)]
MSE <- sum((drop(Data$data$y) - predict(cv_naive, Data$data$Comp, Data$data$Zc, s = "lam.min"))^2) / n_train
PRE <- sum((drop(Test$data$y) - predict(cv_naive, Test$data$Comp, Test$data$Zc, s = "lam.min"))^2) / n_test
naive_result <- list(cv.result = cv_naive, beta = beta,
Nonzero = c("Original" = Nonzero, "Cut" = Nonzero_cut),
MSE = MSE, PRE = PRE)
Randomly select a component of the composition as the reference level
## mu_ratio is set to 0 automatically once ref is set to an integer
ref = sample(1:p, 1)
cv_base <- cv.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
Zc = Data$data$Zc, intercept = Data$data$intercept,
k = k_list, foldid = foldid, keep = TRUE,
ref = ref)
plot(cv_base, k = k_list)
beta <- coef(cv_base, trim = FALSE, s = "lam.min")
k_opt <- cv_base$Ftrim$lam.min['df']
beta_C <- matrix(beta[1:(p*k_opt)], byrow = TRUE, nrow = p)
## satisfies zero-sum constraints
cat("colSums:", colSums(beta_C))
## colSums: -3.469447e-18 2.168404e-18 2.602085e-18 6.938894e-18
Nonzero <- (1:p)[apply(beta_C, 1, function(x) max(abs(x)) >0)]
beta_curve <- splines::bs(sseq, df = k_opt, intercept = TRUE) %*% t(beta_C)
Curve_L2 <- colSums(beta_curve^2) - colSums(beta_curve[c(1, nrow(beta_curve)), ]^2) / 2
Curve_L2 <- sqrt(Curve_L2 * (Data$basis.info[2,1] - Data$basis.info[1,1]))
cutoff <- sum(Curve_L2) / p
Nonzero_cut <- (1:p)[which(Curve_L2 >= cutoff)]
MSE <- sum((drop(Data$data$y) - predict(cv_base, Data$data$Comp, Data$data$Zc, s = "lam.min"))^2) / n_train
PRE <- sum((drop(Test$data$y) - predict(cv_base, Test$data$Comp, Test$data$Zc, s = "lam.min"))^2) / n_test
base_result <- list(cv.result = cv_base, beta = beta,
Nonzero = c("Original" = Nonzero, "Cut" = Nonzero_cut),
MSE = MSE, PRE = PRE)
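With all three cross-validation tuned fits stored, their training and test errors can be compared side by side; a sketch using the result lists assembled above:
## MSE on the training set and PRE on the test set for the CGL, naive and baseline fits
sapply(list(cgl = cgl_result, naive = naive_result, base = base_result),
       function(r) c(MSE = r$MSE, PRE = r$PRE))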
GIC tuned model
Linearly constrained group lasso criterion: CGL
GIC_cgl <- GIC.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
Zc = Data$data$Zc, intercept = Data$data$intercept,
k = k_list)
beta <- coef(GIC_cgl)
plot(GIC_cgl)
y_hat <- predict(GIC_cgl, Znew = Test$data$Comp, Zcnew = Test$data$Zc)
plot(Test$data$y, y_hat, xlab = "Observed response", ylab = "Predicted response")
Ignoring the unit-sum constraint of compositional data: naive
GIC_naive <- GIC.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
Zc = Data$data$Zc, intercept = Data$data$intercept,
k = k_list, mu_ratio = 0)
beta <- coef(GIC_naive)
plot(GIC_naive)
y_hat <- predict(GIC_naive, Znew = Test$data$Comp, Zcnew = Test$data$Zc)
plot(Test$data$y, y_hat, xlab = "Observed response", ylab = "Predicted response")
Randomly select a component of the composition as the reference level
GIC_base <- GIC.FuncompCGL(y = Data$data$y, X = Data$data$Comp,
Zc = Data$data$Zc, intercept = Data$data$intercept,
k = k_list, ref = ref)
beta <- coef(GIC_base)
plot(GIC_base)
y_hat <- predict(GIC_base, Znew = Test$data$Comp, Zcnew = Test$data$Zc)
plot(Test$data$y, y_hat, xlab = "Observed response", ylab = "Predicted response")
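The three GIC-tuned fits can be compared on the test set in the same way; the sketch below recomputes the test prediction errors with the same predict() calls used above.
## test-set prediction error for each GIC-tuned model
sapply(list(cgl = GIC_cgl, naive = GIC_naive, base = GIC_base),
       function(m) {
         yh <- predict(m, Znew = Test$data$Comp, Zcnew = Test$data$Zc)
         sum((drop(Test$data$y) - yh)^2) / n_test
       })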
Regression with compositional predictors
Generate data
library(Compack)
p = 30
n = 50
beta = c(1, -0.8, 0.6, 0, 0, -1.5, -0.5, 1.2)
beta = c(beta, rep(0, times = p - length(beta)))
Comp_data = comp_Model(n = n, p = p, beta = beta, intercept = FALSE)
Comp_data2 = comp_Model(n = n, p = p, beta = Comp_data$beta, intercept = FALSE)
Sparse log-contrast regression with compositional predictors
m1 <- compCL(y = Comp_data$y, Z = Comp_data$X.comp,
Zc = Comp_data$Zc, intercept = Comp_data$intercept)
plot(m1, label = TRUE)
coef(m1)[1:10, 90:100]
## L90 L91 L92 L93 L94 L95
## Z1 0.92406820 0.92547909 0.92393318 0.92468211 0.92482516 0.92499664
## Z2 -0.66133490 -0.66328703 -0.66439024 -0.66621092 -0.66758205 -0.66890518
## Z3 0.58299125 0.58434121 0.58253225 0.58270957 0.58266568 0.58257745
## Z4 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## Z5 0.04906269 0.04925977 0.05387758 0.05507937 0.05690799 0.05856053
## Z6 -1.44762276 -1.44781669 -1.45108388 -1.45185109 -1.45315509 -1.45438880
## Z7 -0.29034027 -0.29063399 -0.29528538 -0.29726352 -0.29908001 -0.30087369
## Z8 1.02544144 1.02813418 1.03215631 1.03528559 1.03798521 1.04059747
## Z9 -0.01258296 -0.01473562 -0.01643933 -0.01842473 -0.01998444 -0.02156272
## Z10 -0.11472875 -0.11498203 -0.11470839 -0.11519857 -0.11511888 -0.11536527
## L96 L97 L98 L99 L100
## Z1 0.92528297 0.92544175 0.92555603 0.92573786 0.92614382
## Z2 -0.67023889 -0.67159139 -0.67288982 -0.67415268 -0.67543773
## Z3 0.58278803 0.58302517 0.58317946 0.58337527 0.58363210
## Z4 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## Z5 0.05989629 0.06128825 0.06265640 0.06387050 0.06490137
## Z6 -1.45558553 -1.45673691 -1.45775937 -1.45876054 -1.46061136
## Z7 -0.30234935 -0.30400968 -0.30576860 -0.30729023 -0.30818420
## Z8 1.04290265 1.04505940 1.04710022 1.04896656 1.05050565
## Z9 -0.02305226 -0.02429200 -0.02541595 -0.02649356 -0.02785752
## Z10 -0.11561523 -0.11568306 -0.11564942 -0.11550258 -0.11480866
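The fitted compositional coefficients should satisfy the zero-sum constraint at every point along the solution path; a quick numerical check on m1 (the value should be essentially zero):
## maximum absolute column sum of the compositional coefficients across the path
max(abs(colSums(coef(m1)[1:p, ])))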
Cross-Validation tuned model
cvm1 <- cv.compCL(y = Comp_data$y, Z = Comp_data$X.comp,
Zc = Comp_data$Zc, intercept = Comp_data$intercept)
plot(cvm1, xlab = "-log")
beta_est <- coef(cvm1, s = "lam.min")
head(beta_est, 10)
## 1
## Z1 0.90048789
## Z2 -0.62012673
## Z3 0.58372632
## Z4 0.00000000
## Z5 0.01756609
## Z6 -1.38814631
## Z7 -0.23387376
## Z8 0.94222496
## Z9 0.00000000
## Z10 -0.09116647
sum(beta_est[1:p]) # satisfies zero-sum constraint
## [1] 5.516289e-09
y_hat <- predict(cvm1, Comp_data2$X.comp, Comp_data2$Zc, s = "lam.min")
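An out-of-sample prediction error on the second simulated data set can be computed from these predictions; a minimal sketch:
## test prediction error for the CV-tuned log-contrast model
sum((drop(Comp_data2$y) - y_hat)^2) / n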
GIC tuned model
GICm1 <- GIC.compCL(y = Comp_data$y, Z = Comp_data$X.comp,
Zc = Comp_data$Zc, intercept = Comp_data$intercept)
plot(GICm1, xlab = "-log")
beta_est <- coef(GICm1, s = "lam.min")
head(beta_est, 10)
## 1
## Z1 0.85898896
## Z2 -0.56310354
## Z3 0.58218204
## Z4 0.00000000
## Z5 0.00000000
## Z6 -1.32214479
## Z7 -0.14017233
## Z8 0.82199822
## Z9 0.00000000
## Z10 -0.08630774
sum(beta_est[1:p]) # satisfies zero-sum constraint
## [1] 6.958372e-09
y_hat <- predict(GICm1, Comp_data2$X.comp, Comp_data2$Zc, s = "lam.min")
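Finally, the CV- and GIC-tuned estimates can be placed next to the true coefficients used to generate the data (first 10 components); a sketch:
## true vs. estimated compositional coefficients
cbind(true = beta[1:10],
      CV   = drop(coef(cvm1, s = "lam.min"))[1:10],
      GIC  = drop(coef(GICm1, s = "lam.min"))[1:10])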