mclustAddons is a contributed R package that extends the functionality available in the mclust package (Scrucca et al. 2016).
In particular, the following methods are included:
- density estimation for data with bounded support;
- modal clustering using the modal EM algorithm for Gaussian mixtures.
This document gives a quick tour of mclustAddons (version 0.6). It was written in R Markdown, using the knitr package for production.
References on the methodologies implemented are provided by citation("mclustAddons"). See also the references listed at the end of this document.
library(mclustAddons)
## Loading required package: mclust
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
# Simulate 200 draws from a chi-squared distribution with 3 degrees of freedom.
x <- rchisq(200, 3)
# Evaluation grid of 1000 points running from -2 up to the largest observation.
xgrid <- seq(-2, max(x), length.out = 1000)
f <- dchisq(xgrid, 3) # true density
# Fit the bounded-support density estimate with a lower bound at 0.
dens <- densityMclustBounded(x, lbound = 0)
summary(dens, parameters = TRUE)
## ── Density estimation for bounded data via GMMs ───────────
##
## Boundaries: x
## lower 0
## upper Inf
##
## Model E (univariate, equal variance) model with 1 component
## on the transformation scale:
##
## log-likelihood n df BIC ICL
## -390.0517 200 3 -795.9983 -795.9983
##
## x
## Range-power transformation: 0.3715163
##
## Mixing probabilities:
## 1
## 1
##
## Means:
## 1
## 0.9191207
##
## Variances:
## 1
## 1.309037
# Plot the fitted object with what = "density".
plot(dens, what = "density")
# Overlay the curve f evaluated on xgrid, drawn with line type 2 (dashed).
lines(xgrid, f, lty = 2)
# Same density plot, now also passing the data and breaks = 15
# (presumably a histogram of x is drawn with the density -- confirm in the
# plot method's documentation).
plot(dens, what = "density", data = x, breaks = 15)
# Simulate 200 draws from a Beta(5, 1.5) distribution.
x <- rbeta(200, 5, 1.5)
# Evaluation grid of 1000 points from -0.1 to 1.1.
xgrid <- seq(-0.1, 1.1, length.out = 1000)
f <- dbeta(xgrid, 5, 1.5) # true density
# Fit the bounded-support density estimate with bounds 0 and 1.
dens <- densityMclustBounded(x, lbound = 0, ubound = 1)
summary(dens, parameters = TRUE)
## ── Density estimation for bounded data via GMMs ───────────
##
## Boundaries: x
## lower 0
## upper 1
##
## Model E (univariate, equal variance) model with 1 component
## on the transformation scale:
##
## log-likelihood n df BIC ICL
## 104.4678 200 3 193.0407 193.0407
##
## x
## Range-power transformation: -0.1600959
##
## Mixing probabilities:
## 1
## 1
##
## Means:
## 1
## 1.164398
##
## Variances:
## 1
## 0.7221851
# Plot the fitted object with what = "density".
plot(dens, what = "density")
# Same density plot, now also passing the data and breaks = 11
# (presumably a histogram of x is drawn with the density -- confirm in the
# plot method's documentation).
plot(dens, what = "density", data = x, breaks = 11)
# First margin: 200 draws from a chi-squared distribution with 3 df.
x1 <- rchisq(200, 3)
# Second margin: a weighted sum of x1 and independent chi-squared(5) draws.
x2 <- 0.5 * x1 + sqrt(1 - 0.5^2) * rchisq(200, 5)
# Bind the two margins into a 200 x 2 matrix with columns x1 and x2.
x <- cbind(x1, x2)
# Fit the bounded-support density estimate with lower bound 0 on both variables.
dens <- densityMclustBounded(x, lbound = c(0, 0))
summary(dens, parameters = TRUE)
## ── Density estimation for bounded data via GMMs ───────────
##
## Boundaries: x1 x2
## lower 0 0
## upper Inf Inf
##
## Model VEE (ellipsoidal, equal shape and orientation) model with 1 component
## on the transformation scale:
##
## log-likelihood n df BIC ICL
## -882.0264 200 7 -1801.141 -1801.141
##
## x1 x2
## Range-power transformation: 0.2835466 0.3094675
##
## Mixing probabilities:
## 1
## 1
##
## Means:
## [,1]
## [1,] 0.8882233
## [2,] 2.1712294
##
## Variances:
## [,,1]
## [,1] [,2]
## [1,] 1.1820458 0.4040468
## [2,] 0.4040468 0.9491492
# Plot the fitted object with what = "BIC".
plot(dens, what = "BIC")
# Density plot with the observations added as small points and
# reference lines drawn at h = 0 and v = 0 using line type 3.
plot(dens, what = "density")
points(x, cex = 0.3)
abline(h = 0, v = 0, lty = 3)
# Density plot with type = "hdr", again with reference lines at 0.
plot(dens, what = "density", type = "hdr")
abline(h = 0, v = 0, lty = 3)
# Density plot with type = "persp".
plot(dens, what = "density", type = "persp")
The data consist of the lengths of 86 spells of psychiatric treatment undergone by control patients in a suicide study (Silverman, 1986).
# Load the suicide dataset.
data("suicide")
# Fit the bounded-support density estimate with a lower bound at 0.
dens <- densityMclustBounded(suicide, lbound = 0)
summary(dens, parameters = TRUE)
## ── Density estimation for bounded data via GMMs ───────────
##
## Boundaries: suicide
## lower 0
## upper Inf
##
## Model E (univariate, equal variance) model with 1 component
## on the transformation scale:
##
## log-likelihood n df BIC ICL
## -497.8204 86 3 -1009.004 -1009.004
##
## suicide
## Range-power transformation: 0.1929267
##
## Mixing probabilities:
## 1
## 1
##
## Means:
## 1
## 6.700073
##
## Variances:
## 1
## 7.788326
# Density plot with a thicker blue line; the data and breaks = 15 are also
# passed, and the x-axis is labelled.
plot(dens, what = "density",
lwd = 2, col = "dodgerblue2",
data = suicide, breaks = 15,
xlab = "Length of psychiatric treatment")
# Add a rug of the observations to the plot.
rug(suicide)
This dataset provides the proportion of white student enrollment in 56 school districts in Nassau County (Long Island, New York), for the 1992-1993 school year (Simonoff 1996, Sec. 3.2).
# Load the racial dataset and extract the PropWhite column.
data("racial")
x <- racial$PropWhite
# Fit the bounded-support density estimate with bounds 0 and 1.
dens <- densityMclustBounded(x, lbound = 0, ubound = 1)
summary(dens, parameters = TRUE)
## ── Density estimation for bounded data via GMMs ───────────
##
## Boundaries: x
## lower 0
## upper 1
##
## Model E (univariate, equal variance) model with 1 component
## on the transformation scale:
##
## log-likelihood n df BIC ICL
## 42.4598 56 3 72.84355 72.84355
##
## x
## Range-power transformation: 0.3869476
##
## Mixing probabilities:
## 1
## 1
##
## Means:
## 1
## 2.795429
##
## Variances:
## 1
## 5.253254
# Density plot with a thicker blue line; the data and breaks = 15 are also
# passed, and the x-axis is labelled.
plot(dens, what = "density",
lwd = 2, col = "dodgerblue2",
data = x, breaks = 15,
xlab = "Proportion of white student enrolled in schools")
# Add a rug of the observations to the plot.
rug(x)
# Load the example datasets (ex4.1 and ex4.4.2 are used in this document).
data("Baudry_etal_2010_JCGS_examples")
# Fit a Gaussian mixture model to ex4.1 and plot its classification.
GMM <- Mclust(ex4.1)
plot(GMM, what = "classification")
# Run the modal EM algorithm on the fitted mixture and summarise the result.
MEM <- MclustMEM(GMM)
summary(MEM)
## ── Modal EM for GMMs ───────────────────
##
## Data dimensions = 600 x 2
## Mclust model = EEV,6
## MEM iterations = 17
## Number of modes = 4
##
## Modes:
## X1 X2
## mode1 8.06741504 -0.01772230
## mode2 8.07370160 4.98485099
## mode3 1.10622966 4.97230749
## mode4 -0.01639289 0.06464381
##
## Modal clustering:
## 1 2 3 4
## 118 122 228 132
# Default plot of the modal EM result.
plot(MEM)
# Same plot with addPoints = FALSE (presumably omits the data points -- confirm
# in the plot method's documentation).
plot(MEM, addPoints = FALSE)
# Fit a Gaussian mixture model to ex4.4.2 and plot its classification.
GMM <- Mclust(ex4.4.2)
plot(GMM, what = "classification")
# Run the modal EM algorithm on the fitted mixture and summarise the result.
MEM <- MclustMEM(GMM)
summary(MEM)
## ── Modal EM for GMMs ───────────────────
##
## Data dimensions = 300 x 3
## Mclust model = EVI,3
## MEM iterations = 15
## Number of modes = 2
##
## Modes:
## X1 X2 X3
## mode1 0.7964915 0.7444244 0.4547285
## mode2 0.4996361 0.5014374 0.4957522
##
## Modal clustering:
## 1 2
## 78 222
# Default plot of the modal EM result.
plot(MEM)
# Same plot with addDensity = FALSE (presumably omits the density display --
# confirm in the plot method's documentation).
plot(MEM, addDensity = FALSE)
Scrucca L. (2019) A transformation-based approach to Gaussian mixture density estimation for bounded data, Biometrical Journal, 61:4, 873–888. https://doi.org/10.1002/bimj.201800174
Scrucca L. (2021) A fast and efficient Modal EM algorithm for Gaussian mixtures. Statistical Analysis and Data Mining, 14:4, 305–314. https://doi.org/10.1002/sam.11527
# Print the R version, platform, and attached/loaded package versions.
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] mclustAddons_0.6 mclust_5.4.9 knitr_1.36
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.7 rstudioapi_0.13 magrittr_2.0.1 R6_2.5.1
## [5] rlang_0.4.12 fastmap_1.1.0 foreach_1.5.1 stringr_1.4.0
## [9] highr_0.9 tools_4.1.0 xfun_0.28 cli_3.1.0
## [13] jquerylib_0.1.4 htmltools_0.5.2 iterators_1.0.13 yaml_2.2.1
## [17] digest_0.6.28 crayon_1.4.2 sass_0.4.0 codetools_0.2-18
## [21] evaluate_0.14 rmarkdown_2.11 stringi_1.7.5 compiler_4.1.0
## [25] bslib_0.3.1 jsonlite_1.7.2