Introduction
In this vignette we explain how some functions of the package are used to estimate a contingency table. We work on the eusilc dataset of the laeken package. All the functions presented below are explained in the manuscript by Raphaël Jauslin and Yves Tillé (2021) <arXiv:2105.08379>.
Contingency table
We estimate the contingency table obtained when the factor variable representing the economic status, pl030, is crossed with a discretized version of the equivalized household income, eqIncome. To discretize the equivalized income, we compute the percentiles (0.15, 0.30, 0.45, 0.60, 0.75, 0.90) of the variable and define the categories as the intervals between these values.
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix
data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)

# Xm are the matching variables and id are the identities of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind, apply(Xm[,c(2,4,5)], MARGIN = 2, FUN = disjunctive))
Xm <- cbind(Xmcat, Xm[,-c(2,4,5)])
id <- eusilc$rb030

# categorical income split by the percentiles
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(eusilc$eqIncome > q[7])] <- "(90,100]"

# variables of interest
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put the same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z) <- id

YZ <- table(cbind(Y,Z))
addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107
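As a side note, the same discretization can be written more compactly with the base R function cut(). The following is only an illustrative sketch of the step above, not part of the estimation itself; it reuses the quantile vector q computed in the previous chunk and should reproduce c_income.

# equivalent discretization using cut() (illustrative sketch)
c_income_cut <- cut(eusilc$eqIncome,
                    breaks = c(q, max(eusilc$eqIncome)),
                    include.lowest = TRUE,
                    labels = c("(0,15]","(15,30]","(30,45]","(45,60]",
                               "(60,75]","(75,90]","(90,100]"))
# table(c_income_cut, c_income) should show full agreement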
Sampling schemes
Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The samples are selected by simple random sampling without replacement, and the weights are equal to the inverse of the inclusion probabilities.
# size of sample
n1 <- 1000
n2 <- 500

# samples
s1 <- srswor(n1, N)
s2 <- srswor(n2, N)

# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]

# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)

# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]

# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2

# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1, n1)
d2 <- rep(N/n2, n2)

# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))

Y1_dis <- Y_dis[s1 == 1,]
Z2_dis <- Z_dis[s2 == 1,]
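Before moving on, a couple of quick sanity checks can be useful. They are only illustrative and not part of the original workflow: the samples should have the requested sizes and, under simple random sampling without replacement, the design weights of each sample should sum to the population size.

# illustrative checks on the sampling designs
sum(s1)                # equals n1 = 1000
sum(s2)                # equals n2 = 500
all.equal(sum(d1), N)  # design weights of sample 1 sum to N
all.equal(sum(d2), N)  # design weights of sample 2 sum to N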
Harmonization
Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If the true population totals happen to be known, they can be used instead of the estimates computed within the function.
re <- harmonize(X1, d1, id1, X2, d2, id2)

# if we want to use the population totals to harmonize we can use
re <- harmonize(X1, d1, id1, X2, d2, id2, totals = c(N, colSums(Xm)))

w1 <- re$w1
w2 <- re$w2
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
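Since the totals argument was supplied, both sets of harmonized weights are calibrated to the known population totals, as the output above shows. A small illustrative check, not in the original vignette, is to compare the two estimated total vectors directly.

# the harmonized weights of both samples should reproduce the same totals
all.equal(colSums(w1*X1), colSums(w2*X2))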
Optimal transport matching
The statistical matching is done using the otmatch function. The contingency table is then estimated by extracting the id1 units (respectively the id2 units) and by using the function tapply with the correct weights.
# Optimal transport matching
object <- otmatch(X1, id1, X2, id2, w1, w2)
head(object[,1:3])
#> id1 id2 weight
#> 401 401 483102 14.9080488
#> 1801 1801 371901 13.0680348
#> 2802 2802 30101 0.6713613
#> 2802.1 2802 303401 3.9807038
#> 2802.2 2802 597501 8.4273587
#> 3501 3501 339003 5.4671218
Y1_ot <- cbind(X1[as.character(object$id1),], y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),], z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight, list(Y1_ot$y, Z2_ot$z), sum)

# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0

# result
round(addmargins(YZ_ot), 3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 798.837 608.164 635.804 817.795 943.154 805.084 457.629 5066.467
#> 2 243.294 136.607 147.983 177.563 215.863 175.939 99.867 1197.117
#> 3 109.862 122.810 89.308 99.774 93.710 138.813 8.098 662.375
#> 4 129.711 76.272 71.171 98.921 94.545 63.649 25.722 559.991
#> 5 654.381 318.511 362.393 488.008 488.052 402.352 407.157 3120.855
#> 6 34.198 27.365 45.834 71.793 13.557 48.386 16.290 257.423
#> 7 221.135 184.752 193.826 214.999 221.704 142.417 63.937 1242.771
#> Sum 2191.419 1474.481 1546.320 1968.853 2070.585 1776.640 1078.701 12107.000
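The row margins of YZ_ot depend only on sample 1: the transport weights attached to each id1 unit should sum to its harmonized weight, so the row totals should coincide with the estimated totals of ecostat computed from sample 1 alone. A small illustrative check, assuming the levels come out in the same order:

# row margins of the optimal transport estimate vs. sample 1 estimates
all.equal(as.numeric(rowSums(YZ_ot)),
          as.numeric(tapply(w1, Y1$ecostat, sum)))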
Balanced sampling
As you can see from the previous section, the optimal transport generally does not produce a one-to-one matching: for a unit in sample 1, more than one unit in sample 2 can have a nonzero weight. The bsmatch function creates a one-to-one matching by selecting a stratified balanced sample, so that each unit in sample 1 is matched to a single imputed unit from sample 2.
# Balanced Sampling
BS <- bsmatch(object)
head(BS$object[,1:3])
#> id1 id2 weight
#> 401 401 483102 14.908049
#> 1801 1801 371901 13.068035
#> 2802.1 2802 303401 3.980704
#> 3501.1 3501 570904 7.023456
#> 4001 4001 425101 6.044974
#> 5201 5201 215302 12.230852
Y1_bs <- cbind(X1[as.character(BS$object$id1),], y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),], z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q, list(Y1_bs$y, Z2_bs$z), sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs), 3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 852.388 579.251 620.583 798.392 936.224 826.174 453.455 5066.467
#> 2 239.351 136.121 164.141 190.650 197.252 148.271 121.331 1197.117
#> 3 113.757 114.573 79.136 94.826 96.779 148.434 14.869 662.375
#> 4 133.345 74.763 62.515 93.961 101.461 69.539 24.406 559.991
#> 5 677.249 329.620 381.421 488.708 462.650 375.320 405.888 3120.855
#> 6 35.026 27.932 39.346 71.604 13.557 46.307 23.650 257.423
#> 7 234.259 184.017 199.352 216.231 208.327 140.950 59.634 1242.771
#> Sum 2285.374 1446.278 1546.496 1954.372 2016.250 1754.997 1103.234 12107.000
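Because bsmatch keeps exactly one donor per unit of sample 1, each id1 should now appear only once in the matched data set. An illustrative check, not part of the original vignette:

# every unit of sample 1 should be matched to a single unit of sample 2
all(table(BS$object$id1) == 1)
nrow(BS$object) == length(unique(object$id1))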
# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object, Z2)

Y1_bs <- cbind(X1[as.character(BS$object$id1),], y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),], z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q, list(Y1_bs$y, Z2_bs$z), sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs), 3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 736.518 591.620 678.617 845.382 946.135 815.990 452.206 5066.467
#> 2 272.931 138.393 113.688 194.099 245.541 149.761 82.704 1197.117
#> 3 102.668 114.573 79.136 93.842 96.779 160.508 14.869 662.375
#> 4 133.345 100.536 88.432 104.760 65.250 32.060 35.608 559.991
#> 5 664.601 331.514 326.147 481.066 476.456 429.956 411.115 3120.855
#> 6 35.564 38.092 39.346 61.444 13.557 57.740 11.679 257.423
#> 7 242.581 163.849 223.569 192.967 219.221 128.421 72.163 1242.771
#> Sum 2188.207 1478.578 1548.936 1973.560 2062.939 1774.437 1080.343 12107.000
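Since the whole eusilc population is available here, we can compare the estimated table with the true contingency table computed at the beginning of the vignette. The quantity below is only an illustrative, rough measure of discrepancy; it assumes both tables share the ordering of rows and columns shown above.

# relative total absolute deviation between the estimated and true tables
sum(abs(as.matrix(YZ_bs) - as.matrix(YZ))) / sum(YZ)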
Prediction
The optimal transport weights can also be used to predict, for each unit of sample 1, the probability of belonging to each category of the variable Z: the weights are normalized within each id1 unit and aggregated by category.

# split the weights by id1
q_l <- split(object$weight, f = object$id1)
# normalize within each id1
q_l <- lapply(q_l, function(x){ x/sum(x) })
q <- as.numeric(do.call(c, q_l))

Z_pred <- t(q*disjunctive(object$id1)) %*% disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 0.0000000 1.0000000 0.00000000 0.0000000 0 0.0000000 0
#> [2,] 0.0000000 0.0000000 1.00000000 0.0000000 0 0.0000000 0
#> [3,] 0.3043486 0.0000000 0.05132958 0.0000000 0 0.6443219 0
#> [4,] 0.0000000 0.5623003 0.00000000 0.4376997 0 0.0000000 0
#> [5,] 0.0000000 0.0000000 0.45928093 0.0000000 0 0.5407191 0
#> [6,] 0.0000000 0.0000000 0.00000000 0.0000000 0 1.0000000 0
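Each row of Z_pred contains, for one unit of sample 1, the estimated probabilities of the categories of Z. As a final illustrative step, not from the original vignette, one could attach the corresponding id1 identifiers and predict the most probable category; this assumes that disjunctive() orders its columns according to the factor levels of id1, i.e. the sorted identifiers.

# attach identifiers and take the most probable category for each unit
rownames(Z_pred) <- levels(factor(object$id1))
z_hat <- colnames(Z_pred)[max.col(Z_pred, ties.method = "first")]
head(data.frame(id1 = rownames(Z_pred), z_hat = z_hat))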