An illustration of correlation attenuation when discretizing a continuous variable to an ordered categorical variable.
\[\rho(x,y) = \frac{COV(X, Y)}{\sigma_{X}\sigma_{Y}},\]
where \(\sigma\) is the standard deviation.
# Set correlation between X and Y* to 0.5
rho <- 0.5
# Assume X and Y*~N(0,1) for now
sd_x <- 1
sd_y <- 1
cov_xy <- rho * sd_x * sd_y
# Simulate correlated X and Y*
df <- as.data.frame(
mvrnorm(
n = 1e4,
mu = c(0, 0),
Sigma = matrix(
c(
sd_x^2, cov_xy,
cov_xy, sd_y^2
),
ncol = 2
)
)
)
names(df) <- c("X", "Y*")
# Manually dichotomize Y* to 0 and 1
df <- df %>%
mutate(Y = ifelse(`Y*` > qnorm(0.7, mean(`Y*`), sd(`Y*`)), 1, 0))
# Show proportion of Y
knitr::kable(table(df$Y) / nrow(df),
col.names = c("Label", "Proportion"),
align = "c"
)
Label | Proportion |
---|---|
0 | 0.6988 |
1 | 0.3012 |
# Show correlations between X and Y*, and X and Y
knitr::kable(
cbind(cor(df$X, df$`Y*`), cor(df$X, df$Y)),
col.names = c("$\\rho_{(X, Y*)}$", "$\\rho_{(X, Y)}$"),
align = "c"
)
\(\rho_{(X, Y*)}\) | \(\rho_{(X, Y)}\) |
---|---|
0.5110921 | 0.3819812 |
\[\text{Attenuation Factor} = \frac{COV(X, Y)}{COV(X, Y^*)}*\sqrt{\frac{\sigma^2_{Y^*}}{\sigma^2_{Y}}},\]
# Analytic calculation
thres <- qnorm(0.3)
var_ystar <- 1
var_y <- 0.7 * (1 - 0.7)
attenuation_bi <- dnorm(thres) * sqrt(var_ystar / var_y)
cor_xy_bi <- attenuation_bi * rho
# Simulated results
lat_cor <- cor(df$X, df$`Y*`)
obs_cor <- cor(df$X, df$Y)
att_fac <- (cov(df$X, df$Y) / cov(df$X, df$`Y*`)) * sqrt(var(df$`Y*`) / var(df$Y))
cal_cor <- att_fac * lat_cor
summary_1 <- round(c(rho, attenuation_bi, cor_xy_bi, att_fac, cal_cor), 3)
names(summary_1) <- c(
"Correlation_XY*", "Attenuation_Formula",
"Correlation_Formula", "Attenuation_Data",
"Correlation_Data"
)
knitr::kable(
summary_1,
align = "c",
col.names = " "
)
Correlation_XY* | 0.500 |
Attenuation_Formula | 0.759 |
Correlation_Formula | 0.379 |
Attenuation_Data | 0.747 |
Correlation_Data | 0.382 |
\[\text{Attenuation Factor} = \frac{E(XY) - E(X)E(Y)}{E(XY^*) - E(X)E(Y^*)}*\sqrt{\frac{\sigma^2_{Y^*}}{\sigma^2_{Y}}},\]
\[ \begin{aligned} \rho(X, Y^*) &= \frac{E(XY^*)}{\sqrt{\sigma^2_{X} \sigma^2_{Y^*}}} \\ &= E\left[\left(\frac{X - \mu_{x}}{\sigma_{X}}\right)\left(\frac{Y^* - \mu_{Y^*}}{\sigma_{Y^*}}\right)\right] \\ &= E(XY^*), \\ \end{aligned} \]
given that the correlation of \(X\) and \(Y^*\) is their standardized covariance. Then \(E(XY)\) can be calculated using thresholds of the categorical variable.
\[f_{x,y^{*}}(x,y^{*}) = \frac{1}{2\pi\sqrt{1 - \rho^2}}*e^{-\frac{1}{2(1 - \rho^2)}*[x^2 + {y^*}^2 - 2\rho x y^*]},\]
and the expected value of \(XY^*\) is:
\[E(XY^*) = \int_{-\infty}^{\infty}\int_{-\infty}^{\infty}xy^{*}f_{x,y^{*}}(x,y^{*}) d(x) d(y^*),\]
\[E(XY) = E(X | Y^* > \tau),\]
while \(E(XY) = E(X | Y^* < \tau) = E(X*0) = 0\). Further, the formula can be expanded as:
\[ \begin{aligned} E(X | Y^* > \tau) &= \int_{-\infty}^{\infty}\int_{\tau}^{\infty}xy^{*}f_{x,y^{*}}(x,y^{*}) d(x) d(y^*) \\ &= \int_{-\infty}^{\infty}\int_{\tau}^{\infty}x,y^{*}\frac{1}{2\pi\sqrt{1 - \rho^2}}*e^{-\frac{1}{2(1 - \rho^2)}*[x^2 + y^{*^2} - 2\rho x y^*]} d(x) d(y^*), \\ \end{aligned} \]
\[ \begin{aligned} E(X | Y^* = a) &= \int_{-\infty}^{\infty} xf_{X|Y^*}(x|Y^* = y^*)d(x) \\ &= \rho*a, \\ \end{aligned} \]
given that \(f_{X|Y^*}(x|Y^* = y^*) = \frac{f_{X,Y^*}(x, y^*)}{f_{Y^*}(y^*)}\), and \(\rho\) is the correlation between \(X\) and \(Y^*\).
Now we think of \(E(X | Y^* > a)\):
\[ \begin{aligned} E(X | Y^* > a) &= E(X - rY^* + rY^* | Y^* > a) \\ COV(X - rY^*, Y^*) &= E[(X - rY^*)Y^*] - E(X - rY^*)E(Y^*) \\ COV(X - rY^*, Y^*) &= E(XY^*) - rE(Y^*Y^*) - E(X)E(Y^*) + rE(Y^*)E(Y^*), \\ \end{aligned} \]
since we assume that \(X\) and \(Y^*\) both follow \(N ~ (0,1)\), therefore \(COV(X, Y^*) = E(XY^*) - E(X)E(Y^*) = r\) and \(VAR(Y^*) = E(Y^*)E(Y^*) - E(Y^*Y^*) = 1\). Then,
\[ \begin{aligned} COV(X - rY^*, Y^*) &= COV(X, Y^*) - r*1 \\ &= r - r \\ &= 0, \\ \end{aligned} \] \[ \begin{aligned} E(X | Y^* > a) &= E(X - rY^* + rY^* | Y^* > a) \\ &= E(X - rY^*|Y^* > a) + r*E(Y^*|Y^* > a), \\ \end{aligned} \]
We have shown that the correlation between \(X - rY^*\) and \(Y^*\) is 0, and thus \(E(X - rY^*)\) is independent of \(Y^*\). Furthermore,
\[ \begin{aligned} E(X | Y^* > a) &= E(X - rY^*) + r*E(Y^*|Y^* > a) \\ &= E(X) - r*E(Y^*) + r*E(Y^*|Y^* > a) \\ &= r*E(Y^*|Y^* > a), \\ \end{aligned} \]
According to the definition of conditional expectation,
\[ \begin{aligned} E(Y^*|Y^* > a) &= \int_{a}^{\infty} y^*\phi_{0}(y^*)d(y^*) \\ &= \frac{e^{-\frac{a^2}{2}}}{\sqrt{2\pi}}, \\ \end{aligned} \]
note that \(\phi_{0}\) is the p.d.f. of standard normal distribution.
dnorm()
function in R
calculates the p.d.f. of the normal distribution. For standard normal distribution, the R function dnorm(a)
would return \(\frac{e^{-\frac{a^2}{2}}}{\sqrt{2\pi}}\), which is the value of \(E(Y^*|Y^* > a)\). Thus, we are able to show that:\[E(X | Y^* > a) = \rho*\phi_0(a)\]
# Assuming X and Y* ~ N(0,1)
# for standard bivariate normal distribution, E(XY*) = rho
rho <- 0.5
var_ystar <- 1
thres_1 <- qnorm(0.5)
thres_2 <- qnorm(0.5 + 0.3)
thres_3 <- qnorm(0.5 + 0.3 + 0.1)
p_less_than_thres1 <- pnorm(thres_1)
p_thres1_thres2 <- pnorm(thres_2) - pnorm(thres_1)
p_thres2_thres3 <- pnorm(thres_3) - pnorm(thres_2)
p_larger_than_thres3 <- pnorm(thres_3, lower.tail = F)
e_y2 <- 0 * p_less_than_thres1 +
1^2 * p_thres1_thres2 +
2^2 * p_thres2_thres3 +
3^2 * p_larger_than_thres3
e_y <- 1 * p_thres1_thres2 + 2 * p_thres2_thres3 + 3 * p_larger_than_thres3
var_y <- e_y2 - e_y^2
attenuation_cat <- 1 * (dnorm(thres_1) - dnorm(thres_2)) +
2 * (dnorm(thres_2) - dnorm(thres_3)) +
3 * dnorm(thres_3) * sqrt(var_ystar / var_y)
# attenuation_cat <- (0*pbnorm(-Inf, Inf, -Inf, thres_1, 0, 0, 1, 1, 0.5) +
# 1*pbnorm(-Inf, Inf, thres_1, thres_2, 0, 0, 1, 1, 0.5) +
# 2*pbnorm(-Inf, Inf, thres_2, thres_3, 0, 0, 1, 1, 0.5) +
# 3*pbnorm(-Inf, Inf, thres_3, Inf, 0, 0, 1, 1, 0.5))/rho * sqrt(var_ystar/var_y)
cor_xy_cat <- attenuation_cat * rho
rho <- 0.5
sd_x <- 1
sd_y <- 1
cov_xy <- rho * sd_x * sd_y
df3 <- as.data.frame(mvrnorm(
n = 1e4,
mu = c(0, 0),
Sigma = matrix(
c(
sd_x^2, cov_xy,
cov_xy, sd_y^2
),
ncol = 2
)
))
names(df3) <- c("y1", "y2")
# HL: An easier way to do the categorization:
# findInterval(
# df3$y2,
# rightmost.closed = TRUE,
# quantile(df3$y2, c(0, 0.5, 0.5 + 0.3, 0.5 + 0.3 + 0.1, 1))
# ) - 1 # if starting from 0
# The above is based on the sample quantiles without
# assuming normality. If you want to assume normality, try
# findInterval(
# df3$y2,
# rightmost.closed = TRUE,
# qnorm(c(0, 0.5, 0.5 + 0.3, 0.5 + 0.3 + 0.1, 1),
# mean = mean(df3$y2), sd = sd(df3$y2))
# ) - 1
# Could you update the following accordingly? Thanks.
df3$y2_mul <- findInterval(
df3$y2,
rightmost.closed = TRUE,
qnorm(c(0, 0.5, 0.5 + 0.3, 0.5 + 0.3 + 0.1, 1),
mean = mean(df3$y2), sd = sd(df3$y2))
) - 1
lat_cor <- cor(df3$y1, df3$y2)
obs_cor <- cor(df3$y1, df3$y2_mul)
att_fac <- (cov(df3$y1, df3$y2_mul) / cov(df3$y1, df3$y2)) * sqrt(var(df3$y2) / var(df3$y2_mul))
cal_cor <- att_fac * lat_cor
summary_2 <- round(c(rho, attenuation_cat, cor_xy_cat, att_fac, cal_cor), 3)
names(summary_2) <- c(
"Correlation_XY*", "Attenuation_Formula",
"Correlation_Formula", "Attenuation_Data",
"Correlation_Data"
)
knitr::kable(summary_2,
align = "c",
col.names = " "
)
Correlation_XY* | 0.500 |
Attenuation_Formula | 0.865 |
Correlation_Formula | 0.433 |
Attenuation_Data | 0.876 |
Correlation_Data | 0.430 |
\[ \begin{aligned} E(X,Y) = \begin{cases} 0, & \text{if } Y^{*} \le \tau_{1} \\ E(X | Y = 1), & \text{if } \tau_{1} \le Y^{*} \le \tau_{2} \\ E(X | Y = 2), & \text{if } \tau_{2} \le Y^{*} \le \tau_{3} \\ E(X | Y = 3), & \text{if } Y^{*} > \tau_{3} \end{cases} \end{aligned} \]
Take one category as one example, for \(E(X | Y = 1)\) with \(\tau_{1} \le Y^{*} \le \tau_{2}\) and \(c = 1\): \(E(X | Y = 1) = \int_{-\infty}^{\infty} \int_{\tau_{1}}^{\tau_{2}} \text{x} y^{*} f_{(x, y^{\ast})} d_{x} d_{y^{*}}\)
\[f_{x,y^{*}}(x,y^{*}) = \frac{1}{2\pi\sigma_{x}\sigma_{y^{*}}\sqrt{1 - \rho^2}}*e^{-\frac{1}{2(1 - \rho^2)}*[z_{x}^2 + z_{y^{*}}^2 - 2\rho z_{x} z_{y^{*}}]},\]
and transform the formula to:
\[E(X | Y = 1) = \int_{-\infty}^{\infty}\int_{\tau_{1}}^{\tau_{2}}xy^{*}\frac{d_{x}d_{y^{*}}}{2\pi\sigma_{x}\sigma_{y^{*}}\sqrt{1 - \rho^2}}*e^{-\frac{1}{2(1 - \rho^2)}*[z_{x}^2 + z_{y^{*}}^2 - 2\rho z_{x} z_{y^{*}}]} d(x) d(y^*),\]
Because of the property of derivation,
\[ \begin{aligned} \frac{d(x)}{\sigma_{x}} &= d(\frac{x - \mu_{x}}{\sigma_{x}}) \\ &= d(z_{x}), \end{aligned} \]
and it is the same for \(d(z_{y^{*}})\).
\[E(X | Y = 1) = \int_{-\infty}^{\infty}\int_{\frac{\tau_{1} - \mu_{y^{*}}}{\sigma_{y^{*}}}}^{\frac{\tau_{2} - \mu_{y^{*}}}{\sigma_{y^{*}}}}xy^{*}\frac{1}{2\pi\sqrt{1 - \rho^2}}*e^{-\frac{1}{2(1 - \rho^2)}*[z_{x}^2 + z_{y^{*}}^2 - 2\rho z_{x} z_{y^{*}}]}d(z_{x})d(z_{y^{*}})\]
rho <- 0.5
sd_x <- rnorm(1, 1, 0.5)
sd_y <- rnorm(1, 1.5, 0.3)
cov_xy <- rho * sd_x * sd_y
df2 <- as.data.frame(mvrnorm(
n = 1e7,
mu = c(rnorm(1, 10, 2.1), rnorm(1, 8, 1.1)),
Sigma = matrix(
c(
sd_x^2, cov_xy,
cov_xy, sd_y^2
),
ncol = 2
)
))
names(df2) <- c("y1", "y2")
df2 <- df2 %>%
mutate(y2_cat = ifelse(y2 > qnorm(0.7, mean(df2$y2), sd(df2$y2)), 1, 0))
lat_cor <- cor(df2$y1, df2$y2)
obs_cor <- cor(df2$y1, df2$y2_cat)
att_fac <- (cov(df2$y1, df2$y2_cat) / cov(df2$y1, df2$y2)) * sqrt(var(df2$y2) / var(df2$y2_cat))
cal_cor <- att_fac * lat_cor
summary_3 <- round(c(rho, attenuation_bi, cor_xy_bi, att_fac, cal_cor), 3)
names(summary_3) <- c(
"Correlation_XY*", "Attenuation_Formula",
"Correlation_Formula", "Attenuation_Data",
"Correlation_Data"
)
knitr::kable(summary_3,
align = "c",
col.names = " "
)
Correlation_XY* | 0.500 |
Attenuation_Formula | 0.759 |
Correlation_Formula | 0.379 |
Attenuation_Data | 0.759 |
Correlation_Data | 0.379 |
rho <- 0.5
sd_x <- rnorm(1, 1, 0.5)
sd_y <- rnorm(1, 1.5, 0.3)
cov_xy <- rho * sd_x * sd_y
df3 <- as.data.frame(mvrnorm(
n = 1e7,
mu = c(rnorm(1, 10, 2.1), rnorm(1, 8, 1.1)),
Sigma = matrix(
c(
sd_x^2, cov_xy,
cov_xy, sd_y^2
),
ncol = 2
)
))
names(df3) <- c("y1", "y2")
df3 <- df3 %>%
mutate(y2_mul = ifelse(y2 < qnorm(0.5, mean(df3$y2), sd(df3$y2)), 0,
ifelse(qnorm(0.5, mean(df3$y2), sd(df3$y2)) < y2 & y2 < qnorm(0.5 + 0.3, mean(df3$y2), sd(df3$y2)), 1,
ifelse(qnorm(0.5 + 0.3, mean(df3$y2), sd(df3$y2)) < y2 & y2 < qnorm(0.5 + 0.3 + 0.1, mean(df3$y2), sd(df3$y2)), 2,
ifelse(y2 > qnorm(0.5 + 0.3 + 0.1, mean(df3$y2), sd(df3$y2)), 3, NA)
)
)
))
lat_cor <- cor(df3$y1, df3$y2)
obs_cor <- cor(df3$y1, df3$y2_mul)
att_fac <- (cov(df3$y1, df3$y2_mul) / cov(df3$y1, df3$y2)) * sqrt(var(df3$y2) / var(df3$y2_mul))
cal_cor <- att_fac * lat_cor
summary_4 <- round(c(rho, attenuation_cat, cor_xy_cat, att_fac, cal_cor), 3)
names(summary_4) <- c(
"Correlation_XY*", "Attenuation_Formula",
"Correlation_Formula", "Attenuation_Data",
"Correlation_Data"
)
knitr::kable(summary_4,
align = "c",
col.names = " "
)
Correlation_XY* | 0.500 |
Attenuation_Formula | 0.865 |
Correlation_Formula | 0.433 |
Attenuation_Data | 0.872 |
Correlation_Data | 0.436 |
For attribution, please cite this work as
Zhang (2023, April 19). Measurement & Multilevel Modeling Lab: Correlation Attenuation for Categorical Variables. Retrieved from https://mmmlab.rbind.io/posts/2023-04-19-correlation-attenuation-for-categorical-variables/
BibTeX citation
@misc{zhang2023correlation, author = {Zhang, Gengrui (Jimmy)}, title = {Measurement & Multilevel Modeling Lab: Correlation Attenuation for Categorical Variables}, url = {https://mmmlab.rbind.io/posts/2023-04-19-correlation-attenuation-for-categorical-variables/}, year = {2023} }