Measurement & Multilevel Modeling Lab: Correlation Attenuation for Categorical Variables

Gengrui (Jimmy) Zhang

# Target correlation between X and the latent response Y*
rho <- 0.5

# For now both X and Y* are standard normal, so the covariance equals rho
sd_x <- 1
sd_y <- 1
cov_xy <- rho * sd_x * sd_y

# Draw 10,000 (X, Y*) pairs from the bivariate normal implied above
df <- mvrnorm(
  n = 1e4,
  mu = c(0, 0),
  Sigma = matrix(c(sd_x^2, cov_xy, cov_xy, sd_y^2), ncol = 2)
) %>%
  as.data.frame()
names(df) <- c("X", "Y*")

# Observed Y is 1 when Y* lies above the 70th percentile of a normal
# distribution fitted to the sample mean and SD of Y*, and 0 otherwise
df <- df %>%
  mutate(Y = as.numeric(`Y*` > qnorm(0.7, mean(`Y*`), sd(`Y*`))))

# Proportion of observations in each category of Y
knitr::kable(
  prop.table(table(df$Y)),
  col.names = c("Label", "Proportion"),
  align = "c"
)


# Latent correlation (X, Y*) next to the attenuated correlation (X, Y)
knitr::kable(
  matrix(c(cor(df$X, df$`Y*`), cor(df$X, df$Y)), nrow = 1),
  col.names = c("$\\rho_{(X, Y*)}$", "$\\rho_{(X, Y)}$"),
  align = "c"
)

In this example, we can see the correlation is attenuated when one of the continuous variables is dichotomized. According to the correlation formula and the formula for the expectation of a covariance, we can derive the attenuation factor due to categorization. Note that the value at which $Y^*$ is dichotomized to obtain the desired proportion is called the “threshold.”

# Analytic calculation
# Y = 1 when Y* exceeds its 70th percentile, so the threshold on the
# standard normal scale is qnorm(0.7). (qnorm(0.3) gives the same density
# value only because the normal p.d.f. is symmetric around 0.)
thres <- qnorm(0.7)
var_ystar <- 1
# Variance of a Bernoulli variable with P(Y = 0) = 0.7 and P(Y = 1) = 0.3
var_y <- 0.7 * (1 - 0.7)
# Attenuation factor: density at the threshold, rescaled by SD(Y*) / SD(Y)
attenuation_bi <- dnorm(thres) * sqrt(var_ystar / var_y)
cor_xy_bi <- attenuation_bi * rho

# Simulated results
lat_cor <- cor(df$X, df$`Y*`)
obs_cor <- cor(df$X, df$Y)

# Empirical attenuation factor: ratio of covariances, rescaled by the ratio
# of standard deviations; multiplying by the latent correlation gives the
# correlation between X and Y
att_fac <- (cov(df$X, df$Y) / cov(df$X, df$`Y*`)) *
  sqrt(var(df$`Y*`) / var(df$Y))
cal_cor <- att_fac * lat_cor

# Formula-based and data-based quantities side by side, rounded to 3 decimals
summary_1 <- round(c(rho, attenuation_bi, cor_xy_bi, att_fac, cal_cor), 3)
names(summary_1) <- c(
  "Correlation_XY*", "Attenuation_Formula",
  "Correlation_Formula", "Attenuation_Data",
  "Correlation_Data"
)
knitr::kable(
  summary_1,
  align = "c",
  col.names = " "
)

We can see that the covariances between $X$ and $Y$ and between $X$ and $Y^*$ are needed to compute the attenuation factor, which requires raw data. Sometimes, however, researchers may have difficulties obtaining raw data or they only have the correlation between $X$ and $Y$ reported in published articles.

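When $X$ and $Y^*$ are standardized, $\mathrm{Cov}(X, Y^*) = \rho_{(X, Y^*)}$, given that the correlation of $X$ and $Y^*$ is their standardized covariance. Then $\mathrm{Cov}(X, Y)$ can be calculated using thresholds of the categorical variable.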

We can see that the cumulative probability is determined by the density value, or marginal distribution, of $Y^*$. Specifically, it is determined by the threshold $\tau$. Note that the marginal distribution of one variable in a bivariate normal distribution is a normal distribution. Now we think of $E(X \mid Y^*)$. It can be thought of as the mean of the distribution of $X$ at $Y^* = y^*$, and $X - \rho Y^*$ does not depend on $Y^*$.

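We have shown that the correlation between $X - \rho Y^*$ and $Y^*$ is 0, and thus $X - \rho Y^*$ is independent of $Y^*$ (the two are jointly normal). Furthermore, $E(X \mid Y^*) = \rho Y^*$, so for a dichotomous $Y$ with threshold $\tau$,

$$E(XY) = \rho \int_{\tau}^{\infty} y^* \phi(y^*) \, dy^* = \rho \, \phi(\tau).$$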

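The dnorm() function in R calculates the p.d.f. of the normal distribution. For the standard normal distribution, the R function dnorm(a) would return $\frac{1}{\sqrt{2\pi}} e^{-a^2 / 2}$, which is the value of $\phi(a)$. Thus, we are able to show that:

$$\rho_{(X, Y)} = \rho_{(X, Y^*)} \, \phi(\tau) \sqrt{\frac{\mathrm{Var}(Y^*)}{\mathrm{Var}(Y)}}$$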

An Example of Attenuated Correlation for Categorical Variable with Three Thresholds

# Analytic attenuation for a four-category Y (scored 0, 1, 2, 3) obtained by
# cutting Y* at three thresholds.
# Assuming X and Y* ~ N(0,1)
# for standard bivariate normal distribution, E(XY*) = rho
rho <- 0.5
var_ystar <- 1

# Thresholds placed at cumulative proportions 0.5, 0.8 and 0.9
thres_1 <- qnorm(0.5)
thres_2 <- qnorm(0.5 + 0.3)
thres_3 <- qnorm(0.5 + 0.3 + 0.1)

# Probability mass of each category
p_less_than_thres1 <- pnorm(thres_1)
p_thres1_thres2 <- pnorm(thres_2) - pnorm(thres_1)
p_thres2_thres3 <- pnorm(thres_3) - pnorm(thres_2)
p_larger_than_thres3 <- pnorm(thres_3, lower.tail = FALSE)

# Var(Y) = E(Y^2) - E(Y)^2 for the category scores 0, 1, 2, 3
e_y2 <- 0 * p_less_than_thres1 +
  1^2 * p_thres1_thres2 +
  2^2 * p_thres2_thres3 +
  3^2 * p_larger_than_thres3
e_y <- 1 * p_thres1_thres2 + 2 * p_thres2_thres3 + 3 * p_larger_than_thres3
var_y <- e_y2 - e_y^2

# Attenuation factor: the score-weighted sum of density differences across
# categories, rescaled by SD(Y*) / SD(Y). The parentheses matter: the
# rescaling must apply to the whole sum, not only to its last term.
attenuation_cat <- (
  1 * (dnorm(thres_1) - dnorm(thres_2)) +
    2 * (dnorm(thres_2) - dnorm(thres_3)) +
    3 * dnorm(thres_3)
) * sqrt(var_ystar / var_y)
# Alternative kept for reference (not run):
# attenuation_cat <- (0*pbnorm(-Inf, Inf, -Inf, thres_1, 0, 0, 1, 1, 0.5) +
#   1*pbnorm(-Inf, Inf, thres_1, thres_2, 0, 0, 1, 1, 0.5) +
#   2*pbnorm(-Inf, Inf, thres_2, thres_3, 0, 0, 1, 1, 0.5) +
#   3*pbnorm(-Inf, Inf, thres_3, Inf, 0, 0, 1, 1, 0.5))/rho * sqrt(var_ystar/var_y)
cor_xy_cat <- attenuation_cat * rho

Verification with simulated data

# Simulation check of the three-threshold formula: standard bivariate
# normal data with correlation rho
rho <- 0.5
sd_x <- 1
sd_y <- 1
cov_xy <- rho * sd_x * sd_y

df3 <- mvrnorm(
  n = 1e4,
  mu = c(0, 0),
  Sigma = matrix(c(sd_x^2, cov_xy, cov_xy, sd_y^2), ncol = 2)
) %>%
  as.data.frame()


names(df3) <- c("y1", "y2")

# Cut y2 into categories 0-3 at the normal-theory quantiles for cumulative
# proportions 0.5, 0.8 and 0.9, using the sample mean and SD of y2.
# findInterval() numbers the intervals from 1, hence the "- 1".
# To avoid assuming normality, the break points could instead be the sample
# quantiles: quantile(df3$y2, c(0, 0.5, 0.5 + 0.3, 0.5 + 0.3 + 0.1, 1)).
df3$y2_mul <- findInterval(
  df3$y2,
  qnorm(
    c(0, 0.5, 0.5 + 0.3, 0.5 + 0.3 + 0.1, 1),
    mean = mean(df3$y2),
    sd = sd(df3$y2)
  ),
  rightmost.closed = TRUE
) - 1

# Latent and observed correlations in the simulated data
lat_cor <- cor(df3$y1, df3$y2)
obs_cor <- cor(df3$y1, df3$y2_mul)

# Data-based attenuation factor and the correlation it implies
att_fac <- cov(df3$y1, df3$y2_mul) / cov(df3$y1, df3$y2) *
  sqrt(var(df3$y2) / var(df3$y2_mul))
cal_cor <- att_fac * lat_cor

# Formula-based and data-based quantities side by side
summary_2 <- setNames(
  round(c(rho, attenuation_cat, cor_xy_cat, att_fac, cal_cor), 3),
  c(
    "Correlation_XY*", "Attenuation_Formula",
    "Correlation_Formula", "Attenuation_Data",
    "Correlation_Data"
  )
)
knitr::kable(
  summary_2,
  align = "c",
  col.names = " "
)

Reasoning of Generalization to X and Y* with Any Means and Variances

The “new” values of limits, e.g., $\tau^{\prime} = \mu_{Y^*} + \sigma_{Y^*} \tau$, are linearly transformed using the mean and variance of $Y^*$. It means that no matter how threshold values change due to the mean and variance of $Y^*$, we can always z-transform them back so that X and $Y^*$ always follow a standard bivariate normal distribution. In other words, as long as we know the threshold values and proportion of categories of the categorized variable, and X and $Y^*$ follow normal distributions, we should be able to compute the attenuated $\rho_{(X, Y)}$ no matter the mean and variance of $X$ and $Y^*$.

Verification with simulated data (random mean and variance)

# Simulation check of the dichotomous formula when X and Y* have randomly
# drawn means and standard deviations
rho <- 0.5
# A normal draw can be negative; a negative "SD" would flip the sign of
# cov_xy (while the variances stay positive) and make the simulated
# correlation -rho, so take the absolute value.
sd_x <- abs(rnorm(1, 1, 0.5))
sd_y <- abs(rnorm(1, 1.5, 0.3))
cov_xy <- rho * sd_x * sd_y

df2 <- as.data.frame(mvrnorm(
  n = 1e7,
  mu = c(rnorm(1, 10, 2.1), rnorm(1, 8, 1.1)),
  Sigma = matrix(
    c(
      sd_x^2, cov_xy,
      cov_xy, sd_y^2
    ),
    ncol = 2
  )
))

names(df2) <- c("y1", "y2")
# y2_cat is 1 above the 70th percentile of a normal distribution fitted to
# the sample mean and SD of y2, and 0 otherwise
df2 <- df2 %>%
  mutate(y2_cat = ifelse(y2 > qnorm(0.7, mean(y2), sd(y2)), 1, 0))

lat_cor <- cor(df2$y1, df2$y2)
obs_cor <- cor(df2$y1, df2$y2_cat)

# Data-based attenuation factor and the correlation it implies
att_fac <- (cov(df2$y1, df2$y2_cat) / cov(df2$y1, df2$y2)) *
  sqrt(var(df2$y2) / var(df2$y2_cat))
cal_cor <- att_fac * lat_cor

# attenuation_bi and cor_xy_bi come from the analytic dichotomous calculation
summary_3 <- round(c(rho, attenuation_bi, cor_xy_bi, att_fac, cal_cor), 3)
names(summary_3) <- c(
  "Correlation_XY*", "Attenuation_Formula",
  "Correlation_Formula", "Attenuation_Data",
  "Correlation_Data"
)
knitr::kable(summary_3,
  align = "c",
  col.names = " "
)

# Simulation check of the three-threshold formula when X and Y* have
# randomly drawn means and standard deviations
rho <- 0.5
# A normal draw can be negative; a negative "SD" would flip the sign of
# cov_xy (while the variances stay positive) and make the simulated
# correlation -rho, so take the absolute value.
sd_x <- abs(rnorm(1, 1, 0.5))
sd_y <- abs(rnorm(1, 1.5, 0.3))
cov_xy <- rho * sd_x * sd_y

df3 <- as.data.frame(mvrnorm(
  n = 1e7,
  mu = c(rnorm(1, 10, 2.1), rnorm(1, 8, 1.1)),
  Sigma = matrix(
    c(
      sd_x^2, cov_xy,
      cov_xy, sd_y^2
    ),
    ncol = 2
  )
))


names(df3) <- c("y1", "y2")

# Cut y2 into categories 0-3 at the normal-theory quantiles for cumulative
# proportions 0.5, 0.8 and 0.9, using the sample mean and SD of y2.
# findInterval() computes the break points once, numbers the intervals from
# 1 (hence the "- 1"), and assigns a value lying exactly on a break point to
# the upper category rather than leaving it NA.
df3$y2_mul <- findInterval(
  df3$y2,
  qnorm(
    c(0, 0.5, 0.5 + 0.3, 0.5 + 0.3 + 0.1, 1),
    mean = mean(df3$y2),
    sd = sd(df3$y2)
  ),
  rightmost.closed = TRUE
) - 1

lat_cor <- cor(df3$y1, df3$y2)
obs_cor <- cor(df3$y1, df3$y2_mul)

# Data-based attenuation factor and the correlation it implies
att_fac <- (cov(df3$y1, df3$y2_mul) / cov(df3$y1, df3$y2)) *
  sqrt(var(df3$y2) / var(df3$y2_mul))
cal_cor <- att_fac * lat_cor

# attenuation_cat and cor_xy_cat come from the analytic three-threshold
# calculation
summary_4 <- round(c(rho, attenuation_cat, cor_xy_cat, att_fac, cal_cor), 3)
names(summary_4) <- c(
  "Correlation_XY*", "Attenuation_Formula",
  "Correlation_Formula", "Attenuation_Data",
  "Correlation_Data"
)
knitr::kable(summary_4,
  align = "c",
  col.names = " "
)

Correlation Attenuation for Categorical Variables

Author

Affiliation

Published

Citation

An Intro to Correlation Attenuation

An Example of Attenuated Correlation for Dichotomous Variable

An Example of Attenuated Correlation for Categorical Variable with Three Thresholds

Verification with simulated data

Reasoning of Generalization to X and Y* with Any Means and Variances

Verification with simulated data (random mean and variance)

Footnotes

Citation

Correlation_XY*	0.500
Attenuation_Formula	0.759
Correlation_Formula	0.379
Attenuation_Data	0.747
Correlation_Data	0.382

Label	Proportion
0	0.6988
1	0.3012