m-Py · February 12, 2020 10:58
diff --git a/covariate_regression.R b/covariate_regression.R


 ## This script illustrates that type 1 sums of squares lead to an increased alpha
 ## error rate for the treatment effect when a predictive covariate is included in
 ## the regression model after the treatment term.


 # Simulate one data set with a null treatment effect and return the p-value
 # of the treatment term from a linear regression that also contains a
 # covariate and the treatment-by-covariate interaction
 #
 # param N: number of simulated observations, default 100
 # param beta_covariate: weight of the covariate in the outcome, default 0.5
 # param coding: values used for the treatment variable, default 0/1; they are
 #    recycled to length N and then shuffled
 # param AOV: forwarded to `get_p_value()`; `FALSE` (= default) reads the
 #    p-value from `summary()`, 1 uses `anova()`, 2 or 3 use `car::Anova()`
 #    with type 2 or type 3 sum of squares
 # return: the p-value reported for the "treatment" term
 #
 # Details:
 # The outcome is generated as
 # `outcome = beta_covariate * covariate + error`
 # with `covariate` and `error` both drawn via `rnorm(N)`. The treatment does
 # not enter the outcome at all, i.e. its true effect is zero.
 #
 covariate_regression <- function(
   N = 100,
   beta_covariate = 0.5,
   coding = c(0, 1),
   AOV = FALSE
 ) {
   # Random draws happen in the order: covariate, noise, treatment shuffle
   sim <- data.frame(covariate = rnorm(N))
   noise <- rnorm(N)
   sim[["outcome"]] <- beta_covariate * sim[["covariate"]] + noise
   # Treatment assignment is unrelated to the outcome
   sim[["treatment"]] <- sample(rep_len(coding, N))
   # Treatment is entered before the covariate in the model formula
   fit <- lm(outcome ~ treatment * covariate, data = sim)
   get_p_value(fit, "treatment", AOV)
 }

 # Extract the p-value of a single model term from a fitted model
 #
 # param model: a fitted model, e.g. the result of `lm()`
 # param effect: name of the term (a single string) as it appears in the row
 #    names of the coefficient / ANOVA table, e.g. "treatment"
 # param AOV: how the p-value is obtained; 0 (or `FALSE`) reads it from the
 #    coefficient table of `summary()` (= default), 1 uses `anova()`,
 #    2 or 3 use `car::Anova()` with `type = AOV`
 # return: the p-value found in the table row named `effect`
 #
 # Details:
 # Stops with an error if `AOV` is not a single value in 0:3, if `effect` is
 # not a single string, or if no row of the table is named `effect`
 # (instead of silently returning a zero-length vector).
 #
 get_p_value <- function(model, effect, AOV = 0) {
   stopifnot(length(AOV) == 1, AOV %in% 0:3)
   stopifnot(is.character(effect), length(effect) == 1)
   if (AOV == 0) {
     tab <- summary(model)$coefficients
     p_column <- "Pr(>|t|)"
   } else if (AOV == 1) {
     tab <- anova(model)
     p_column <- "Pr(>F)"
   } else {
     tab <- car::Anova(model, type = AOV)
     p_column <- "Pr(>F)"
   }
   row <- match(effect, rownames(tab))
   if (is.na(row)) {
     stop(
       "Effect '", effect, "' not found; available rows: ",
       paste(rownames(tab), collapse = ", "),
       call. = FALSE
     )
   }
   tab[row, p_column]
 }


 # Baseline: the treatment p-value is read from the `summary()` coefficient
 # table. Values after `#>` are the rejection rates recorded by the author;
 # no seed is set in the code shown here, so reruns will differ slightly.
 mean(replicate(n = 10000, expr = covariate_regression()) <= 0.05)
 #> 0.0513

 # Same model, but the p-value now comes from `anova()`:
 mean(replicate(n = 10000, expr = covariate_regression(AOV = 1)) <= 0.05)
 #> 0.0828

 # A different treatment coding does not seem to change this
 # (note: `-1:1` expands to the three values -1, 0, 1):
 mean(
   replicate(
     n = 10000,
     expr = covariate_regression(AOV = 1, coding = -1:1)
   ) <= 0.05
 )
 #> 0.0793

 # With a covariate that does not predict the outcome there is no inflation:
 mean(
   replicate(
     n = 10000,
     expr = covariate_regression(AOV = 1, beta_covariate = 0)
   ) <= 0.05
 )
 #> 0.05

 # Type 2 and type 3 sum of squares are not affected either
 mean(replicate(n = 10000, expr = covariate_regression(AOV = 2)) <= 0.05)
 #> 0.0484

 mean(replicate(n = 10000, expr = covariate_regression(AOV = 3)) <= 0.05)
 #> 0.0531

 ### Problem disappears when the covariate is entered before the treatment
 ### in the regression model!
 # lm(outcome ~ covariate * treatment, data = data)


	## This script illustrates that type 1 sums of squares lead to an increased alpha
	## error rate for the treatment effect when a predictive covariate is included in
	## the regression model after the treatment term.


	# Simulate one data set with a null treatment effect and return the p-value
	# of the treatment term from a linear regression that also contains a
	# covariate and the treatment-by-covariate interaction
	#
	# param N: number of simulated observations, default 100
	# param beta_covariate: weight of the covariate in the outcome, default 0.5
	# param coding: values used for the treatment variable, default 0/1; they are
	#    recycled to length N and then shuffled
	# param AOV: forwarded to `get_p_value()`; `FALSE` (= default) reads the
	#    p-value from `summary()`, 1 uses `anova()`, 2 or 3 use `car::Anova()`
	#    with type 2 or type 3 sum of squares
	# return: the p-value reported for the "treatment" term
	#
	# Details:
	# The outcome is generated as
	# `outcome = beta_covariate * covariate + error`
	# with `covariate` and `error` both drawn via `rnorm(N)`. The treatment does
	# not enter the outcome at all, i.e. its true effect is zero.
	#
	covariate_regression <- function(
	  N = 100,
	  beta_covariate = 0.5,
	  coding = c(0, 1),
	  AOV = FALSE
	) {
	  # Random draws happen in the order: covariate, noise, treatment shuffle
	  sim <- data.frame(covariate = rnorm(N))
	  noise <- rnorm(N)
	  sim[["outcome"]] <- beta_covariate * sim[["covariate"]] + noise
	  # Treatment assignment is unrelated to the outcome
	  sim[["treatment"]] <- sample(rep_len(coding, N))
	  # Treatment is entered before the covariate in the model formula
	  fit <- lm(outcome ~ treatment * covariate, data = sim)
	  get_p_value(fit, "treatment", AOV)
	}

	# Extract the p-value of a single model term from a fitted model
	#
	# param model: a fitted model, e.g. the result of `lm()`
	# param effect: name of the term (a single string) as it appears in the row
	#    names of the coefficient / ANOVA table, e.g. "treatment"
	# param AOV: how the p-value is obtained; 0 (or `FALSE`) reads it from the
	#    coefficient table of `summary()` (= default), 1 uses `anova()`,
	#    2 or 3 use `car::Anova()` with `type = AOV`
	# return: the p-value found in the table row named `effect`
	#
	# Details:
	# Stops with an error if `AOV` is not a single value in 0:3, if `effect` is
	# not a single string, or if no row of the table is named `effect`
	# (instead of silently returning a zero-length vector).
	#
	get_p_value <- function(model, effect, AOV = 0) {
	  stopifnot(length(AOV) == 1, AOV %in% 0:3)
	  stopifnot(is.character(effect), length(effect) == 1)
	  if (AOV == 0) {
	    tab <- summary(model)$coefficients
	    # The column name contains literal pipes; `\|` is not a valid escape
	    # in an R string, so the pipes must be written unescaped
	    p_column <- "Pr(>|t|)"
	  } else if (AOV == 1) {
	    tab <- anova(model)
	    p_column <- "Pr(>F)"
	  } else {
	    tab <- car::Anova(model, type = AOV)
	    p_column <- "Pr(>F)"
	  }
	  row <- match(effect, rownames(tab))
	  if (is.na(row)) {
	    stop(
	      "Effect '", effect, "' not found; available rows: ",
	      paste(rownames(tab), collapse = ", "),
	      call. = FALSE
	    )
	  }
	  tab[row, p_column]
	}


	# Baseline: the treatment p-value is read from the `summary()` coefficient
	# table. Values after `#>` are the rejection rates recorded by the author;
	# no seed is set in the code shown here, so reruns will differ slightly.
	mean(replicate(n = 10000, expr = covariate_regression()) <= 0.05)
	#> 0.0513

	# Same model, but the p-value now comes from `anova()`:
	mean(replicate(n = 10000, expr = covariate_regression(AOV = 1)) <= 0.05)
	#> 0.0828

	# A different treatment coding does not seem to change this
	# (note: `-1:1` expands to the three values -1, 0, 1):
	mean(
	  replicate(
	    n = 10000,
	    expr = covariate_regression(AOV = 1, coding = -1:1)
	  ) <= 0.05
	)
	#> 0.0793

	# With a covariate that does not predict the outcome there is no inflation:
	mean(
	  replicate(
	    n = 10000,
	    expr = covariate_regression(AOV = 1, beta_covariate = 0)
	  ) <= 0.05
	)
	#> 0.05

	# Type 2 and type 3 sum of squares are not affected either
	mean(replicate(n = 10000, expr = covariate_regression(AOV = 2)) <= 0.05)
	#> 0.0484

	mean(replicate(n = 10000, expr = covariate_regression(AOV = 3)) <= 0.05)
	#> 0.0531

	### Problem disappears when the covariate is entered before the treatment
	### in the regression model!
	# lm(outcome ~ covariate * treatment, data = data)