m-Py · October 21, 2020 17:39
diff --git a/small_anticlust_simulation.R b/small_anticlust_simulation.R
 # Test whether splitting data via anticlustering yields group means that are closer to the *true*
 # population mean than a random split does (e.g., when splitting data for cross-validation).
 # Simulate one data set and compare an anticlustering split with a random split.
 #
 # Draws N values from a standard normal distribution (true population mean 0),
 # asks anticlustering() for groups of sizes round(N * split), and scores both
 # that grouping and a random permutation of the same group labels with
 # total_deviation().
 #
 # Arguments:
 #   N:       number of observations to simulate.
 #   split:   relative group sizes; the default c(1, 3) / 4 is a 25/75 split.
 #   squared: forwarded to total_deviation(). Accepting it here is what makes
 #            simulate(squared = TRUE) a valid call instead of an
 #            "unused argument" error.
 #
 # Returns a named numeric vector c(anticlust = ..., random = ...).
 simulate <- function(N = 100, split = c(1, 3) / 4, squared = FALSE) { # default: split 25/75
   data <- rnorm(N)
   groups <- anticlustering(
     data,
     K = round(N * split),
     objective = "variance"
   )
   c(
     anticlust = total_deviation(data, groups, squared = squared),
     # sample(groups) shuffles the labels: same group sizes, random membership
     random = total_deviation(data, sample(groups), squared = squared)
   )
 }

 # Total deviation of the group means from the true population mean.
 #
 # The reference mean is taken to be 0, which matches data drawn with rnorm()
 # at its default settings, so each group mean is itself the deviation.
 #
 # Arguments:
 #   data:    numeric vector of observations.
 #   groups:  grouping vector of the same length as `data`.
 #   squared: if TRUE, sum the squared group means; if FALSE (default), sum
 #            their absolute values.
 #
 # Returns a single number.
 total_deviation <- function(data, groups, squared = FALSE) {
   group_means <- tapply(data, groups, mean)
   if (squared) {
     return(sum(group_means^2))
   }
   # Previously this branch also squared the means, making `squared` a no-op.
   sum(abs(group_means))
 }

 # Run the simulation 500 times and average each measure across runs.
 # replicate() collects the named two-element results as columns, so rowMeans()
 # yields one average for "anticlust" and one for "random".
 rowMeans(replicate(500, simulate()))
 #> anticlust    random 
 #> 0.1558745 0.2424823 

 # Same procedure, requesting squared deviations via squared = TRUE.
 rowMeans(replicate(500, simulate(squared = TRUE)))
 #>  anticlust     random 
 #> 0.02033793 0.05104909 
 # Recorded result: in both runs, anticlustering shows a lower deviation from
 # the *true* mean than the random split.
	# Test whether splitting data via anticlustering yields group means that are closer to the true
	# population mean than a random split does (e.g., when splitting data for cross-validation).
	# Simulate one data set and compare an anticlustering split with a random split.
	#
	# Draws N values from a standard normal distribution (true population mean 0),
	# asks anticlustering() for groups of sizes round(N * split), and scores both
	# that grouping and a random permutation of the same group labels with
	# total_deviation().
	#
	# Arguments:
	#   N:       number of observations to simulate.
	#   split:   relative group sizes; the default c(1, 3) / 4 is a 25/75 split.
	#   squared: forwarded to total_deviation(). Accepting it here is what makes
	#            simulate(squared = TRUE) a valid call instead of an
	#            "unused argument" error.
	#
	# Returns a named numeric vector c(anticlust = ..., random = ...).
	simulate <- function(N = 100, split = c(1, 3) / 4, squared = FALSE) { # default: split 25/75
	  data <- rnorm(N)
	  groups <- anticlustering(
	    data,
	    K = round(N * split),
	    objective = "variance"
	  )
	  c(
	    anticlust = total_deviation(data, groups, squared = squared),
	    # sample(groups) shuffles the labels: same group sizes, random membership
	    random = total_deviation(data, sample(groups), squared = squared)
	  )
	}

	# Total deviation of the group means from the true population mean.
	#
	# The reference mean is taken to be 0, which matches data drawn with rnorm()
	# at its default settings, so each group mean is itself the deviation.
	#
	# Arguments:
	#   data:    numeric vector of observations.
	#   groups:  grouping vector of the same length as `data`.
	#   squared: if TRUE, sum the squared group means; if FALSE (default), sum
	#            their absolute values.
	#
	# Returns a single number.
	total_deviation <- function(data, groups, squared = FALSE) {
	  group_means <- tapply(data, groups, mean)
	  if (squared) {
	    return(sum(group_means^2))
	  }
	  # Previously this branch also squared the means, making `squared` a no-op.
	  sum(abs(group_means))
	}

	# Run the simulation 500 times and average each measure across runs.
	# replicate() collects the named two-element results as columns, so rowMeans()
	# yields one average for "anticlust" and one for "random".
	rowMeans(replicate(500, simulate()))
	#> anticlust random
	#> 0.1558745 0.2424823

	# Same procedure, requesting squared deviations via squared = TRUE.
	rowMeans(replicate(500, simulate(squared = TRUE)))
	#> anticlust random
	#> 0.02033793 0.05104909
	# Recorded result: in both runs, anticlustering shows a lower deviation from
	# the true mean than the random split.