Skip to content

Instantly share code, notes, and snippets.

@bestdan
Created February 12, 2021 15:28
Show Gist options
  • Save bestdan/77740565c6d012983bbe2ace165546df to your computer and use it in GitHub Desktop.
Save bestdan/77740565c6d012983bbe2ace165546df to your computer and use it in GitHub Desktop.
Medians vs Log-Means
#' @title Log-means versus Medians
#' @author Daniel Egan
#' @description When data has a power law or extremely skewed distribution,
#' using a log-mean usually results in more stable and useful central estimates
#' compared to a mean or a median.
#' https://towardsdatascience.com/on-average-youre-using-the-wrong-average-geometric-harmonic-means-in-data-analysis-2a703e21ea0
library(dplyr)
library(tidyr)
library(ggplot2)
# Simulated population: 30,000 draws whose base-10 logarithm is normally
# distributed (mean 3, sd 3), giving a heavily right-skewed variable.
rdata <- 10^rnorm(n = 30000, mean = 3, sd = 3)

# Number of repeated samples drawn from the population.
n_samples <- 200

# One row per sample; the three estimate columns start out as NA.
means_df <- data.frame(
  sample_id = seq_len(n_samples),
  sample_median = NA_real_,
  sample_mean = NA_real_,
  sample_log_mean = NA_real_
)
#' Log-mean (geometric mean) computed through base-10 logarithms
#'
#' Averages the values on the log10 scale and maps the result back to the
#' original scale.
#'
#' @param x A numeric vector.
#' @return A single number: 10 raised to the mean of `log10(x)`.
log10mean <- function(x) {
  avg_exponent <- mean(log10(x))
  10^avg_exponent
}
# log10mean(rdata[1:10])
# Draw n_samples random samples of 200 values from rdata and record each
# sample's median, arithmetic mean and log-mean in the matching row of
# means_df.
# seq_len() is used instead of 1:n_samples so that the loop is skipped when
# n_samples is 0 (1:0 would iterate over c(1, 0)).
for (i in seq_len(n_samples)) {
  this_sample <- sample(rdata, size = 200)
  means_df$sample_median[i] <- median(this_sample)
  means_df$sample_mean[i] <- mean(this_sample)
  means_df$sample_log_mean[i] <- log10mean(this_sample)
}
# Reshape to one row per (sample, estimator) pair, then draw the sampling
# distribution of each estimator in its own panel with independent axes.
means_df %>%
  pivot_longer(
    cols = c(sample_median, sample_mean, sample_log_mean),
    names_to = "type"
  ) %>%
  ggplot(mapping = aes(x = value, group = type)) +
  geom_density() +
  facet_wrap(facets = . ~ type, scales = "free") +
  labs(
    title = "Sample means using different methods",
    subtitle = "Note fewer outliers/ more normal distribution for median & log-mean."
  )
#' Suppose we used each method and looked at its out-of-sample error,
#' i.e. the RMSE one sample's estimate would have when compared to another sample.
#' Root-mean-square error
#'
#' The original body computed `sqrt(mean(x))`, omitting the squaring step, so
#' signed errors cancelled out and a negative mean produced NaN.
#'
#' @param x A numeric vector of errors (differences between estimate and
#'   reference).
#' @return A single number: the square root of the mean of the squared errors.
rmse <- function(x) {
  sqrt(mean(x^2))
}
# Absolute differences between every sample's arithmetic mean and every
# sample's estimate under each method (all pairwise combinations via outer()).
# NOTE: the column is called `ratio` but holds absolute differences.
median_ratio <- data.frame(
  type = "median",
  ratio = abs(as.vector(
    outer(means_df$sample_mean, means_df$sample_median, `-`)
  ))
)
mean_ratio <- data.frame(
  type = "mean",
  ratio = abs(as.vector(
    outer(means_df$sample_mean, means_df$sample_mean, `-`)
  ))
)
logmean_ratio <- data.frame(
  type = "logmean",
  ratio = abs(as.vector(
    outer(means_df$sample_mean, means_df$sample_log_mean, `-`)
  ))
)

# Stack the three methods into a single long data frame.
ratios_df <- rbind(median_ratio, mean_ratio, logmean_ratio)
# Normal Q-Q plot of the absolute errors, one panel per estimator.
# geom_qq() draws the sample quantiles; with geom_qq_line() alone only the
# reference line is shown and the distribution itself is invisible.
# The dangling `scales = ` argument is set to "free" so each estimator's
# panel is drawn on its own range.
ratios_df %>%
  ggplot(aes(sample = ratio, group = type)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(. ~ type, scales = "free") +
  labs(
    title = "Distribution of absolute errors",
    subtitle = "Note fewer outliers/ more normal distribution for median & log-mean."
  )
#' Log-means vs medians?
# Scatter of each sample's median against its log-mean, with the y = x
# reference line. coord_cartesian() zooms the view to [0, 5000] on both axes
# without dropping points outside that window.
# The `+` after coord_cartesian() was missing, so labs() ran as a separate
# statement and its labels were never applied to the plot. The labels had
# also been copied from the error plot, so they now describe this chart.
means_df %>%
  ggplot(aes(x = sample_median, y = sample_log_mean)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 5000)) +
  labs(
    title = "Sample medians vs. sample log-means",
    subtitle = "Each point is one sample; the line marks median = log-mean."
  )
#' What about data that weren't generated from a perfect base-10 log-normal distribution?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment