Skip to content

Instantly share code, notes, and snippets.

@jschoeley
Created December 12, 2020 21:08
Show Gist options
  • Save jschoeley/45a0e05b54512744e85d3eb00eeeae43 to your computer and use it in GitHub Desktop.
Save jschoeley/45a0e05b54512744e85d3eb00eeeae43 to your computer and use it in GitHub Desktop.
Disaggregating counts from Lexis surface with irregular grouping via PCLMM
# Disaggregation of 2D histogram of Poisson-counts with irregular bins
# based upon the penalized composite link mixed model
#
# Jonas Schöley, based on code from
# Deirdre I.R. Douma: The Penalized Composite Link Mixed Model.
# see further references in code
# Given is a matrix of observed counts over 2 dimensions (here
# year and age, i.e. a so-called Lexis surface).
# Each entry in the matrix corresponds to a rectangular region on the
# 2D surface. The regions do not overlap, but may be of irregular
# dimensions. The regions completely fill the rectangle given by the
# range of the age and year coordinates.
#
# It is known that the counts have been aggregated into the regions
# from counts observed on a smaller grid. The goal is to estimate those
# original counts on a smaller grid. This is achieved by the penalized
# composite link (mixed) model. Part of this model is a matrix <C> which
# defines the aggregation pattern, i.e. the association between regions on
# the original, fine grid and regions on the aggregation grid.
# Init ------------------------------------------------------------
set.seed(1987)
library(Matrix)
cnst <- list(
# number of year-cells on the fine grid
years = 120,
# number of age-cells on the fine grid
ages = 100,
# two different age groupings
age_grouping_a =
rep(1:5, c(20, 10, 10, 10, 50)),
age_grouping_b =
rep(1:4, c(20, 20, 10, 50))
)
# Construct composition matrix ------------------------------------
# number of cells in ungrouped lexis grid
J = cnst$years*cnst$ages
# Lexis grid cells are sequentially enumerated in column-major order
lexis_grid_ungrouped <-
matrix(1:J, nrow = cnst$ages, ncol = cnst$years)
# superimpose the aggregation pattern on top of the ungrouped lexis grid
lexis_grid_aggregation_pattern <- matrix(
c(rep(cnst$age_grouping_a, 100), rep(cnst$age_grouping_b, 20)),
nrow = cnst$ages, ncol = cnst$years
)
# plot the aggregation pattern
image(t(lexis_grid_aggregation_pattern))
# give a unique label to each aggregation region on the ungrouped lexis grid
lexis_grid_aggregation_key <- lexis_grid_aggregation_pattern
for (j in 2:cnst$years) {
lexis_grid_aggregation_key[,j] <-
lexis_grid_aggregation_key[cnst$ages,j-1] +
lexis_grid_aggregation_key[,j]
}
# number of cells in grouped lexis surface
I = max(lexis_grid_aggregation_key)
# composition matrix, associates ungrouped Lexis grid cells (columns)
# with Lexis aggregation regions (rows)
C = matrix(0, nrow = I, ncol = J)
for (i in 1:I) {
C[i,] <- c(lexis_grid_aggregation_key)==i
}
# Simulate aggregated data ----------------------------------------
# simulate ungrouped (true) counts from Poisson distribution with
# linearly increasing trend with age
lexis_grid_ungrouped_counts <-
matrix(rpois(J, 1:cnst$ages), nrow = cnst$ages, ncol = cnst$years)
# aggregate ungrouped counts into vector according to the aggregation
# pattern defined in the composition matrix
# the indices of the vector correspond to the labels of the
# aggregation regions
y = C%*%c(lexis_grid_ungrouped_counts)
#sum(y) == sum(lexis_grid_ungrouped_counts)
# plot ungrouped counts
image(t(lexis_grid_ungrouped_counts))
# Construct B-spline basis over ungrouped Lexis grid --------------
# All code in this section copied and slightly changed by Jonas Schöley from
# Deirdre I.R. Douma: The Penalized Composite Link Mixed Model. Master Thesis.
# https://www.universiteitleiden.nl/binaries/content/assets/science/mi/scripties/statscience/douma-d-master-thesiss1577921.pdf
rowwisekr <- function (X1, X2) {
one.1 <- matrix(1, 1, ncol(X1))
one.2 <- matrix(1, 1, ncol(X2))
kronecker(X1, one.2) * kronecker(one.1, X2)
}
# 2 column matrix giving the year and age coordinates for the centroids
# of the cells on the fine Lexis grid. Row numbers correspond to cell_id.
coord <- cbind(
rep(1:cnst$years, each = cnst$ages),
rep(1:cnst$ages, cnst$years)
)
k = 15; deg = 3; ord_D = 2
# b-spline basis for year coordinates
coord1.min <- min(coord[, 1])
coord1.max <- max(coord[, 1])
d1 <- (coord1.max - coord1.min)/(k - deg)
knots1 <- seq(coord1.min - deg * d1, coord1.max + deg * d1, by = d1)
B1 <- splines::splineDesign(
x = coord[, 1], knots = knots1, ord = deg + 1,
outer.ok = TRUE, sparse = TRUE
)
# b-spline basis for age coordinates
coord2.min <- min(coord[, 2])
coord2.max <- max(coord[, 2])
d2 <- (coord2.max - coord2.min)/(k - deg)
knots2 <- seq(coord2.min - deg * d2, coord2.max + deg * d2, by = d2)
B2 <- splines::splineDesign(
x = coord[, 2], knots = knots2, ord = deg + 1,
outer.ok = TRUE, sparse = TRUE
)
# difference matrix
D <- diff(diag(k), diff = ord_D)
# reparametrisation - Mixed model decomposition
# SVD of P and constructing X and Z for two dimensions
# This is really clever as the mixed model formulation gets rid of
# the need to do an expensive grid search for the optimal lambda
# smoothing parameter. Instead one assumes the parameters of the
# b-spline to originate from a multivariate normal distribution, thereby
# performing the regularization.
# - Jonas Schöley
P <- t(D)%*%D #kxk
eigen.P <- eigen(P)
U <- eigen.P$vectors[, 1:(k - ord_D)]
Sigma <- eigen.P$values[1:(k - ord_D)]
L <- U %*% diag(sqrt(Sigma))
Z1 <- B1 %*% L %*% solve(t(L) %*% L)
Z2 <- B2 %*% L %*% solve(t(L) %*% L)
X_mat <- cbind(rep(1, k), 1:k) #kxp
X1 <- B1 %*% X_mat #Jxp
X2 <- B2 %*% X_mat #Jxp
# Constructing X and Z with row-wise Kronecker product
X <- rowwisekr(X2, X1)
Z <- cbind(rowwisekr(Z2, X1), rowwisekr(X2, Z1), rowwisekr(Z2, Z1))
# Fit PCLM --------------------------------------------------------
# All code in this section copied and slightly changed from
# Deirdre I.R. Douma: The Penalized Composite Link Mixed Model. Master Thesis.
# https://www.universiteitleiden.nl/binaries/content/assets/science/mi/scripties/statscience/douma-d-master-thesiss1577921.pdf
I <- nrow(y)
J <- nrow(X)
q <- ncol(Z)
y <- as.matrix(y, ncol = 1)
# Obtaining initial estimates
C <- Matrix(C, sparse = TRUE)
row_C <- rowSums(C) # cells contributing to aggregate count
mean_row <- y / row_C # counts per contributing cell if contributions to aggregate are uniform
y_long <- rep(mean_row, row_C) # estimated cell counts under uniform contributions to aggregate
fit <- glm(floor(y_long) ~ 0 + as.matrix(X), family = 'poisson')
# Setting up preliminaries for loop
dif <- 10
theta <- c(0)
beta_old <- fit$coefficients #Vector of p
u_old <- c(rep(0, q)) #Vector of q
# I changed this to log-fitted values as seemed to be intended in the
# original code. otherwise the exp in the next step does not make sense.
# I also changed the glm step above to poisson, mainly due to the log-link
# - Jonas Schöley
eta_init <- log(fit$fitted.values) #Jx1
gamm <- exp(eta_init) #Jx1
Gamm <- .sparseDiagonal(n = J, x = as.vector(gamm)) #JxJ
H <- numeric(1)
s <- numeric(1)
mu <- C %*% gamm #Ix1
mu <- Matrix(mu, sparse = TRUE)
W <- .sparseDiagonal(n = I, x = as.vector(mu)) #IxI
Winv <- .sparseDiagonal(n = I, x = (1 / as.vector(mu))) #IxI
X_breve <- Winv %*% C %*% Gamm %*% X #Ixp
Z_breve <- Winv %*% C %*% Gamm %*% Z #Ixq
z <- X_breve %*% beta_old + Z_breve %*% u_old + Winv %*% (y - mu) #Ix1
G <- .sparseDiagonal(n = q, x = theta) #qxq
ZGZ <- Z_breve %*% G %*% t(Z_breve) #IxI
V <- Winv + ZGZ #IxI
V <- Matrix(V, sparse = TRUE)
V_inv <- solve(V) #IxI
while(any(dif > 1e-6)){
#Estimating Coefficients
beta <- solve(t(X_breve) %*% V_inv %*% X_breve) %*% t(X_breve) %*% V_inv %*% z #p
u <- G %*% t(Z_breve) %*% V_inv %*% (z - X_breve %*% beta_old) #q
eta <- X %*% beta + Z %*% u #Jx1
# Applying Newton-Raphson to calculate theta
P <- V_inv - V_inv %*% X_breve %*% solve(t(X_breve) %*% V_inv %*% X_breve) %*%
t(X_breve) %*% V_inv
# Hessian matrix
H <- (1/2) %*% sum(diag((t(Z_breve) %*% P %*% Z_breve %*% t(Z_breve)%*% P %*% Z_breve))) -
t(z) %*% P %*% Z_breve %*% t(Z_breve) %*% P%*% Z_breve %*% t(Z_breve) %*% P %*% z
s <- -(1/2) %*% sum(diag((t(Z_breve) %*% P %*% Z_breve))) +
(1/2) %*%(t(z) %*% P %*% Z_breve %*% t(Z_breve) %*% P %*% z)
theta_new <- as.numeric(theta - solve(H) * s)
gamm <- as.numeric(exp(eta))
Gamm <- .sparseDiagonal(n = J, x = as.vector(gamm)) #JxJ
mu <- C %*% gamm #Ix1
mu <- Matrix(mu, sparse = TRUE)
W <- .sparseDiagonal(n = I, x = as.vector(mu)) #IxI
Winv <- .sparseDiagonal(n = I, x = (1 / as.vector(mu))) #IxI
X_breve <- Winv %*% C %*% Gamm %*% X #Ixp
Z_breve <- Winv %*% C %*% Gamm %*% Z #Ixq
z <- X_breve %*% beta + Z_breve %*% u + Winv %*% (y - mu) #Ix1
G <- .sparseDiagonal(n = q, x = theta_new) #qxq
ZGZ <- Z_breve %*% G %*% t(Z_breve) #IxI
V <- Winv + ZGZ #IxI
V <- Matrix(V, sparse = TRUE)
V_inv <- solve(V) #IxI
# Updating Parameters
dif <- abs(beta - beta_old)
beta_old <- beta
theta <- theta_new
}
# Show ungrouped counts -------------------------------------------
plot(mu-y)
abs(sum(gamm) - sum(y))
lexis_grid_ungrouped_counts_predicted <-
matrix(gamm, nrow = cnst$ages, ncol = cnst$years)
# true ungrouped (points) vs. predicted ungrouped (surface)
rgl::persp3d(
x = 1:cnst$ages, y = 1:cnst$years, lexis_grid_ungrouped_counts_predicted,
color = 'grey'
)
rgl::points3d(
x = coord[,2], y = coord[,1], z = c(lexis_grid_ungrouped_counts),
col = 'black', alpha = 0.2
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment