Skip to content

Instantly share code, notes, and snippets.

@hongyuanjia
Last active March 9, 2022 14:21
Show Gist options
  • Save hongyuanjia/ffafd318bf0403a8e7836c1711d3e95c to your computer and use it in GitHub Desktop.
Save hongyuanjia/ffafd318bf0403a8e7836c1711d3e95c to your computer and use it in GitHub Desktop.
Initialize input data for Stan
#' Initialize input data for Stan
#'
#' @param field A data.frame that contains field-measured data
#'
#' @param computed A data.frame that contains computed (simulated) data
#'
#' @param designed A data.frame that contains designed data for prediction.
#' Default is set to the same input as `computed.`
#'
#' @param inputs,outputs,params One or more unquoted expressions separated by
#' commas. Normally unquoted column names for input, output and
#' calibration parameter, respectively. Will directly pass to [dplyr::
#' select].
#'
#' @return A named list of 3 elements:
#'
#' - `input`: A list of 3 elements. They are all min-max normalized based on the
#' combination of raw observed and computed input.
#' - `observed`: A tibble with processed observed input data
#' - `computed`: A tibble with processed computed input data
#' - `designed`: A tibble with processed designed input data
#' - `output`: A list of 2 elements. They are all standarized to [0, 1].
#' - `observed`: A tibble with processed observed output data
#' - `computed`: A tibble with processed computed output data
#' - `param`: A tibble with processed data of calibration parameters. The data
#' has been min-max normalized.
#'
#' @examples
#'
#' init_stan_data(
#' field = data_field, computed = data_comp,
#' inputs = c(tdb, rh, solar_rad),
#' outputs = c(total_elec, heating_gas, cooling_elec),
#' params = c(tc1, tc2, tc3)
#' )
#'
init_stan_data <- function(field, computed, designed = computed, inputs, outputs, params) {
# Instead of using number or column index of input, output or parameters, I
# do suggest to use column names.
#
# 1. The names give you the basic information about the data.
#
# 2. And also using names can avoid mistakes when the numbers or positions
# of input data column changes but you did not update the index.
# Instead of using 'y', 'xf', 'xc', etc. that do not contain any meaning, I
# suggest to use names that are more meaningful, e.g. 'out_obs', 'in_obs'.
# observed inputs
in_obs <- field %>% dplyr::select({{inputs}})
# computed inputs
in_sim <- computed %>% select({{inputs}})
# designed inputs for predictions
in_pred <- designed %>% select({{inputs}})
# observed outputs
out_obs <- field %>% dplyr::select({{outputs}})
# computed outputs
out_sim <- computed %>% dplyr::select({{outputs}})
# calibration parameters
par <- computed %>% dplyr::select({{params}})
# min-max normalize observed, computed inputs and designed inputs
# NOTE: Here the normalization is based on the min and max of combined
# observed and computed inputs. It becomes a little verbose to use
# dplyr syntax. Using data.table will make the code much more cleaner.
in_comb <- dplyr::bind_rows(in_obs, in_sim)
in_comb_min <- in_comb %>% dplyr::summarise(
dplyr::across(dplyr::everything(), min, na.rm = TRUE)
)
in_comb_max <- in_comb %>% dplyr::summarise(
dplyr::across(dplyr::everything(), max, na.rm = TRUE)
)
in_obs_norm <- minmax_norm_df(in_obs, in_comb_min, in_comb_max)
in_sim_norm <- minmax_norm_df(in_sim, in_comb_min, in_comb_max)
in_pred_norm <- minmax_norm_df(in_pred, in_comb_min, in_comb_max)
# min-max normalize calibration parameters
par_norm <- par %>% dplyr::mutate(
dplyr::across(
dplyr::everything(),
~minmax_norm(., min(., na.rm = TRUE), max(., na.rm = TRUE))
)
)
# standardize observed and computed outputs
out_sim_std <- out_sim %>% dplyr::mutate(
dplyr::across(dplyr::everything(), zscore_norm)
)
out_obs_std <- out_obs %>% dplyr::mutate(
dplyr::across(dplyr::everything(), zscore_norm)
)
# create data as list for input to Stan
list(
input = list(
observed = in_obs_norm,
computed = in_sim_norm,
designed = in_pred_norm
),
output = list(
observed = out_obs_std,
computed = out_sim_std
),
param = par_norm
)
}
`%>%` <- magrittr::`%>%`
zscore_norm <- function(x, na.rm = TRUE) {
(x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}
minmax_norm <- function(x, min, max) {
(x - min) / (max - min)
}
minmax_norm_df <- function(data, min, max) {
purrr::map_dfc(
setNames(names(data), names(data)),
~minmax_norm(data[[.]], min[[.]], max[[.]])
)
}
@hongyuanjia
Copy link
Author

Example workflow

library(rstan)
library(dplyr)
library(here)

# use here() to specify file path based on the project root
path_dcomp  <- here("data-raw", "comp", "off_HVAC_3_3_randn_Hourly_clsp.csv")
path_dfield <- here("data-raw", "field", "clsp_2_case_off_HVAC_3_3_Hourly.csv")
path_stan   <- here("src", "multop_bc_stan_rewrite_k_z_covariance.stan")

# read data
DATACOMP <- as_tibble(read.csv(path_dcomp, header = TRUE))
DATAFIELD <- as_tibble(read.csv(path_dfield, header = TRUE))

# rename variables
data_field <- DATAFIELD %>%
    rename(
        total_elec = Electricity.Facility,
        heating_gas = Heating.Gas,
        cooling_elec = Cooling.Electricity,
        tdb = Site.Outdoor.Air.Drybulb.Temperature,
        rh = Site.Outdoor.Air.Relative.Humidity,
        solar_rad = Site.Direct.Solar.Radiation.Rate.per.Area
    )
data_comp <- DATACOMP %>%
    rename(
        total_elec = Electricity.Facility,
        heating_gas = Heating.Gas,
        cooling_elec = Cooling.Electricity,
        tdb = Site.Outdoor.Air.Drybulb.Temperature,
        rh = Site.Outdoor.Air.Relative.Humidity,
        solar_rad = Site.Direct.Solar.Radiation.Rate.per.Area
    )

# init data for Stan input
data_stan <- init_stan_data(
    field = data_field, computed = data_comp,
    inputs = c(tdb, rh, solar_rad),
    outputs = c(total_elec, heating_gas, cooling_elec),
    params = c(tc1, tc2, tc3)
)

# create data as list to Stan
input_stan <- list(
    n      = data_stan$input$observed  %>% nrow(),
    m      = data_stan$input$computed  %>% nrow(),
    n_pred = data_stan$input$designed  %>% nrow(),

    p      = data_stan$input$observed  %>% ncol(),
    q      = data_stan$param           %>% ncol(),
    n_obs  = data_stan$output$observed %>% ncol(),

    xf     = data_stan$input$observed  %>% as.matrix(),
    xc     = data_stan$input$computed  %>% as.matrix(),
    x_pred = data_stan$input$designed  %>% as.matrix(),
    tc     = data_stan$param           %>% as.matrix(),
    eta    = data_stan$output$computed %>% as.matrix(),

    # add Gaussian noise
    y  = data_stan$output$observed %>%
        mutate(across(everything(), ~.x + rnorm(n(), 0, 0.0333))) %>%
        as.matrix()
)

rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

fit <- stan(
    file = path_stan,
    data = input_stan,
    iter = 1,
    chains = 1
)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment