hongyuanjia · March 9, 2022 14:21 · hongyuanjia · Feb 16, 2022
diff --git a/init_stan_data.R b/init_stan_data.R
 #' Initialize input data for Stan
 #'
 #' Splits the field, computed and designed data into input, output and
 #' calibration-parameter tables and rescales each of them.
 #'
 #' @param field A data.frame that contains field-measured data
 #'
 #' @param computed A data.frame that contains computed (simulated) data
 #'
 #' @param designed A data.frame that contains designed data for prediction.
 #'        Default is set to the same input as `computed`.
 #'
 #' @param inputs,outputs,params One or more unquoted expressions separated by
 #'        commas. Normally unquoted column names for input, output and
 #'        calibration parameter, respectively. Will directly pass to
 #'        [dplyr::select()].
 #'
 #' @return A named list of 3 elements:
 #'
 #' - `input`: A list of 3 elements. They are all min-max normalized using the
 #'   per-column minimum and maximum of the combined raw observed and computed
 #'   input. Designed values outside that range therefore fall outside [0, 1].
 #'   - `observed`: A tibble with processed observed input data
 #'   - `computed`: A tibble with processed computed input data
 #'   - `designed`: A tibble with processed designed input data
 #' - `output`: A list of 2 elements. Every column is standardized to z-scores
 #'   (centered on its mean and divided by its standard deviation); the
 #'   observed and computed tables are standardized independently.
 #'   - `observed`: A data frame with processed observed output data
 #'   - `computed`: A data frame with processed computed output data
 #' - `param`: A data frame with processed data of calibration parameters. Each
 #'   column has been min-max normalized with its own minimum and maximum.
 #'
 #' @examples
 #'
 #' init_stan_data(
 #'     field = data_field, computed = data_comp,
 #'     inputs = c(tdb, rh, solar_rad),
 #'     outputs = c(total_elec, heating_gas, cooling_elec),
 #'     params = c(tc1, tc2, tc3)
 #' )
 #'
 init_stan_data <- function(field, computed, designed = computed, inputs, outputs, params) {
    # Columns are picked by name rather than by position:
    #
    # 1. The names give you the basic information about the data.
    #
    # 2. Names keep working when the number or order of columns changes,
    #    whereas a stale index silently selects the wrong column.
    #
    # Local names are descriptive ('in_obs', 'out_obs', ...) instead of the
    # terse 'y', 'xf', 'xc' commonly used for this kind of data.

    # Every select() call is namespace-qualified so the function does not
    # depend on dplyr being attached by the caller.

    # observed inputs
    in_obs <- field %>% dplyr::select({{ inputs }})
    # computed inputs
    in_sim <- computed %>% dplyr::select({{ inputs }})
    # designed inputs for predictions
    in_pred <- designed %>% dplyr::select({{ inputs }})

    # observed outputs
    out_obs <- field %>% dplyr::select({{ outputs }})
    # computed outputs
    out_sim <- computed %>% dplyr::select({{ outputs }})

    # calibration parameters
    par <- computed %>% dplyr::select({{ params }})

    # min-max normalize observed, computed inputs and designed inputs
    # NOTE: The scaling range is the min and max of the combined observed and
    #       computed inputs; designed inputs are scaled with that same range
    #       but do not contribute to it.
    in_comb <- dplyr::bind_rows(in_obs, in_sim)
    # Lambdas are used because passing extra arguments through across()'s
    # `...` is deprecated in recent dplyr releases.
    in_comb_min <- in_comb %>% dplyr::summarise(
        dplyr::across(dplyr::everything(), ~ min(.x, na.rm = TRUE))
    )
    in_comb_max <- in_comb %>% dplyr::summarise(
        dplyr::across(dplyr::everything(), ~ max(.x, na.rm = TRUE))
    )
    in_obs_norm  <- minmax_norm_df(in_obs, in_comb_min, in_comb_max)
    in_sim_norm  <- minmax_norm_df(in_sim, in_comb_min, in_comb_max)
    in_pred_norm <- minmax_norm_df(in_pred, in_comb_min, in_comb_max)

    # min-max normalize calibration parameters, each column by its own range
    par_norm <- par %>% dplyr::mutate(
        dplyr::across(
            dplyr::everything(),
            ~ minmax_norm(.x, min(.x, na.rm = TRUE), max(.x, na.rm = TRUE))
        )
    )

    # standardize observed and computed outputs
    # NOTE(review): each table is z-scored with its own mean and sd, so the
    #       observed and computed outputs end up on different scales. Confirm
    #       this is intended rather than sharing one mean/sd for both.
    out_sim_std <- out_sim %>% dplyr::mutate(
        dplyr::across(dplyr::everything(), zscore_norm)
    )
    out_obs_std <- out_obs %>% dplyr::mutate(
        dplyr::across(dplyr::everything(), zscore_norm)
    )

    # create data as list for input to Stan
    list(
        input = list(
            observed = in_obs_norm,
            computed = in_sim_norm,
            designed = in_pred_norm
        ),
        output = list(
            observed = out_obs_std,
            computed = out_sim_std
        ),
        param = par_norm
    )
 }

 # Bind magrittr's pipe operator to a local `%>%` so code in this file can
 # use the pipe without attaching magrittr via library().
 `%>%` <- magrittr::`%>%`

 # Standardize a numeric vector to z-scores: subtract its mean and divide by
 # its standard deviation.
 #
 # x:     numeric vector to standardize.
 # na.rm: forwarded to mean() and sd(); when TRUE missing values are ignored
 #        while computing the statistics (they stay NA in the result).
 #
 # Returns a vector of the same length as `x`.
 zscore_norm <- function(x, na.rm = TRUE) {
    centre <- mean(x, na.rm = na.rm)
    spread <- sd(x, na.rm = na.rm)
    (x - centre) / spread
 }

 # Rescale `x` linearly so that `min` maps to 0 and `max` maps to 1.
 #
 # x:   numeric vector to rescale.
 # min: lower reference value.
 # max: upper reference value.
 #
 # Returns (x - min) / (max - min). No clamping is applied, so values of `x`
 # outside [min, max] map outside [0, 1]; when `max` equals `min` the
 # division by zero yields NaN or +/-Inf.
 minmax_norm <- function(x, min, max) {
    span <- max - min
    offset <- x - min
    offset / span
 }

 # Min-max normalize every column of `data`, taking each column's bounds from
 # the identically named column of `min` and `max`.
 #
 # data: data frame whose columns are rescaled.
 # min:  data frame (or list) holding the lower bound for each column name.
 # max:  data frame (or list) holding the upper bound for each column name.
 #
 # Returns the column-bound result of purrr::map_dfc(), with the same column
 # names as `data`.
 minmax_norm_df <- function(data, min, max) {
    cols <- names(data)
    purrr::map_dfc(
        stats::setNames(cols, cols),
        function(col) minmax_norm(data[[col]], min[[col]], max[[col]])
    )
 }
	#' Initialize input data for Stan
	#'
	#' Splits the field, computed and designed data into input, output and
	#' calibration-parameter tables and rescales each of them.
	#'
	#' @param field A data.frame that contains field-measured data
	#'
	#' @param computed A data.frame that contains computed (simulated) data
	#'
	#' @param designed A data.frame that contains designed data for prediction.
	#'        Default is set to the same input as `computed`.
	#'
	#' @param inputs,outputs,params One or more unquoted expressions separated by
	#'        commas. Normally unquoted column names for input, output and
	#'        calibration parameter, respectively. Will directly pass to
	#'        [dplyr::select()].
	#'
	#' @return A named list of 3 elements:
	#'
	#' - `input`: A list of 3 elements. They are all min-max normalized using the
	#'   per-column minimum and maximum of the combined raw observed and computed
	#'   input. Designed values outside that range therefore fall outside [0, 1].
	#'   - `observed`: A tibble with processed observed input data
	#'   - `computed`: A tibble with processed computed input data
	#'   - `designed`: A tibble with processed designed input data
	#' - `output`: A list of 2 elements. Every column is standardized to z-scores
	#'   (centered on its mean and divided by its standard deviation); the
	#'   observed and computed tables are standardized independently.
	#'   - `observed`: A data frame with processed observed output data
	#'   - `computed`: A data frame with processed computed output data
	#' - `param`: A data frame with processed data of calibration parameters. Each
	#'   column has been min-max normalized with its own minimum and maximum.
	#'
	#' @examples
	#'
	#' init_stan_data(
	#'     field = data_field, computed = data_comp,
	#'     inputs = c(tdb, rh, solar_rad),
	#'     outputs = c(total_elec, heating_gas, cooling_elec),
	#'     params = c(tc1, tc2, tc3)
	#' )
	#'
	init_stan_data <- function(field, computed, designed = computed, inputs, outputs, params) {
	    # Columns are picked by name rather than by position:
	    #
	    # 1. The names give you the basic information about the data.
	    #
	    # 2. Names keep working when the number or order of columns changes,
	    #    whereas a stale index silently selects the wrong column.
	    #
	    # Local names are descriptive ('in_obs', 'out_obs', ...) instead of the
	    # terse 'y', 'xf', 'xc' commonly used for this kind of data.

	    # Every select() call is namespace-qualified so the function does not
	    # depend on dplyr being attached by the caller.

	    # observed inputs
	    in_obs <- field %>% dplyr::select({{ inputs }})
	    # computed inputs
	    in_sim <- computed %>% dplyr::select({{ inputs }})
	    # designed inputs for predictions
	    in_pred <- designed %>% dplyr::select({{ inputs }})

	    # observed outputs
	    out_obs <- field %>% dplyr::select({{ outputs }})
	    # computed outputs
	    out_sim <- computed %>% dplyr::select({{ outputs }})

	    # calibration parameters
	    par <- computed %>% dplyr::select({{ params }})

	    # min-max normalize observed, computed inputs and designed inputs
	    # NOTE: The scaling range is the min and max of the combined observed and
	    #       computed inputs; designed inputs are scaled with that same range
	    #       but do not contribute to it.
	    in_comb <- dplyr::bind_rows(in_obs, in_sim)
	    # Lambdas are used because passing extra arguments through across()'s
	    # `...` is deprecated in recent dplyr releases.
	    in_comb_min <- in_comb %>% dplyr::summarise(
	        dplyr::across(dplyr::everything(), ~ min(.x, na.rm = TRUE))
	    )
	    in_comb_max <- in_comb %>% dplyr::summarise(
	        dplyr::across(dplyr::everything(), ~ max(.x, na.rm = TRUE))
	    )
	    in_obs_norm  <- minmax_norm_df(in_obs, in_comb_min, in_comb_max)
	    in_sim_norm  <- minmax_norm_df(in_sim, in_comb_min, in_comb_max)
	    in_pred_norm <- minmax_norm_df(in_pred, in_comb_min, in_comb_max)

	    # min-max normalize calibration parameters, each column by its own range
	    par_norm <- par %>% dplyr::mutate(
	        dplyr::across(
	            dplyr::everything(),
	            ~ minmax_norm(.x, min(.x, na.rm = TRUE), max(.x, na.rm = TRUE))
	        )
	    )

	    # standardize observed and computed outputs
	    # NOTE(review): each table is z-scored with its own mean and sd, so the
	    #       observed and computed outputs end up on different scales. Confirm
	    #       this is intended rather than sharing one mean/sd for both.
	    out_sim_std <- out_sim %>% dplyr::mutate(
	        dplyr::across(dplyr::everything(), zscore_norm)
	    )
	    out_obs_std <- out_obs %>% dplyr::mutate(
	        dplyr::across(dplyr::everything(), zscore_norm)
	    )

	    # create data as list for input to Stan
	    list(
	        input = list(
	            observed = in_obs_norm,
	            computed = in_sim_norm,
	            designed = in_pred_norm
	        ),
	        output = list(
	            observed = out_obs_std,
	            computed = out_sim_std
	        ),
	        param = par_norm
	    )
	}

	# Bind magrittr's pipe operator to a local `%>%` so code in this file can
	# use the pipe without attaching magrittr via library().
	`%>%` <- magrittr::`%>%`

	# Standardize a numeric vector to z-scores: subtract its mean and divide by
	# its standard deviation.
	#
	# x:     numeric vector to standardize.
	# na.rm: forwarded to mean() and sd(); when TRUE missing values are ignored
	#        while computing the statistics (they stay NA in the result).
	#
	# Returns a vector of the same length as `x`.
	zscore_norm <- function(x, na.rm = TRUE) {
	    centre <- mean(x, na.rm = na.rm)
	    spread <- sd(x, na.rm = na.rm)
	    (x - centre) / spread
	}

	# Rescale `x` linearly so that `min` maps to 0 and `max` maps to 1.
	#
	# x:   numeric vector to rescale.
	# min: lower reference value.
	# max: upper reference value.
	#
	# Returns (x - min) / (max - min). No clamping is applied, so values of `x`
	# outside [min, max] map outside [0, 1]; when `max` equals `min` the
	# division by zero yields NaN or +/-Inf.
	minmax_norm <- function(x, min, max) {
	    span <- max - min
	    offset <- x - min
	    offset / span
	}

	# Min-max normalize every column of `data`, taking each column's bounds from
	# the identically named column of `min` and `max`.
	#
	# data: data frame whose columns are rescaled.
	# min:  data frame (or list) holding the lower bound for each column name.
	# max:  data frame (or list) holding the upper bound for each column name.
	#
	# Returns the column-bound result of purrr::map_dfc(), with the same column
	# names as `data`.
	minmax_norm_df <- function(data, min, max) {
	    cols <- names(data)
	    purrr::map_dfc(
	        stats::setNames(cols, cols),
	        function(col) minmax_norm(data[[col]], min[[col]], max[[col]])
	    )
	}