Adding impact factor

library(tidyverse)

# fake some data
input_tbl <- lst(
  MarketDay = as.Date(c("2016-01-01", "2016-01-02", "2016-01-03")),
  HourEnding = 1:3,
  FuelType = c("Coal", "Gas", "Hydro"),
  Month = 1,  # because only Jan dates
  Region = "North"
) %>%
  cross_df() %>%
  mutate(
    MarketDay = as.Date(MarketDay, origin = "1970-01-01"),
    # skipping GenMISObyFuel, consumption_factor, withdrawal_factor
    consumption = 10 * (1:n() + rnorm(n())),  # changed this up to make later steps more clear
    withdrawal = 20 * (1:n() + rnorm(n()))
  )

input_tbl
#> # A tibble: 27 x 7
#>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
#>  1 2016-01-01          1 Coal         1 North       -11.3        23.8
#>  2 2016-01-02          1 Coal         1 North         9.01       16.9
#>  3 2016-01-03          1 Coal         1 North        42.1        47.2
#>  4 2016-01-01          2 Coal         1 North        33.0        94.1
#>  5 2016-01-02          2 Coal         1 North        43.0        88.3
#>  6 2016-01-03          2 Coal         1 North        55.4       137. 
#>  7 2016-01-01          3 Coal         1 North        68.6       165. 
#>  8 2016-01-02          3 Coal         1 North        82.0       153. 
#>  9 2016-01-03          3 Coal         1 North        77.2       158. 
#> 10 2016-01-01          1 Gas          1 North        84.7       171. 
#> # … with 17 more rows

# we want total consumption & withdrawal per MarketDay & HourEnding.
# we'll do this as a separate tibble because it'll simplify the next step
summary_tbl <- input_tbl %>%
  group_by(MarketDay, HourEnding) %>%
  summarize(
    hourly_total_consumption = sum(consumption),
    hourly_total_withdrawal = sum(withdrawal)
  ) %>%
  ungroup()

summary_tbl
#> # A tibble: 9 x 4
#>   MarketDay  HourEnding hourly_total_consumption hourly_total_withdrawal
#>   <date>          <int>                    <dbl>                   <dbl>
#> 1 2016-01-01          1                     261.                    549.
#> 2 2016-01-01          2                     388.                    753.
#> 3 2016-01-01          3                     484.                    941.
#> 4 2016-01-02          1                     319.                    680.
#> 5 2016-01-02          2                     398.                    877.
#> 6 2016-01-02          3                     507.                    987.
#> 7 2016-01-03          1                     362.                    696.
#> 8 2016-01-03          2                     464.                    947.
#> 9 2016-01-03          3                     537.                   1075.

# now we compute deltas by subtracting a lagged value from our value
# this would be trickier to implement and reason about if our hourly_total_* columns
# were already in our bigger input_tbl
summary_tbl <- summary_tbl %>%
  arrange(MarketDay, HourEnding) %>%  # to be extra sure our order is chronological
  mutate(
    delta_hourly_consumption = hourly_total_consumption - lag(hourly_total_consumption),
    delta_hourly_withdrawal = hourly_total_withdrawal - lag(hourly_total_withdrawal)
  )

# notice the NA's at the top here!
# there's nothing to subtract; the lag starts with NA, and x - NA = NA
summary_tbl
#> # A tibble: 9 x 6
#>   MarketDay  HourEnding hourly_total_co… hourly_total_wi… delta_hourly_co…
#>   <date>          <int>            <dbl>            <dbl>            <dbl>
#> 1 2016-01-01          1             261.             549.             NA  
#> 2 2016-01-01          2             388.             753.            127. 
#> 3 2016-01-01          3             484.             941.             96.4
#> 4 2016-01-02          1             319.             680.           -165. 
#> 5 2016-01-02          2             398.             877.             78.5
#> 6 2016-01-02          3             507.             987.            109. 
#> 7 2016-01-03          1             362.             696.           -144. 
#> 8 2016-01-03          2             464.             947.            102. 
#> 9 2016-01-03          3             537.            1075.             73.2
#> # … with 1 more variable: delta_hourly_withdrawal <dbl>

# now we can join this back in to our original table
tbl <- input_tbl %>%
  left_join(summary_tbl, by = c("MarketDay", "HourEnding")) %>%
  arrange(MarketDay, HourEnding)

tbl
#> # A tibble: 27 x 11
#>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
#>  1 2016-01-01          1 Coal         1 North       -11.3        23.8
#>  2 2016-01-01          1 Gas          1 North        84.7       171. 
#>  3 2016-01-01          1 Hydro        1 North       188.        354. 
#>  4 2016-01-01          2 Coal         1 North        33.0        94.1
#>  5 2016-01-01          2 Gas          1 North       122.        249. 
#>  6 2016-01-01          2 Hydro        1 North       233.        410. 
#>  7 2016-01-01          3 Coal         1 North        68.6       165. 
#>  8 2016-01-01          3 Gas          1 North       161.        296. 
#>  9 2016-01-01          3 Hydro        1 North       255.        481. 
#> 10 2016-01-02          1 Coal         1 North         9.01       16.9
#> # … with 17 more rows, and 4 more variables:
#> #   hourly_total_consumption <dbl>, hourly_total_withdrawal <dbl>,
#> #   delta_hourly_consumption <dbl>, delta_hourly_withdrawal <dbl>

# too many columns to show in this format, so we'll focus on comsumption-only
tbl %>%
  select(-Month, -Region, -FuelType, -contains("withdrawal"))
#> # A tibble: 27 x 5
#>    MarketDay  HourEnding consumption hourly_total_consu… delta_hourly_cons…
#>    <date>          <int>       <dbl>               <dbl>              <dbl>
#>  1 2016-01-01          1      -11.3                 261.               NA  
#>  2 2016-01-01          1       84.7                 261.               NA  
#>  3 2016-01-01          1      188.                  261.               NA  
#>  4 2016-01-01          2       33.0                 388.              127. 
#>  5 2016-01-01          2      122.                  388.              127. 
#>  6 2016-01-01          2      233.                  388.              127. 
#>  7 2016-01-01          3       68.6                 484.               96.4
#>  8 2016-01-01          3      161.                  484.               96.4
#>  9 2016-01-01          3      255.                  484.               96.4
#> 10 2016-01-02          1        9.01                319.             -165. 
#> # … with 17 more rows

^{Created on 2019-10-30 by the reprex package (v0.3.0)}

library(tidyverse) # all the good stuff library(lubridate) #> #> Attaching package: 'lubridate' #> The following object is masked from 'package:base': #> #> date # I want to build something kinda like your actual data, but simpler # fewer levels of important cols, only a few extra columns, but same general idea df2016north <- lst( MarketDay = seq.Date(from = as_date("2019-01-01"), to = as_date("2019-02-28"), by = "day"), HourEnding = c(1, 2), FuelType = c("Coal", "Gas", "Other"), Month = month(MarketDay), Region = "North" ) %>% cross_df() %>% mutate( MarketDay = as_date(MarketDay), # this one got messed up by cross_df GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE) ) # let's take a peek # should be one row for every possible combo of the stuff in the lst above df2016north #> # A tibble: 20,886 x 6 #> MarketDay HourEnding FuelType Month Region GenMISObyFuel #> <date> <dbl> <chr> <dbl> <chr> <int> #> 1 2019-01-01 1 Coal 1 North 7 #> 2 2019-01-02 1 Coal 1 North 7 #> 3 2019-01-03 1 Coal 1 North 18 #> 4 2019-01-04 1 Coal 1 North 8 #> 5 2019-01-05 1 Coal 1 North 11 #> 6 2019-01-06 1 Coal 1 North 20 #> 7 2019-01-07 1 Coal 1 North 18 #> 8 2019-01-08 1 Coal 1 North 2 #> 9 2019-01-09 1 Coal 1 North 5 #> 10 2019-01-10 1 Coal 1 North 7 #> # … with 20,876 more rows # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column # it might look like this, after we've read it in and formatted it appropriately impact_df <- lst( FuelType = unique(df2016north$FuelType), Month = unique(df2016north$Month) ) %>% cross_df() %>% mutate( impact_factor = sample.int(150:800, nrow(.)) ) # take a peek impact_df #> # A tibble: 6 x 3 #> FuelType Month impact_factor #> <chr> <dbl> <int> #> 1 Coal 1 88 #> 2 Gas 1 22 #> 3 Other 1 113 #> 4 Coal 2 140 #> 5 Gas 2 130 #> 6 Other 2 67 # now we can join to get impact_factor into our df2016north tbl # left join means we'll keep all the columns from the left one, df2016north, # even the rows with no match in impact_df (all rows have matched in this example) df2016north <- left_join(df2016north, impact_df, by = c("FuelType", "Month")) # peek df2016north #> # A tibble: 20,886 x 7 #> MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor #> <date> <dbl> <chr> <dbl> <chr> <int> <int> #> 1 2019-01-01 1 Coal 1 North 7 88 #> 2 2019-01-02 1 Coal 1 North 7 88 #> 3 2019-01-03 1 Coal 1 North 18 88 #> 4 2019-01-04 1 Coal 1 North 8 88 #> 5 2019-01-05 1 Coal 1 North 11 88 #> 6 2019-01-06 1 Coal 1 North 20 88 #> 7 2019-01-07 1 Coal 1 North 18 88 #> 8 2019-01-08 1 Coal 1 North 2 88 #> 9 2019-01-09 1 Coal 1 North 5 88 #> 10 2019-01-10 1 Coal 1 North 7 88 #> # … with 20,876 more rows # that only showed one FuelType-Month combo # we can randomly sample to better understand what happened sample_n(df2016north, 20) #> # A tibble: 20 x 7 #> MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor #> <date> <dbl> <chr> <dbl> <chr> <int> <int> #> 1 2019-02-01 2 Coal 2 North 1 140 #> 2 2019-01-18 2 Coal 2 North 6 140 #> 3 2019-01-31 1 Gas 2 North 3 130 #> 4 2019-02-11 2 Other 2 North 4 67 #> 5 2019-01-07 1 Coal 2 North 13 140 #> 6 2019-01-18 1 Other 2 North 12 67 #> 7 2019-01-20 1 Other 2 North 17 67 #> 8 2019-01-10 1 Other 1 North 1 113 #> 9 2019-01-30 2 Other 2 North 14 67 #> 10 2019-01-23 2 Other 1 North 2 113 #> 11 2019-01-09 1 Gas 1 North 11 22 #> 12 2019-01-04 2 Other 1 North 13 113 #> 13 2019-01-02 2 Coal 1 North 4 88 #> 14 2019-02-24 2 Gas 1 North 9 22 #> 15 2019-01-18 1 Other 1 North 5 113 #> 16 2019-02-20 2 Coal 1 North 14 88 #> 17 2019-02-04 1 Other 2 North 15 67 #> 18 2019-02-26 2 Coal 1 North 1 88 #> 19 2019-02-09 1 Other 2 North 19 67 #> 20 2019-01-21 2 Coal 1 North 19 88

ClaytonJY/group-and-summarize.md