Skip to content

Instantly share code, notes, and snippets.

@ClaytonJY
Last active October 31, 2019 02:59
Show Gist options
  • Save ClaytonJY/ac6853d55cb61d9a2f6885e0679bf66f to your computer and use it in GitHub Desktop.
Save ClaytonJY/ac6853d55cb61d9a2f6885e0679bf66f to your computer and use it in GitHub Desktop.
Adding impact factor
library(tidyverse)

# fake some data
input_tbl <- lst(
  MarketDay = as.Date(c("2016-01-01", "2016-01-02", "2016-01-03")),
  HourEnding = 1:3,
  FuelType = c("Coal", "Gas", "Hydro"),
  Month = 1,  # because only Jan dates
  Region = "North"
) %>%
  cross_df() %>%
  mutate(
    MarketDay = as.Date(MarketDay, origin = "1970-01-01"),
    # skipping GenMISObyFuel, consumption_factor, withdrawal_factor
    consumption = 10 * (1:n() + rnorm(n())),  # changed this up to make later steps more clear
    withdrawal = 20 * (1:n() + rnorm(n()))
  )

input_tbl
#> # A tibble: 27 x 7
#>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
#>  1 2016-01-01          1 Coal         1 North       -11.3        23.8
#>  2 2016-01-02          1 Coal         1 North         9.01       16.9
#>  3 2016-01-03          1 Coal         1 North        42.1        47.2
#>  4 2016-01-01          2 Coal         1 North        33.0        94.1
#>  5 2016-01-02          2 Coal         1 North        43.0        88.3
#>  6 2016-01-03          2 Coal         1 North        55.4       137. 
#>  7 2016-01-01          3 Coal         1 North        68.6       165. 
#>  8 2016-01-02          3 Coal         1 North        82.0       153. 
#>  9 2016-01-03          3 Coal         1 North        77.2       158. 
#> 10 2016-01-01          1 Gas          1 North        84.7       171. 
#> # … with 17 more rows

# we want total consumption & withdrawal per MarketDay & HourEnding.
# we'll do this as a separate tibble because it'll simplify the next step
summary_tbl <- input_tbl %>%
  group_by(MarketDay, HourEnding) %>%
  summarize(
    hourly_total_consumption = sum(consumption),
    hourly_total_withdrawal = sum(withdrawal)
  ) %>%
  ungroup()

summary_tbl
#> # A tibble: 9 x 4
#>   MarketDay  HourEnding hourly_total_consumption hourly_total_withdrawal
#>   <date>          <int>                    <dbl>                   <dbl>
#> 1 2016-01-01          1                     261.                    549.
#> 2 2016-01-01          2                     388.                    753.
#> 3 2016-01-01          3                     484.                    941.
#> 4 2016-01-02          1                     319.                    680.
#> 5 2016-01-02          2                     398.                    877.
#> 6 2016-01-02          3                     507.                    987.
#> 7 2016-01-03          1                     362.                    696.
#> 8 2016-01-03          2                     464.                    947.
#> 9 2016-01-03          3                     537.                   1075.

# now we compute deltas by subtracting a lagged value from our value
# this would be trickier to implement and reason about if our hourly_total_* columns
# were already in our bigger input_tbl
summary_tbl <- summary_tbl %>%
  arrange(MarketDay, HourEnding) %>%  # to be extra sure our order is chronological
  mutate(
    delta_hourly_consumption = hourly_total_consumption - lag(hourly_total_consumption),
    delta_hourly_withdrawal = hourly_total_withdrawal - lag(hourly_total_withdrawal)
  )

# notice the NA's at the top here!
# there's nothing to subtract; the lag starts with NA, and x - NA = NA
summary_tbl
#> # A tibble: 9 x 6
#>   MarketDay  HourEnding hourly_total_co… hourly_total_wi… delta_hourly_co…
#>   <date>          <int>            <dbl>            <dbl>            <dbl>
#> 1 2016-01-01          1             261.             549.             NA  
#> 2 2016-01-01          2             388.             753.            127. 
#> 3 2016-01-01          3             484.             941.             96.4
#> 4 2016-01-02          1             319.             680.           -165. 
#> 5 2016-01-02          2             398.             877.             78.5
#> 6 2016-01-02          3             507.             987.            109. 
#> 7 2016-01-03          1             362.             696.           -144. 
#> 8 2016-01-03          2             464.             947.            102. 
#> 9 2016-01-03          3             537.            1075.             73.2
#> # … with 1 more variable: delta_hourly_withdrawal <dbl>

# now we can join this back in to our original table
tbl <- input_tbl %>%
  left_join(summary_tbl, by = c("MarketDay", "HourEnding")) %>%
  arrange(MarketDay, HourEnding)

tbl
#> # A tibble: 27 x 11
#>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
#>  1 2016-01-01          1 Coal         1 North       -11.3        23.8
#>  2 2016-01-01          1 Gas          1 North        84.7       171. 
#>  3 2016-01-01          1 Hydro        1 North       188.        354. 
#>  4 2016-01-01          2 Coal         1 North        33.0        94.1
#>  5 2016-01-01          2 Gas          1 North       122.        249. 
#>  6 2016-01-01          2 Hydro        1 North       233.        410. 
#>  7 2016-01-01          3 Coal         1 North        68.6       165. 
#>  8 2016-01-01          3 Gas          1 North       161.        296. 
#>  9 2016-01-01          3 Hydro        1 North       255.        481. 
#> 10 2016-01-02          1 Coal         1 North         9.01       16.9
#> # … with 17 more rows, and 4 more variables:
#> #   hourly_total_consumption <dbl>, hourly_total_withdrawal <dbl>,
#> #   delta_hourly_consumption <dbl>, delta_hourly_withdrawal <dbl>

# too many columns to show in this format, so we'll focus on comsumption-only
tbl %>%
  select(-Month, -Region, -FuelType, -contains("withdrawal"))
#> # A tibble: 27 x 5
#>    MarketDay  HourEnding consumption hourly_total_consu… delta_hourly_cons…
#>    <date>          <int>       <dbl>               <dbl>              <dbl>
#>  1 2016-01-01          1      -11.3                 261.               NA  
#>  2 2016-01-01          1       84.7                 261.               NA  
#>  3 2016-01-01          1      188.                  261.               NA  
#>  4 2016-01-01          2       33.0                 388.              127. 
#>  5 2016-01-01          2      122.                  388.              127. 
#>  6 2016-01-01          2      233.                  388.              127. 
#>  7 2016-01-01          3       68.6                 484.               96.4
#>  8 2016-01-01          3      161.                  484.               96.4
#>  9 2016-01-01          3      255.                  484.               96.4
#> 10 2016-01-02          1        9.01                319.             -165. 
#> # … with 17 more rows

Created on 2019-10-30 by the reprex package (v0.3.0)

library(tidyverse)  # all the good stuff
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date

# I want to build something kinda like your actual data, but simpler
# fewer levels of important cols, only a few extra columns, but same general idea
df2016north <- lst(
  MarketDay = seq.Date(from = as_date("2019-01-01"), to = as_date("2019-02-28"), by = "day"),
  HourEnding = c(1, 2),
  FuelType = c("Coal", "Gas", "Other"),
  Month = month(MarketDay),
  Region = "North"
) %>%
  cross_df() %>%
  mutate(
    MarketDay = as_date(MarketDay),  # this one got messed up by cross_df
    GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE)
  )

# let's take a peek
# should be one row for every possible combo of the stuff in the lst above
df2016north
#> # A tibble: 20,886 x 6
#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel
#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>
#>  1 2019-01-01          1 Coal         1 North              7
#>  2 2019-01-02          1 Coal         1 North              7
#>  3 2019-01-03          1 Coal         1 North             18
#>  4 2019-01-04          1 Coal         1 North              8
#>  5 2019-01-05          1 Coal         1 North             11
#>  6 2019-01-06          1 Coal         1 North             20
#>  7 2019-01-07          1 Coal         1 North             18
#>  8 2019-01-08          1 Coal         1 North              2
#>  9 2019-01-09          1 Coal         1 North              5
#> 10 2019-01-10          1 Coal         1 North              7
#> # … with 20,876 more rows

# to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
# it might look like this, after we've read it in and formatted it appropriately
impact_df <- lst(
  FuelType = unique(df2016north$FuelType),
  Month = unique(df2016north$Month)
) %>%
  cross_df() %>%
  mutate(
    impact_factor = sample.int(150:800, nrow(.))
  )

# take a peek
impact_df
#> # A tibble: 6 x 3
#>   FuelType Month impact_factor
#>   <chr>    <dbl>         <int>
#> 1 Coal         1            88
#> 2 Gas          1            22
#> 3 Other        1           113
#> 4 Coal         2           140
#> 5 Gas          2           130
#> 6 Other        2            67

# now we can join to get impact_factor into our df2016north tbl
# left join means we'll keep all the columns from the left one, df2016north,
# even the rows with no match in impact_df (all rows have matched in this example)
df2016north <- left_join(df2016north, impact_df, by = c("FuelType", "Month"))

# peek
df2016north
#> # A tibble: 20,886 x 7
#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel impact_factor
#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>         <int>
#>  1 2019-01-01          1 Coal         1 North              7            88
#>  2 2019-01-02          1 Coal         1 North              7            88
#>  3 2019-01-03          1 Coal         1 North             18            88
#>  4 2019-01-04          1 Coal         1 North              8            88
#>  5 2019-01-05          1 Coal         1 North             11            88
#>  6 2019-01-06          1 Coal         1 North             20            88
#>  7 2019-01-07          1 Coal         1 North             18            88
#>  8 2019-01-08          1 Coal         1 North              2            88
#>  9 2019-01-09          1 Coal         1 North              5            88
#> 10 2019-01-10          1 Coal         1 North              7            88
#> # … with 20,876 more rows

# that only showed one FuelType-Month combo
# we can randomly sample to better understand what happened
sample_n(df2016north, 20)
#> # A tibble: 20 x 7
#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel impact_factor
#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>         <int>
#>  1 2019-02-01          2 Coal         2 North              1           140
#>  2 2019-01-18          2 Coal         2 North              6           140
#>  3 2019-01-31          1 Gas          2 North              3           130
#>  4 2019-02-11          2 Other        2 North              4            67
#>  5 2019-01-07          1 Coal         2 North             13           140
#>  6 2019-01-18          1 Other        2 North             12            67
#>  7 2019-01-20          1 Other        2 North             17            67
#>  8 2019-01-10          1 Other        1 North              1           113
#>  9 2019-01-30          2 Other        2 North             14            67
#> 10 2019-01-23          2 Other        1 North              2           113
#> 11 2019-01-09          1 Gas          1 North             11            22
#> 12 2019-01-04          2 Other        1 North             13           113
#> 13 2019-01-02          2 Coal         1 North              4            88
#> 14 2019-02-24          2 Gas          1 North              9            22
#> 15 2019-01-18          1 Other        1 North              5           113
#> 16 2019-02-20          2 Coal         1 North             14            88
#> 17 2019-02-04          1 Other        2 North             15            67
#> 18 2019-02-26          2 Coal         1 North              1            88
#> 19 2019-02-09          1 Other        2 North             19            67
#> 20 2019-01-21          2 Coal         1 North             19            88

Created on 2019-10-30 by the reprex package (v0.3.0)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment