alekrutkowski · September 12, 2024 15:22
diff --git a/helper_functions.R b/helper_functions.R
 renameColumns <- function(dt, ...) {
  pairs <-  
    substitute(list(...)) %>% 
    as.list %>% 
    tail(-1) %>% 
    lapply(. %>% as.list %>% tail(-1) %>% rev) 
  from <-
    pairs %>% 
    sapply(. %>% .[[1]] %>% as.character)
  to <-
    pairs %>% 
    sapply(. %>% .[[2]] %>% as.character)
  setnames(dt, from, to)
 }

 renameColumnsWithFunction <- function(dt, FUN)
  setnames(dt,
           colnames(dt),
           colnames(dt) %>% sapply(FUN))

 addUp <- function(dt, colnames.)
  dt %>% 
  .[, paste0(colnames.,collapse="") := 
      Reduce(`+`,
             c(mget(colnames.)))] %>% 
  .[, (colnames.) := NULL]

 v <- function(...) # to avoid typing single or double quotes
  substitute(list(...)) %>% 
  as.list %>% 
  tail(-1) %>% 
  as.character
diff --git a/main.R b/main.R
 library(magrittr)
 library(data.table)
 library(openxlsx2)
 library(rvest)

 source('helper_functions.R')

 EU <-
  v(AT,BE,BG,CY,CZ,DE,DK,EE,EL,ES,FI,FR,HR,HU,IE,
    IT,LT,LU,LV,MT,NL,PL,PT,RO,SE,SI,SK,
    EU27_2020)

 # Import early estimates --------------------------------------------------

 WEB_PAGE <-
  "https://ec.europa.eu/eurostat/web/social-protection/database/early-estimates"

 URL <-
  WEB_PAGE %>% 
  read_html %>%
  html_elements("a[href$='.xlsx']") %>%  # Select all <a> tags with href ending in .xlsx
  html_attr("href") %>%  # Extract the href attribute (the link itself)
  url_absolute(WEB_PAGE) # Handle relative URLs by converting them to absolute ones if needed

 if (is.character(URL) && length(URL)==1 && URL!="")
  message('Found Excel file URL:\n',URL) else
    stop('Could not find URL of Excel file in the early estimates web page!')

 LATEST_YEAR <-
  read_xlsx(URL, sheet="MIO_NAC", rows=2, cols=6,
            col_names=FALSE) %>% 
  as.integer

 addUpColumns <- function(dt)
  dt %>% 
  addUp(v(HOUSE,EXCLU)) %>% 
  addUp(v(OLD,SURVIV)) %>% 
  addUp(v(SICK,DISA)) # %>% "other" too small to report
  # .[, OTHER :=
  #     SPBENEFNOREROUTE - HOUSEEXCLU - OLDSURVIV - SICKDISA - FAM - UNEMPLOY]

 early_estim <-
  URL %>% 
  read_xlsx(sheet="MIO_NAC", start_row=4, cols=1:10) %>% 
  set_colnames(colnames(.) %>% 
                 ifelse(is.na(.),'geo_labels',.)) %>% 
  as.data.table %>% 
  .[geo_labels != 'EU27_2020'] %>% # empty EU27
  rbind(
    URL %>%
      read_xlsx(sheet="MIO_EUR", cols=1:10, rows=4:5) %>% # not empty EU27
      set_colnames(colnames(.) %>% 
                     ifelse(is.na(.),'geo_labels',.))
  ) %>% 
  .[, geo_labels := geo_labels %>% 
      ifelse(grepl('EU27_2020',.),'European Union - 27 countries (from 2020)',.)] %>% 
  renameColumnsWithFunction(. %>% 
                              trimws %>% 
                              gsub('\n',"",.,fixed=TRUE) %>% 
                              ifelse(grepl('TOTAL',.),'SPBENEFNOREROUTE',.)) %>% 
  renameColumns("Sickness/Health care" -> SICK,
                Disability -> DISA,
                "Old age" -> OLD,
                Survivors -> SURVIV,
                "Family/Children" -> FAM,
                Unemployment -> UNEMPLOY,
                Housing -> HOUSE,
                "Social exclusion n.e.c." -> EXCLU) %>% 
  addUpColumns %>% 
  melt(id.vars='geo_labels', variable.name='spdeps',
       value.name='value_', variable.factor=FALSE) %>% 
  .[, time := LATEST_YEAR] %>% 
  merge(importLabels('geo'), by='geo_labels') %>% 
  .[, geo_labels := NULL] %>% 
  .[geo %in% EU]


 # Import time series for expenditure --------------------------------------

 expend <-
  importData('spr_exp_sum',
             list(unit='MIO_NAC',
                  spdeps=v(SPBENEFNOREROUTE, SICK, DISA,
                           OLD, SURVIV, FAM,
                           UNEMPLOY, HOUSE, EXCLU))) %>% 
  as.data.table %>% 
  .[geo != 'EU27_2020'] %>% 
  rbind(
    importData('spr_exp_sum',
               list(unit='MIO_EUR',
                    geo='EU27_2020',
                    spdeps=v(SPBENEFNOREROUTE, SICK, DISA,
                             OLD, SURVIV, FAM,
                             UNEMPLOY, HOUSE, EXCLU)))
  ) %>% 
  .[, v(flags_,freq,unit) := NULL] %>% 
  renameColumns(TIME_PERIOD -> time) %>% 
  .[, time := time %>% as.character %>% as.integer] %>% 
  .[geo %in% EU] %>% 
  dcast(geo + time ~ spdeps,
        fun.aggregate=identity,
        value.var='value_',
        fill=NA_real_) %>% 
  addUpColumns %>% 
  melt(id.vars=v(geo,time), variable.name='spdeps',
       value.name='value_', variable.factor=FALSE)


 stacked_expend <-
  rbind(expend, early_estim, fill=TRUE)


 # Import HICP -------------------------------------------------------------

 hicp_data <-
  importData('prc_hicp_aind',
             list(coicop='CP00', # All-items HICP
                  unit='INX_A_AVG' # Annual average index
             )) %>% 
  as.data.table %>% 
  .[, hicp := value_/100] %>% 
  .[, v(flags_,freq,coicop,unit,value_) := NULL] %>% 
  renameColumns(TIME_PERIOD -> time) %>% 
  .[, time := time %>% as.character %>% as.integer]


 # Real growth decomposition -----------------------------------------------

 DATA <-
  merge(stacked_expend, hicp_data,
        by=v(geo,time)) 

 decomposeGrowth <- function(dt) 
  copy(dt) %>% 
  .[, real_value := value_/hicp] %>% 
  .[, D_real_value := collapse::D(real_value, t=time)
    , by = .(geo,spdeps)] %>% 
  .[, D_real_value_TOTAL := 
      D_real_value[spdeps=='SPBENEFNOREROUTE']
    , by=.(geo,time)] %>% # across spdeps components
  .[, D_real_value_percent := 100*D_real_value/collapse::L(real_value, t=time)
    , by = .(geo,spdeps)] %>% 
  .[, D_real_value_percent_TOTAL := 
      D_real_value_percent[spdeps=='SPBENEFNOREROUTE']
    , by=.(geo,time)] %>% # across spdeps components
  .[, D_real_value_percentage_points :=
      ifelse(D_real_value_TOTAL==0, 0,
             D_real_value_percent_TOTAL * D_real_value / D_real_value_TOTAL)] %>% 
  .[, spdeps := spdeps %>% ifelse(.=='SPBENEFNOREROUTE','TOTAL',.)]

 componentsToColumns <- function(dt)
  dt %>% 
  dcast(geo + time ~ spdeps,
        fun.aggregate=identity,
        value.var='D_real_value_percentage_points',
        fill=NA_real_) %>% 
  setcolorder('TOTAL',after=ncol(.))

 EU_Agg <-
  DATA %>% 
  .[geo=='EU27_2020'] %>% 
  decomposeGrowth %>% 
  componentsToColumns

 MS_LongTerm <-
  DATA %>% 
  .[geo != 'EU27_2020'] %>% 
  .[time==LATEST_YEAR | time==LATEST_YEAR-6] %>% 
  .[, time := time %>% ifelse(.!=max(.), max(.)-1, .)] %>% 
  decomposeGrowth %>% 
  componentsToColumns %>% 
  .[time==LATEST_YEAR] %>% 
  .[, time := NULL] %>% 
  setorder(-TOTAL)

 MS_ShortTerm <-
  DATA %>% 
  .[geo != 'EU27_2020'] %>% 
  .[time==LATEST_YEAR | time==LATEST_YEAR-1] %>% 
  decomposeGrowth %>% 
  componentsToColumns %>% 
  .[time==LATEST_YEAR] %>% 
  .[, time := NULL] %>% 
  setorder(-TOTAL)

 list(EU_Agg, MS_LongTerm, MS_ShortTerm) %>% 
  set_names(c('EU_Agg',
              paste('MS',LATEST_YEAR-6,LATEST_YEAR,sep='_'),
              paste('MS',LATEST_YEAR-1,LATEST_YEAR,sep='_'))) %>% 
  write_xlsx('ESSPROS social protection expenditure decomposition with early estimates.xlsx')
	renameColumns <- function(dt, ...) {
	pairs <-
	substitute(list(...)) %>%
	as.list %>%
	tail(-1) %>%
	lapply(. %>% as.list %>% tail(-1) %>% rev)
	from <-
	pairs %>%
	sapply(. %>% .[[1]] %>% as.character)
	to <-
	pairs %>%
	sapply(. %>% .[[2]] %>% as.character)
	setnames(dt, from, to)
	}

	renameColumnsWithFunction <- function(dt, FUN)
	setnames(dt,
	colnames(dt),
	colnames(dt) %>% sapply(FUN))

	addUp <- function(dt, colnames.)
	dt %>%
	.[, paste0(colnames.,collapse="") :=
	Reduce(`+`,
	c(mget(colnames.)))] %>%
	.[, (colnames.) := NULL]

	v <- function(...) # to avoid typing single or double quotes
	substitute(list(...)) %>%
	as.list %>%
	tail(-1) %>%
	as.character
	library(magrittr)
	library(data.table)
	library(openxlsx2)
	library(rvest)

	source('helper_functions.R')

	EU <-
	v(AT,BE,BG,CY,CZ,DE,DK,EE,EL,ES,FI,FR,HR,HU,IE,
	IT,LT,LU,LV,MT,NL,PL,PT,RO,SE,SI,SK,
	EU27_2020)

	# Import early estimates --------------------------------------------------

	WEB_PAGE <-
	"https://ec.europa.eu/eurostat/web/social-protection/database/early-estimates"

	URL <-
	WEB_PAGE %>%
	read_html %>%
	html_elements("a[href$='.xlsx']") %>% # Select all <a> tags with href ending in .xlsx
	html_attr("href") %>% # Extract the href attribute (the link itself)
	url_absolute(WEB_PAGE) # Handle relative URLs by converting them to absolute ones if needed

	if (is.character(URL) && length(URL)==1 && URL!="")
	message('Found Excel file URL:\n',URL) else
	stop('Could not find URL of Excel file in the early estimates web page!')

	LATEST_YEAR <-
	read_xlsx(URL, sheet="MIO_NAC", rows=2, cols=6,
	col_names=FALSE) %>%
	as.integer

	addUpColumns <- function(dt)
	dt %>%
	addUp(v(HOUSE,EXCLU)) %>%
	addUp(v(OLD,SURVIV)) %>%
	addUp(v(SICK,DISA)) # %>% "other" too small to report
	# .[, OTHER :=
	# SPBENEFNOREROUTE - HOUSEEXCLU - OLDSURVIV - SICKDISA - FAM - UNEMPLOY]

	early_estim <-
	URL %>%
	read_xlsx(sheet="MIO_NAC", start_row=4, cols=1:10) %>%
	set_colnames(colnames(.) %>%
	ifelse(is.na(.),'geo_labels',.)) %>%
	as.data.table %>%
	.[geo_labels != 'EU27_2020'] %>% # empty EU27
	rbind(
	URL %>%
	read_xlsx(sheet="MIO_EUR", cols=1:10, rows=4:5) %>% # not empty EU27
	set_colnames(colnames(.) %>%
	ifelse(is.na(.),'geo_labels',.))
	) %>%
	.[, geo_labels := geo_labels %>%
	ifelse(grepl('EU27_2020',.),'European Union - 27 countries (from 2020)',.)] %>%
	renameColumnsWithFunction(. %>%
	trimws %>%
	gsub('\n',"",.,fixed=TRUE) %>%
	ifelse(grepl('TOTAL',.),'SPBENEFNOREROUTE',.)) %>%
	renameColumns("Sickness/Health care" -> SICK,
	Disability -> DISA,
	"Old age" -> OLD,
	Survivors -> SURVIV,
	"Family/Children" -> FAM,
	Unemployment -> UNEMPLOY,
	Housing -> HOUSE,
	"Social exclusion n.e.c." -> EXCLU) %>%
	addUpColumns %>%
	melt(id.vars='geo_labels', variable.name='spdeps',
	value.name='value_', variable.factor=FALSE) %>%
	.[, time := LATEST_YEAR] %>%
	merge(importLabels('geo'), by='geo_labels') %>%
	.[, geo_labels := NULL] %>%
	.[geo %in% EU]


	# Import time series for expenditure --------------------------------------

	expend <-
	importData('spr_exp_sum',
	list(unit='MIO_NAC',
	spdeps=v(SPBENEFNOREROUTE, SICK, DISA,
	OLD, SURVIV, FAM,
	UNEMPLOY, HOUSE, EXCLU))) %>%
	as.data.table %>%
	.[geo != 'EU27_2020'] %>%
	rbind(
	importData('spr_exp_sum',
	list(unit='MIO_EUR',
	geo='EU27_2020',
	spdeps=v(SPBENEFNOREROUTE, SICK, DISA,
	OLD, SURVIV, FAM,
	UNEMPLOY, HOUSE, EXCLU)))
	) %>%
	.[, v(flags_,freq,unit) := NULL] %>%
	renameColumns(TIME_PERIOD -> time) %>%
	.[, time := time %>% as.character %>% as.integer] %>%
	.[geo %in% EU] %>%
	dcast(geo + time ~ spdeps,
	fun.aggregate=identity,
	value.var='value_',
	fill=NA_real_) %>%
	addUpColumns %>%
	melt(id.vars=v(geo,time), variable.name='spdeps',
	value.name='value_', variable.factor=FALSE)


	stacked_expend <-
	rbind(expend, early_estim, fill=TRUE)


	# Import HICP -------------------------------------------------------------

	hicp_data <-
	importData('prc_hicp_aind',
	list(coicop='CP00', # All-items HICP
	unit='INX_A_AVG' # Annual average index
	)) %>%
	as.data.table %>%
	.[, hicp := value_/100] %>%
	.[, v(flags_,freq,coicop,unit,value_) := NULL] %>%
	renameColumns(TIME_PERIOD -> time) %>%
	.[, time := time %>% as.character %>% as.integer]


	# Real growth decomposition -----------------------------------------------

	DATA <-
	merge(stacked_expend, hicp_data,
	by=v(geo,time))

	decomposeGrowth <- function(dt)
	copy(dt) %>%
	.[, real_value := value_/hicp] %>%
	.[, D_real_value := collapse::D(real_value, t=time)
	, by = .(geo,spdeps)] %>%
	.[, D_real_value_TOTAL :=
	D_real_value[spdeps=='SPBENEFNOREROUTE']
	, by=.(geo,time)] %>% # across spdeps components
	.[, D_real_value_percent := 100*D_real_value/collapse::L(real_value, t=time)
	, by = .(geo,spdeps)] %>%
	.[, D_real_value_percent_TOTAL :=
	D_real_value_percent[spdeps=='SPBENEFNOREROUTE']
	, by=.(geo,time)] %>% # across spdeps components
	.[, D_real_value_percentage_points :=
	ifelse(D_real_value_TOTAL==0, 0,
	D_real_value_percent_TOTAL * D_real_value / D_real_value_TOTAL)] %>%
	.[, spdeps := spdeps %>% ifelse(.=='SPBENEFNOREROUTE','TOTAL',.)]

	componentsToColumns <- function(dt)
	dt %>%
	dcast(geo + time ~ spdeps,
	fun.aggregate=identity,
	value.var='D_real_value_percentage_points',
	fill=NA_real_) %>%
	setcolorder('TOTAL',after=ncol(.))

	EU_Agg <-
	DATA %>%
	.[geo=='EU27_2020'] %>%
	decomposeGrowth %>%
	componentsToColumns

	MS_LongTerm <-
	DATA %>%
	.[geo != 'EU27_2020'] %>%
	.[time==LATEST_YEAR \| time==LATEST_YEAR-6] %>%
	.[, time := time %>% ifelse(.!=max(.), max(.)-1, .)] %>%
	decomposeGrowth %>%
	componentsToColumns %>%
	.[time==LATEST_YEAR] %>%
	.[, time := NULL] %>%
	setorder(-TOTAL)

	MS_ShortTerm <-
	DATA %>%
	.[geo != 'EU27_2020'] %>%
	.[time==LATEST_YEAR \| time==LATEST_YEAR-1] %>%
	decomposeGrowth %>%
	componentsToColumns %>%
	.[time==LATEST_YEAR] %>%
	.[, time := NULL] %>%
	setorder(-TOTAL)

	list(EU_Agg, MS_LongTerm, MS_ShortTerm) %>%
	set_names(c('EU_Agg',
	paste('MS',LATEST_YEAR-6,LATEST_YEAR,sep='_'),
	paste('MS',LATEST_YEAR-1,LATEST_YEAR,sep='_'))) %>%
	write_xlsx('ESSPROS social protection expenditure decomposition with early estimates.xlsx')