jrosell · July 26, 2024 10:38 · jrosell · Jul 26, 2024 · jrosell · Jul 26, 2024
diff --git a/plot_total_ejecutado_banco-datos-2015-2022_pdfs-2022-2023 b/plot_total_ejecutado_banco-datos-2015-2022_pdfs-2022-2023
 # NOTE: It's not working on Posistron because of rJava, it works on RStudio.

 if(!"rlang" %in% installed.packages()){
    if(!interactive()) { stop("The package \"rlang\" is required.") }
    cat("The package \"rlang\" is required.\n✖ Would you like to install it?\n\n1: Yes\n2: No\n\nSelection:")  
    if (readLines(n = 1) == "1"){
        install.packages("rlang")
    }  
 }
 rlang::check_installed(
    "jrrosell (>= 0.0.0.9006)",
    action = \(pkg,...) {
        if("jrrosell" %in% installed.packages()){ remove.packages("jrrosell") }
        pak::pak("jrosell/jrrosell")
    }
 )
 rlang::check_installed("tidyverse")
 rlang::check_installed("rJava") 
 jrrosell::check_installed_gihub("cynkra/constructive")
 jrrosell::check_installed_gihub("ropensci/tabulapdf")

 # $ sudo apt-get install -y default-jre default-jdk
 # $ sudo R CMD javareconf
 # > install.packages("rJava")

 library(tidyverse)
 library(magick)
 library(tesseract)
 invisible(!dir.exists("data") && dir.create("data"))
 jrrosell::theme_set_roboto_darkblue()

 library(tabulapdf)

 # extracted_areas <- "~/2022.pdf" |> 
 #     locate_areas(
 #         pages = c(10,36,64,86),
 #         copy = TRUE
 #     )
 # constructive::construct(extracted_areas)
 extracted_areas <- list(
    c(
        top = 201.27430329932372,
        left = 42.011084579588,
        bottom = 479.40651566442375,
        right = 586.3601466864
    ),
    c(
        top = 218.25947657352378,
        left = 42.011084579588,
        bottom = 496.39168893862376,
        right = 579.98105611484
    ),
    c(
        top = 199.15115664004372,
        left = 39.884721055733,
        bottom = 470,
        right = 575.72832906713
    ),
    c(
        top = 203.3974499585937,
        left = 44.137448103443,
        bottom = 479.40651566442375,
        right = 567.22287497171
    )
 )

 lists_2022 <- "~/2022.pdf" |> 
    extract_tables(
        pages = c(10,36,64,86),
        area = extracted_areas,
        guess = FALSE,
        col_names = c("comunidad", "inicial", "value", "percent")
    )
 lists_2022[[1]]$name <- "Oblig. Reconocidas de la AGE"
 lists_2022[[2]]$name <- "Oblig. Reconocidas de ORGANISMOS"
 lists_2022[[3]]$name <- "Inversión real de ESTIMATIVOS"
 lists_2022[[4]]$name <- "Inversión real de EMPRESAS"
 df_2022 <- lists_2022 |>
    list_rbind() |> 
    mutate(ano = 2022)

 lists_2023 <- "~/2023.pdf" |> 
    extract_tables(
        pages = c(10,36,64,86),
        area = extracted_areas,
        guess = FALSE,
        col_names = c("comunidad", "inicial", "value", "percent")
    )
 lists_2023[[1]]$name <- "Oblig. Reconocidas de la AGE"
 lists_2023[[2]]$name <- "Oblig. Reconocidas de ORGANISMOS"
 lists_2023[[3]]$name <- "Inversión real de ESTIMATIVOS"
 lists_2023[[4]]$name <- "Inversión real de EMPRESAS"
 df_2023 <- lists_2023 |>
    list_rbind() |> 
    mutate(ano = 2023)

 locale_es <- readr::locale(
    encoding = "latin1",
    decimal_mark = ",",
    grouping_mark = "."
 )
 df <- bind_rows(df_2022, df_2023) |> 
    mutate(value = parse_number(value, locale = locale_es)) |> 
    select(comunidad, ano, name, value) |> 
    print(n = Inf)

 # Dades 2015-2021
 # Baixar manualment o executar la seguent funció
 download_banco_datos <- \(x) {
    rlang::check_installed("selenider")
    library(selenider)
    sess <- selenider_session(
        "chromote",
        timeout = 10,
        options = chromote_options(headless = FALSE)
    )
    sess$driver$Browser$setDownloadBehavior(
        behavior = "allow",
        downloadPath = "data"
    )
    open_url("http://buscadorcdi.gob.es/Cifra/es/buscador/resultados/Ultimos-Distribucion-Territorial")
    try({
        elem_click(s(".introjs-skipbutton"), timeout = 10)    
    })
    try({
        elem_click(s(".guardarBusqueda li:nth-child(3) img"), timeout = 20)
    })
    Sys.sleep(10)
    selenider::close_session()
 }

 comunidades <- c(
    "PAÍS VASCO",
    "CATALUÑA",
    "GALICIA",
    "ANDALUCÍA",
    "PRINCIPADO DE ASTURIAS",
    "CANTABRIA",
    "LA RIOJA",
    "REGION DE MURCIA",
    "COMUNITAT VALENCIANA",
    "COMUNITAD VALENCIANA",
    "ARAGÓN",
    "CASTILLA-LA MANCHA",
    "CANARIAS",
    "COMUNIDAD FORAL DE NAVARRA",
    "EXTREMADURA",
    "ILLES BALEARS",
    "COMUNIDAD DE MADRID",
    "CASTILLA Y LEÓN",
    "CASTILLA Y LEON",
    "CEUTA",
    "MELILLA"
 )

 dat <- list.files("data") |> 
    {\(x) paste0("data/", x[str_detect(x, "Informe_Cifra")]) }() |> 
    read_csv2(
        show_col_types = FALSE,
        locale = readr::locale(
            encoding = "latin1",
            decimal_mark = ",",
            grouping_mark = "."
        ),
        skip = 1,
        col_names = c(
            "cod_administracion",                "administracion",
            "jerarquia_administracion",          "submateria",
            "materia",                           "cod_variable",
            "variable",                          "nombre_corto_variable",
            "unidad",                            "escala",
            "ano",                               "trimestre",
            "mes",                               "observaciones",                    
            "valor",                             "variable_referencia",              
            "base_submateria_referencia",        "unidad_variable_referencia",       
            "escala_variable_referencia",        "valor_variable_referencia",        
            "ano_variable_referencia",           "trimestre_variable_referencia",    
            "mes_variable_referencia",           "observaciones_variable_referencia"
        ),
        col_select = 1:24
    ) |>     
    janitor::clean_names() |> 
    separate_wider_position(
        cod_variable, c(cod_comunidad = 2), too_many = "debug"
    ) |> 
    select(-cod_comunidad, -cod_variable, -cod_variable_width, -cod_variable_ok) |> 
    rename(cod_variable = cod_variable_remainder) |> 
    filter(cod_variable %in% c("AGE_OR","OOAA_OR","ESTI_IR","EMP_IR")) |> 
    filter(str_detect(variable, paste(comunidades, collapse = "|"))) |> 
    separate_wider_delim(
        variable, delim = ":", names = c("comunidad", "variable_resto")
    ) |> 
    select(-variable_resto) |> 
    mutate(
        name = nombre_corto_variable |> 
            str_replace(paste0(" en ", comunidad), "")  |> 
            str_replace(" en MURCIA", "") |> 
            str_replace(" en ASTURIAS", "") |> 
            str_replace(" en C. VALENCIANA", "") |> 
            str_replace(" en NAVARRA", "") |> 
            str_replace(" en C.LA MANCHA", "") |> 
            str_replace(" en MADRID", "")
    ) |> 
    filter(ano >= 2015) |>     
    arrange(comunidad, ano) |> 
    rename(value = valor) |> 
    select(comunidad, ano, value, name) 

 df_ccaa <- bind_rows(dat, df) |> 
    mutate(comunidad = comunidad |> 
               str_replace("CASTILLA Y LEON", "CASTILLA Y LEÓN") |> 
               str_replace("REGION DE MURCIA", "REGIÓN DE MURCIA") |> 
               str_replace("PRINCIPADO DE ASTURIAS", "P. ASTURIAS") |> 
               str_replace("COMUNIDAD DE MADRID", "C. MADRID") |> 
               str_replace("COMUNIDAD FORAL DE NAVARRA", "C.F. NAVARRA") |> 
               str_replace("COMUNITAT VALENCIANA", "C. VALENCIANA")
    )

 df_ccaa_all <- df_ccaa |> 
    group_by(comunidad, ano) |> 
    mutate(total = sum(value)) |>
    ungroup() |> 
    pivot_longer(c(value,total), names_to = "name1") |>
    mutate(
        name = if_else(
            name1 != "total", name, "Total ejecutado"
        )
    )|> 
    select(-name1) |> 
    unique() |> 
    print(n = Inf)



 p1 <- df_ccaa_all |> 
    ggplot(aes(ano, value, colour = name)) +
    geom_line(data = df_ccaa_all |> filter(name != "Total ejecutado"), size = 0.3) +
    geom_line(data = df_ccaa_all |> filter(name == "Total ejecutado"), size = 0.9) +
    facet_wrap("comunidad") +
    scale_x_continuous(breaks = 2015:2023, guide = guide_axis(angle = 90)) +
    scale_y_continuous(
        labels = scales::unit_format(unit = "M €", scale = 1e-6, sep = "")
    ) +
    labs(
        subtitle = "Distribución territorial de la inversión del Sector Público Estatal",
        caption = "Fuente: Ultimos-Distribucion-Territorial en buscadorcdi.gob.es y PDF ejercicios 2022-2023 hasta 31 diciembre | Autor: @jrosell",
        y = "", x = "", colour = ""
    ) +
    theme(
        legend.position = "bottom", 
        panel.spacing.y = unit(2, "lines")
    )
 print(p1)

 df_cat_mad <- df_ccaa |>
    filter(comunidad %in% c("C. MADRID", "CATALUÑA")) |> 
    group_by(comunidad, name) |> 
    summarize(value = sum(value)) |> 
    ungroup() |> 
    mutate(ano = 2024, .after = 1)

 df_cat_mad_all <- df_ccaa |> 
    bind_rows(df_cat_mad) |> 
    group_by(comunidad, ano) |> 
    mutate(total = sum(value)) |>
    ungroup() |> 
    pivot_longer(c(value,total), names_to = "name1") |>
    mutate(
        name = if_else(
            name1 != "total", name, "Total ejecutado"
        )
    )|> 
    select(-name1) |> 
    unique() |> 
    mutate(ano_label = factor(if_else(ano == 2024, "Total", as.character(ano)), c("Total", as.character(rev(c(2015:2023)))))) |> 
    filter(comunidad %in% c("C. MADRID", "CATALUÑA"), name == "Total ejecutado") |>
    mutate(comunidad = factor(comunidad, c( "CATALUÑA", "C. MADRID"))) |>
    print(n = Inf)

 p2 <- df_ccaa |> 
    bind_rows(df_cat_mad) |> 
    mutate(ano_label = factor(if_else(ano == 2024, "Total", as.character(ano)), c("Total", as.character(rev(c(2015:2023)))))) |> 
    filter(comunidad %in% c("C. MADRID", "CATALUÑA")) |>
    mutate(comunidad = factor(comunidad, c( "CATALUÑA", "C. MADRID"))) |>
    ggplot(aes(ano_label, value)) +
    geom_col(aes(fill = name)) +
    geom_text(
        nudge_y = 500000000,
        aes(label = scales::number(value, suffix = "M €", scale = 1e-6, sep = "", accuracy = 1, big.mark = "")),
        data = df_cat_mad_all
    ) +
    scale_y_continuous(
        labels = scales::unit_format(unit = "M €", scale = 1e-6, sep = ""),
        n.breaks = 15,
        guide = guide_axis(angle = 90)
    ) +
    facet_wrap("comunidad", ncol = 1) +
    coord_flip() +
    labs(
        title = "Distribución territorial de la inversión del Sector Público Estatal en Cataluña y Comunidad de Madrid",
        caption = "Fuente: Ultimos-Distribucion-Territorial en buscadorcdi.gob.es y PDF ejercicios 2022-2023 hasta 31 diciembre | Autor: @jrosell",
        y = "", x = "", fill = ""
    ) +
    theme(
        legend.position = "top", 
        panel.spacing.y = unit(2, "lines")
    )

 print(p2)
	# NOTE: It's not working on Posistron because of rJava, it works on RStudio.

	if(!"rlang" %in% installed.packages()){
	if(!interactive()) { stop("The package \"rlang\" is required.") }
	cat("The package \"rlang\" is required.\n✖ Would you like to install it?\n\n1: Yes\n2: No\n\nSelection:")
	if (readLines(n = 1) == "1"){
	install.packages("rlang")
	}
	}
	rlang::check_installed(
	"jrrosell (>= 0.0.0.9006)",
	action = \(pkg,...) {
	if("jrrosell" %in% installed.packages()){ remove.packages("jrrosell") }
	pak::pak("jrosell/jrrosell")
	}
	)
	rlang::check_installed("tidyverse")
	rlang::check_installed("rJava")
	jrrosell::check_installed_gihub("cynkra/constructive")
	jrrosell::check_installed_gihub("ropensci/tabulapdf")

	# $ sudo apt-get install -y default-jre default-jdk
	# $ sudo R CMD javareconf
	# > install.packages("rJava")

	library(tidyverse)
	library(magick)
	library(tesseract)
	invisible(!dir.exists("data") && dir.create("data"))
	jrrosell::theme_set_roboto_darkblue()

	library(tabulapdf)

	# extracted_areas <- "~/2022.pdf" \|>
	# locate_areas(
	# pages = c(10,36,64,86),
	# copy = TRUE
	# )
	# constructive::construct(extracted_areas)
	extracted_areas <- list(
	c(
	top = 201.27430329932372,
	left = 42.011084579588,
	bottom = 479.40651566442375,
	right = 586.3601466864
	),
	c(
	top = 218.25947657352378,
	left = 42.011084579588,
	bottom = 496.39168893862376,
	right = 579.98105611484
	),
	c(
	top = 199.15115664004372,
	left = 39.884721055733,
	bottom = 470,
	right = 575.72832906713
	),
	c(
	top = 203.3974499585937,
	left = 44.137448103443,
	bottom = 479.40651566442375,
	right = 567.22287497171
	)
	)

	lists_2022 <- "~/2022.pdf" \|>
	extract_tables(
	pages = c(10,36,64,86),
	area = extracted_areas,
	guess = FALSE,
	col_names = c("comunidad", "inicial", "value", "percent")
	)
	lists_2022[[1]]$name <- "Oblig. Reconocidas de la AGE"
	lists_2022[[2]]$name <- "Oblig. Reconocidas de ORGANISMOS"
	lists_2022[[3]]$name <- "Inversión real de ESTIMATIVOS"
	lists_2022[[4]]$name <- "Inversión real de EMPRESAS"
	df_2022 <- lists_2022 \|>
	list_rbind() \|>
	mutate(ano = 2022)

	lists_2023 <- "~/2023.pdf" \|>
	extract_tables(
	pages = c(10,36,64,86),
	area = extracted_areas,
	guess = FALSE,
	col_names = c("comunidad", "inicial", "value", "percent")
	)
	lists_2023[[1]]$name <- "Oblig. Reconocidas de la AGE"
	lists_2023[[2]]$name <- "Oblig. Reconocidas de ORGANISMOS"
	lists_2023[[3]]$name <- "Inversión real de ESTIMATIVOS"
	lists_2023[[4]]$name <- "Inversión real de EMPRESAS"
	df_2023 <- lists_2023 \|>
	list_rbind() \|>
	mutate(ano = 2023)

	locale_es <- readr::locale(
	encoding = "latin1",
	decimal_mark = ",",
	grouping_mark = "."
	)
	df <- bind_rows(df_2022, df_2023) \|>
	mutate(value = parse_number(value, locale = locale_es)) \|>
	select(comunidad, ano, name, value) \|>
	print(n = Inf)

	# Dades 2015-2021
	# Baixar manualment o executar la seguent funció
	download_banco_datos <- \(x) {
	rlang::check_installed("selenider")
	library(selenider)
	sess <- selenider_session(
	"chromote",
	timeout = 10,
	options = chromote_options(headless = FALSE)
	)
	sess$driver$Browser$setDownloadBehavior(
	behavior = "allow",
	downloadPath = "data"
	)
	open_url("http://buscadorcdi.gob.es/Cifra/es/buscador/resultados/Ultimos-Distribucion-Territorial")
	try({
	elem_click(s(".introjs-skipbutton"), timeout = 10)
	})
	try({
	elem_click(s(".guardarBusqueda li:nth-child(3) img"), timeout = 20)
	})
	Sys.sleep(10)
	selenider::close_session()
	}

	comunidades <- c(
	"PAÍS VASCO",
	"CATALUÑA",
	"GALICIA",
	"ANDALUCÍA",
	"PRINCIPADO DE ASTURIAS",
	"CANTABRIA",
	"LA RIOJA",
	"REGION DE MURCIA",
	"COMUNITAT VALENCIANA",
	"COMUNITAD VALENCIANA",
	"ARAGÓN",
	"CASTILLA-LA MANCHA",
	"CANARIAS",
	"COMUNIDAD FORAL DE NAVARRA",
	"EXTREMADURA",
	"ILLES BALEARS",
	"COMUNIDAD DE MADRID",
	"CASTILLA Y LEÓN",
	"CASTILLA Y LEON",
	"CEUTA",
	"MELILLA"
	)

	dat <- list.files("data") \|>
	{\(x) paste0("data/", x[str_detect(x, "Informe_Cifra")]) }() \|>
	read_csv2(
	show_col_types = FALSE,
	locale = readr::locale(
	encoding = "latin1",
	decimal_mark = ",",
	grouping_mark = "."
	),
	skip = 1,
	col_names = c(
	"cod_administracion", "administracion",
	"jerarquia_administracion", "submateria",
	"materia", "cod_variable",
	"variable", "nombre_corto_variable",
	"unidad", "escala",
	"ano", "trimestre",
	"mes", "observaciones",
	"valor", "variable_referencia",
	"base_submateria_referencia", "unidad_variable_referencia",
	"escala_variable_referencia", "valor_variable_referencia",
	"ano_variable_referencia", "trimestre_variable_referencia",
	"mes_variable_referencia", "observaciones_variable_referencia"
	),
	col_select = 1:24
	) \|>
	janitor::clean_names() \|>
	separate_wider_position(
	cod_variable, c(cod_comunidad = 2), too_many = "debug"
	) \|>
	select(-cod_comunidad, -cod_variable, -cod_variable_width, -cod_variable_ok) \|>
	rename(cod_variable = cod_variable_remainder) \|>
	filter(cod_variable %in% c("AGE_OR","OOAA_OR","ESTI_IR","EMP_IR")) \|>
	filter(str_detect(variable, paste(comunidades, collapse = "\|"))) \|>
	separate_wider_delim(
	variable, delim = ":", names = c("comunidad", "variable_resto")
	) \|>
	select(-variable_resto) \|>
	mutate(
	name = nombre_corto_variable \|>
	str_replace(paste0(" en ", comunidad), "") \|>
	str_replace(" en MURCIA", "") \|>
	str_replace(" en ASTURIAS", "") \|>
	str_replace(" en C. VALENCIANA", "") \|>
	str_replace(" en NAVARRA", "") \|>
	str_replace(" en C.LA MANCHA", "") \|>
	str_replace(" en MADRID", "")
	) \|>
	filter(ano >= 2015) \|>
	arrange(comunidad, ano) \|>
	rename(value = valor) \|>
	select(comunidad, ano, value, name)

	df_ccaa <- bind_rows(dat, df) \|>
	mutate(comunidad = comunidad \|>
	str_replace("CASTILLA Y LEON", "CASTILLA Y LEÓN") \|>
	str_replace("REGION DE MURCIA", "REGIÓN DE MURCIA") \|>
	str_replace("PRINCIPADO DE ASTURIAS", "P. ASTURIAS") \|>
	str_replace("COMUNIDAD DE MADRID", "C. MADRID") \|>
	str_replace("COMUNIDAD FORAL DE NAVARRA", "C.F. NAVARRA") \|>
	str_replace("COMUNITAT VALENCIANA", "C. VALENCIANA")
	)

	df_ccaa_all <- df_ccaa \|>
	group_by(comunidad, ano) \|>
	mutate(total = sum(value)) \|>
	ungroup() \|>
	pivot_longer(c(value,total), names_to = "name1") \|>
	mutate(
	name = if_else(
	name1 != "total", name, "Total ejecutado"
	)
	)\|>
	select(-name1) \|>
	unique() \|>
	print(n = Inf)



	p1 <- df_ccaa_all \|>
	ggplot(aes(ano, value, colour = name)) +
	geom_line(data = df_ccaa_all \|> filter(name != "Total ejecutado"), size = 0.3) +
	geom_line(data = df_ccaa_all \|> filter(name == "Total ejecutado"), size = 0.9) +
	facet_wrap("comunidad") +
	scale_x_continuous(breaks = 2015:2023, guide = guide_axis(angle = 90)) +
	scale_y_continuous(
	labels = scales::unit_format(unit = "M €", scale = 1e-6, sep = "")
	) +
	labs(
	subtitle = "Distribución territorial de la inversión del Sector Público Estatal",
	caption = "Fuente: Ultimos-Distribucion-Territorial en buscadorcdi.gob.es y PDF ejercicios 2022-2023 hasta 31 diciembre \| Autor: @jrosell",
	y = "", x = "", colour = ""
	) +
	theme(
	legend.position = "bottom",
	panel.spacing.y = unit(2, "lines")
	)
	print(p1)

	df_cat_mad <- df_ccaa \|>
	filter(comunidad %in% c("C. MADRID", "CATALUÑA")) \|>
	group_by(comunidad, name) \|>
	summarize(value = sum(value)) \|>
	ungroup() \|>
	mutate(ano = 2024, .after = 1)

	df_cat_mad_all <- df_ccaa \|>
	bind_rows(df_cat_mad) \|>
	group_by(comunidad, ano) \|>
	mutate(total = sum(value)) \|>
	ungroup() \|>
	pivot_longer(c(value,total), names_to = "name1") \|>
	mutate(
	name = if_else(
	name1 != "total", name, "Total ejecutado"
	)
	)\|>
	select(-name1) \|>
	unique() \|>
	mutate(ano_label = factor(if_else(ano == 2024, "Total", as.character(ano)), c("Total", as.character(rev(c(2015:2023)))))) \|>
	filter(comunidad %in% c("C. MADRID", "CATALUÑA"), name == "Total ejecutado") \|>
	mutate(comunidad = factor(comunidad, c( "CATALUÑA", "C. MADRID"))) \|>
	print(n = Inf)

	p2 <- df_ccaa \|>
	bind_rows(df_cat_mad) \|>
	mutate(ano_label = factor(if_else(ano == 2024, "Total", as.character(ano)), c("Total", as.character(rev(c(2015:2023)))))) \|>
	filter(comunidad %in% c("C. MADRID", "CATALUÑA")) \|>
	mutate(comunidad = factor(comunidad, c( "CATALUÑA", "C. MADRID"))) \|>
	ggplot(aes(ano_label, value)) +
	geom_col(aes(fill = name)) +
	geom_text(
	nudge_y = 500000000,
	aes(label = scales::number(value, suffix = "M €", scale = 1e-6, sep = "", accuracy = 1, big.mark = "")),
	data = df_cat_mad_all
	) +
	scale_y_continuous(
	labels = scales::unit_format(unit = "M €", scale = 1e-6, sep = ""),
	n.breaks = 15,
	guide = guide_axis(angle = 90)
	) +
	facet_wrap("comunidad", ncol = 1) +
	coord_flip() +
	labs(
	title = "Distribución territorial de la inversión del Sector Público Estatal en Cataluña y Comunidad de Madrid",
	caption = "Fuente: Ultimos-Distribucion-Territorial en buscadorcdi.gob.es y PDF ejercicios 2022-2023 hasta 31 diciembre \| Autor: @jrosell",
	y = "", x = "", fill = ""
	) +
	theme(
	legend.position = "top",
	panel.spacing.y = unit(2, "lines")
	)

	print(p2)