Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jrosell/09f460f58015f25b0d36300a3851830b to your computer and use it in GitHub Desktop.
Save jrosell/09f460f58015f25b0d36300a3851830b to your computer and use it in GitHub Desktop.
# NOTE: It's not working on Posistron because of rJava, it works on RStudio.
if(!"rlang" %in% installed.packages()){
if(!interactive()) { stop("The package \"rlang\" is required.") }
cat("The package \"rlang\" is required.\n✖ Would you like to install it?\n\n1: Yes\n2: No\n\nSelection:")
if (readLines(n = 1) == "1"){
install.packages("rlang")
}
}
rlang::check_installed(
"jrrosell (>= 0.0.0.9006)",
action = \(pkg,...) {
if("jrrosell" %in% installed.packages()){ remove.packages("jrrosell") }
pak::pak("jrosell/jrrosell")
}
)
rlang::check_installed("tidyverse")
rlang::check_installed("rJava")
jrrosell::check_installed_gihub("cynkra/constructive")
jrrosell::check_installed_gihub("ropensci/tabulapdf")
# $ sudo apt-get install -y default-jre default-jdk
# $ sudo R CMD javareconf
# > install.packages("rJava")
library(tidyverse)
library(magick)
library(tesseract)
invisible(!dir.exists("data") && dir.create("data"))
jrrosell::theme_set_roboto_darkblue()
library(tabulapdf)
# extracted_areas <- "~/2022.pdf" |>
# locate_areas(
# pages = c(10,36,64,86),
# copy = TRUE
# )
# constructive::construct(extracted_areas)
extracted_areas <- list(
c(
top = 201.27430329932372,
left = 42.011084579588,
bottom = 479.40651566442375,
right = 586.3601466864
),
c(
top = 218.25947657352378,
left = 42.011084579588,
bottom = 496.39168893862376,
right = 579.98105611484
),
c(
top = 199.15115664004372,
left = 39.884721055733,
bottom = 470,
right = 575.72832906713
),
c(
top = 203.3974499585937,
left = 44.137448103443,
bottom = 479.40651566442375,
right = 567.22287497171
)
)
lists_2022 <- "~/2022.pdf" |>
extract_tables(
pages = c(10,36,64,86),
area = extracted_areas,
guess = FALSE,
col_names = c("comunidad", "inicial", "value", "percent")
)
lists_2022[[1]]$name <- "Oblig. Reconocidas de la AGE"
lists_2022[[2]]$name <- "Oblig. Reconocidas de ORGANISMOS"
lists_2022[[3]]$name <- "Inversión real de ESTIMATIVOS"
lists_2022[[4]]$name <- "Inversión real de EMPRESAS"
df_2022 <- lists_2022 |>
list_rbind() |>
mutate(ano = 2022)
lists_2023 <- "~/2023.pdf" |>
extract_tables(
pages = c(10,36,64,86),
area = extracted_areas,
guess = FALSE,
col_names = c("comunidad", "inicial", "value", "percent")
)
lists_2023[[1]]$name <- "Oblig. Reconocidas de la AGE"
lists_2023[[2]]$name <- "Oblig. Reconocidas de ORGANISMOS"
lists_2023[[3]]$name <- "Inversión real de ESTIMATIVOS"
lists_2023[[4]]$name <- "Inversión real de EMPRESAS"
df_2023 <- lists_2023 |>
list_rbind() |>
mutate(ano = 2023)
locale_es <- readr::locale(
encoding = "latin1",
decimal_mark = ",",
grouping_mark = "."
)
df <- bind_rows(df_2022, df_2023) |>
mutate(value = parse_number(value, locale = locale_es)) |>
select(comunidad, ano, name, value) |>
print(n = Inf)
# Dades 2015-2021
# Baixar manualment o executar la seguent funció
download_banco_datos <- \(x) {
rlang::check_installed("selenider")
library(selenider)
sess <- selenider_session(
"chromote",
timeout = 10,
options = chromote_options(headless = FALSE)
)
sess$driver$Browser$setDownloadBehavior(
behavior = "allow",
downloadPath = "data"
)
open_url("http://buscadorcdi.gob.es/Cifra/es/buscador/resultados/Ultimos-Distribucion-Territorial")
try({
elem_click(s(".introjs-skipbutton"), timeout = 10)
})
try({
elem_click(s(".guardarBusqueda li:nth-child(3) img"), timeout = 20)
})
Sys.sleep(10)
selenider::close_session()
}
comunidades <- c(
"PAÍS VASCO",
"CATALUÑA",
"GALICIA",
"ANDALUCÍA",
"PRINCIPADO DE ASTURIAS",
"CANTABRIA",
"LA RIOJA",
"REGION DE MURCIA",
"COMUNITAT VALENCIANA",
"COMUNITAD VALENCIANA",
"ARAGÓN",
"CASTILLA-LA MANCHA",
"CANARIAS",
"COMUNIDAD FORAL DE NAVARRA",
"EXTREMADURA",
"ILLES BALEARS",
"COMUNIDAD DE MADRID",
"CASTILLA Y LEÓN",
"CASTILLA Y LEON",
"CEUTA",
"MELILLA"
)
dat <- list.files("data") |>
{\(x) paste0("data/", x[str_detect(x, "Informe_Cifra")]) }() |>
read_csv2(
show_col_types = FALSE,
locale = readr::locale(
encoding = "latin1",
decimal_mark = ",",
grouping_mark = "."
),
skip = 1,
col_names = c(
"cod_administracion", "administracion",
"jerarquia_administracion", "submateria",
"materia", "cod_variable",
"variable", "nombre_corto_variable",
"unidad", "escala",
"ano", "trimestre",
"mes", "observaciones",
"valor", "variable_referencia",
"base_submateria_referencia", "unidad_variable_referencia",
"escala_variable_referencia", "valor_variable_referencia",
"ano_variable_referencia", "trimestre_variable_referencia",
"mes_variable_referencia", "observaciones_variable_referencia"
),
col_select = 1:24
) |>
janitor::clean_names() |>
separate_wider_position(
cod_variable, c(cod_comunidad = 2), too_many = "debug"
) |>
select(-cod_comunidad, -cod_variable, -cod_variable_width, -cod_variable_ok) |>
rename(cod_variable = cod_variable_remainder) |>
filter(cod_variable %in% c("AGE_OR","OOAA_OR","ESTI_IR","EMP_IR")) |>
filter(str_detect(variable, paste(comunidades, collapse = "|"))) |>
separate_wider_delim(
variable, delim = ":", names = c("comunidad", "variable_resto")
) |>
select(-variable_resto) |>
mutate(
name = nombre_corto_variable |>
str_replace(paste0(" en ", comunidad), "") |>
str_replace(" en MURCIA", "") |>
str_replace(" en ASTURIAS", "") |>
str_replace(" en C. VALENCIANA", "") |>
str_replace(" en NAVARRA", "") |>
str_replace(" en C.LA MANCHA", "") |>
str_replace(" en MADRID", "")
) |>
filter(ano >= 2015) |>
arrange(comunidad, ano) |>
rename(value = valor) |>
select(comunidad, ano, value, name)
df_ccaa <- bind_rows(dat, df) |>
mutate(comunidad = comunidad |>
str_replace("CASTILLA Y LEON", "CASTILLA Y LEÓN") |>
str_replace("REGION DE MURCIA", "REGIÓN DE MURCIA") |>
str_replace("PRINCIPADO DE ASTURIAS", "P. ASTURIAS") |>
str_replace("COMUNIDAD DE MADRID", "C. MADRID") |>
str_replace("COMUNIDAD FORAL DE NAVARRA", "C.F. NAVARRA") |>
str_replace("COMUNITAT VALENCIANA", "C. VALENCIANA")
)
df_ccaa_all <- df_ccaa |>
group_by(comunidad, ano) |>
mutate(total = sum(value)) |>
ungroup() |>
pivot_longer(c(value,total), names_to = "name1") |>
mutate(
name = if_else(
name1 != "total", name, "Total ejecutado"
)
)|>
select(-name1) |>
unique() |>
print(n = Inf)
p1 <- df_ccaa_all |>
ggplot(aes(ano, value, colour = name)) +
geom_line(data = df_ccaa_all |> filter(name != "Total ejecutado"), size = 0.3) +
geom_line(data = df_ccaa_all |> filter(name == "Total ejecutado"), size = 0.9) +
facet_wrap("comunidad") +
scale_x_continuous(breaks = 2015:2023, guide = guide_axis(angle = 90)) +
scale_y_continuous(
labels = scales::unit_format(unit = "M €", scale = 1e-6, sep = "")
) +
labs(
subtitle = "Distribución territorial de la inversión del Sector Público Estatal",
caption = "Fuente: Ultimos-Distribucion-Territorial en buscadorcdi.gob.es y PDF ejercicios 2022-2023 hasta 31 diciembre | Autor: @jrosell",
y = "", x = "", colour = ""
) +
theme(
legend.position = "bottom",
panel.spacing.y = unit(2, "lines")
)
print(p1)
df_cat_mad <- df_ccaa |>
filter(comunidad %in% c("C. MADRID", "CATALUÑA")) |>
group_by(comunidad, name) |>
summarize(value = sum(value)) |>
ungroup() |>
mutate(ano = 2024, .after = 1)
df_cat_mad_all <- df_ccaa |>
bind_rows(df_cat_mad) |>
group_by(comunidad, ano) |>
mutate(total = sum(value)) |>
ungroup() |>
pivot_longer(c(value,total), names_to = "name1") |>
mutate(
name = if_else(
name1 != "total", name, "Total ejecutado"
)
)|>
select(-name1) |>
unique() |>
mutate(ano_label = factor(if_else(ano == 2024, "Total", as.character(ano)), c("Total", as.character(rev(c(2015:2023)))))) |>
filter(comunidad %in% c("C. MADRID", "CATALUÑA"), name == "Total ejecutado") |>
mutate(comunidad = factor(comunidad, c( "CATALUÑA", "C. MADRID"))) |>
print(n = Inf)
p2 <- df_ccaa |>
bind_rows(df_cat_mad) |>
mutate(ano_label = factor(if_else(ano == 2024, "Total", as.character(ano)), c("Total", as.character(rev(c(2015:2023)))))) |>
filter(comunidad %in% c("C. MADRID", "CATALUÑA")) |>
mutate(comunidad = factor(comunidad, c( "CATALUÑA", "C. MADRID"))) |>
ggplot(aes(ano_label, value)) +
geom_col(aes(fill = name)) +
geom_text(
nudge_y = 500000000,
aes(label = scales::number(value, suffix = "M €", scale = 1e-6, sep = "", accuracy = 1, big.mark = "")),
data = df_cat_mad_all
) +
scale_y_continuous(
labels = scales::unit_format(unit = "M €", scale = 1e-6, sep = ""),
n.breaks = 15,
guide = guide_axis(angle = 90)
) +
facet_wrap("comunidad", ncol = 1) +
coord_flip() +
labs(
title = "Distribución territorial de la inversión del Sector Público Estatal en Cataluña y Comunidad de Madrid",
caption = "Fuente: Ultimos-Distribucion-Territorial en buscadorcdi.gob.es y PDF ejercicios 2022-2023 hasta 31 diciembre | Autor: @jrosell",
y = "", x = "", fill = ""
) +
theme(
legend.position = "top",
panel.spacing.y = unit(2, "lines")
)
print(p2)
@jrosell
Copy link
Author

jrosell commented Jul 26, 2024

imatge

@jrosell
Copy link
Author

jrosell commented Jul 26, 2024

imatge

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment