Created
November 5, 2021 01:30
-
-
Save sientifiko/2f893c18b9d55ea01250cddfdf6dac25 to your computer and use it in GitHub Desktop.
Código para replicar la estadística de texto de los programas de gobierno
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages ----
# XML, RCurl and wordcloud2 are not referenced in the visible part of this
# script; they are kept in case other code relies on them.
library(XML)
library(RCurl)
library(tm)          # Corpus(), tm_map(), removeWords(), stopwords()
library(wordcloud2)
library(stm)         # textProcessor(), prepDocuments(), stm()
library(pdftools)    # pdf_text()
library(tidyverse)   # %>%, filter(), ggplot()
library(patchwork)   # plot_annotation() and `+` between plots
library(ggwordcloud) # geom_text_wordcloud_area()

# Use the classic ggplot2 theme for every plot created after this point.
theme_set(theme_classic())
#' Build a cleaned tm corpus from raw text.
#'
#' Lower-cases the text, strips punctuation and digits, removes three sets of
#' stop words (a hand-picked list, tm's Spanish list and a list downloaded
#' from webmining.cl) and finally collapses repeated whitespace.
#'
#' @param x Text to clean; it is wrapped with `VectorSource()`, so a character
#'   vector is expected.
#' @return The cleaned corpus produced by the `tm_map()` chain.
limpieza <- function(x) {
  # Fold accents to plain ASCII and drop entries that could not be converted.
  # The callers in this script transliterate their text to ASCII before
  # cleaning, so accented stop words would otherwise never match.
  a_ascii <- function(palabras) {
    plano <- iconv(enc2utf8(palabras), from = "UTF-8", to = "ASCII//TRANSLIT")
    plano[!is.na(plano) & nzchar(plano)]
  }

  # Build the corpus.
  corpus <- Corpus(VectorSource(x))

  # Lower-case. Plain functions must be wrapped in content_transformer() so
  # that tm_map() keeps the documents as tm text documents.
  d <- tm_map(corpus, content_transformer(tolower))
  # Remove punctuation.
  d <- tm_map(d, removePunctuation)
  # Remove numbers.
  d <- tm_map(d, removeNumbers)

  # Hand-picked words to remove.
  propias <- c(
    "ano", "columna", "anos", "refiere", "via", "torno", "nota",
    "forma", "formas", "area", "sino", "mas",
    "solo", "ello", "ser", "hacia",
    "uso", "parte", "debe", "cada",
    "deben", "sera", "dos", "asi",
    "fin", "pro", "cion", "traves",
    "ponga", "ademas", "manera"
  )
  d <- tm_map(d, removeWords, propias)

  # Generic Spanish stop words, in their original and accent-folded forms.
  genericas <- stopwords("spanish")
  d <- tm_map(d, removeWords, union(genericas, a_ascii(genericas)))

  # Custom stop-word list, downloaded and folded to ASCII. A failed download
  # is reported as a warning and the custom list is skipped, instead of
  # aborting the whole cleaning step.
  sw <- tryCatch(
    a_ascii(trimws(readLines(
      "http://www.webmining.cl/wp-content/uploads/2011/03/stopwords.es.txt",
      encoding = "UTF-8",
      warn = FALSE
    ))),
    error = function(e) {
      warning(
        "No se pudo descargar la lista de stopwords personalizada: ",
        conditionMessage(e),
        call. = FALSE
      )
      character(0)
    }
  )
  if (length(sw) > 0) {
    d <- tm_map(d, removeWords, sw)
  }

  # Collapse whitespace last: every removal above leaves gaps behind.
  tm_map(d, stripWhitespace)
}
#' Word-frequency table for a text.
#'
#' Cleans `x` with `limpieza()`, builds a term-document matrix and sums each
#' term's counts across documents.
#'
#' @param x Text passed on to `limpieza()`.
#' @return A data frame with columns `word` and `freq`, sorted by decreasing
#'   `freq`.
matriztexto <- function(x) {
  # Term-document matrix of the cleaned corpus, as a dense matrix.
  terminos <- as.matrix(TermDocumentMatrix(limpieza(x)))

  # Total count per term, most frequent first.
  conteos <- sort(rowSums(terminos), decreasing = TRUE)

  data.frame(word = names(conteos), freq = conteos)
}
#' Fit a structural topic model and plot it.
#'
#' @param data A text vector.
#' @param k Expected number of topics (passed to `stm()` as `K`).
#' @param n Number of words to plot per topic (passed to `plot()` as `n`).
#' @param title Main title of the plot.
#' @return The value of the final `plot()` call; the function is called for
#'   the topic plot it draws.
topicModeler <- function(data, k = 10, n = 3, title = "") {
  # Turn the text vector into stm's processing format, declaring the language.
  procesado <- textProcessor(data, language = "spanish")

  # Prepare the documents from the pieces of the processed object.
  preparado <- prepDocuments(
    procesado$documents,
    procesado$vocab,
    procesado$meta
  )

  # Fit the topic model.
  ajuste <- stm(
    preparado$documents,
    vocab = preparado$vocab,
    K = k,
    max.em.its = 75,
    data = preparado$meta,
    init.type = "Spectral",
    verbose = FALSE
  )

  # Plot the fitted model.
  plot(
    ajuste,
    xlab = "Proporción esperada de tópicos",
    xlim = c(0, 1),
    n = n,
    main = title
  )
}
# Read the programmes ----

# Extract the text of a PDF and transliterate it from UTF-8 to ASCII.
leer_programa <- function(ruta) {
  pdf_text(ruta) %>%
    iconv(from = "UTF-8", to = "ASCII//TRANSLIT")
}

# matrizartes <- pdf_text("artes.pdf") %>%
#   iconv(., from="UTF-8", to="ASCII//TRANSLIT") %>%
#   matriztexto(.)

# For each programme: `txt_*` holds the transliterated text and `matriz*`
# holds its word-frequency table.
txt_yasna <- leer_programa("yasna.pdf") %>%
  removeWords("iii")
# The pipe into matriztexto() was missing here, which left `matrizyasna` as
# the raw text vector instead of a frequency table.
matrizyasna <- txt_yasna %>%
  matriztexto()

txt_boric <- leer_programa("boric.pdf")
matrizboric <- txt_boric %>%
  matriztexto()

txt_nazi <- leer_programa("nazi.pdf")
matriznazi <- txt_nazi %>%
  matriztexto()

txt_pinera2 <- leer_programa("pinera2.pdf")
matrizpinera2 <- txt_pinera2 %>%
  matriztexto()
# Word clouds ----

# Word cloud of the words whose frequency is at or above the 99th percentile
# of `freq`. `matriz` needs `word` and `freq` columns; `...` is forwarded to
# labs() (subtitle, caption, ...).
nube_programa <- function(matriz, ...) {
  matriz %>%
    filter(freq >= quantile(freq, .99)) %>%
    ggplot() +
    aes(label = word, size = freq, color = word) +
    geom_text_wordcloud_area() +
    scale_size_area(max_size = 10) +
    labs(...)
}

# One panel per programme, combined with patchwork's `+` under a shared title.
nube_programa(matrizyasna, subtitle = "Programa Yasna") +
  nube_programa(matrizboric, subtitle = "Programa Boric") +
  nube_programa(matriznazi, subtitle = "Programa Nazi") +
  nube_programa(
    matrizpinera2,
    subtitle = "Programa Aborto de Piñera",
    caption = "1% de palabras más repetidas por programa"
  ) +
  plot_annotation(title = "Nube de palabras por programa")
# Topic models ----
# Draw one topic plot per programme, showing five words per topic.
invisible(Map(
  function(texto, titulo) topicModeler(texto, n = 5, title = titulo),
  list(txt_yasna, txt_boric, txt_nazi, txt_pinera2),
  c(
    "Topicos Yasna",
    "Topicos Boric",
    "Topicos Nazi",
    "Topicos Aborto de Piñera"
  )
))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment