Skip to content

Instantly share code, notes, and snippets.

@sientifiko
Created November 5, 2021 01:30
Show Gist options
  • Save sientifiko/2f893c18b9d55ea01250cddfdf6dac25 to your computer and use it in GitHub Desktop.
Save sientifiko/2f893c18b9d55ea01250cddfdf6dac25 to your computer and use it in GitHub Desktop.
Código para replicar la estadística de texto de los programas de gobierno
library(XML)
library(RCurl)
library(tm)
library(wordcloud2)
library(stm)
library(pdftools)
library(tidyverse)
library(patchwork)
library(ggwordcloud)
# Make the classic ggplot2 theme the default for plots drawn afterwards.
ggplot2::theme_set(ggplot2::theme_classic())
# Build a tm corpus from a character vector and clean it: lower-case the
# text, collapse whitespace, drop punctuation and numbers, and remove three
# sets of words (a hand-picked list, tm's Spanish stop words and a custom
# stop-word list downloaded from the web).
#
# x: a character vector of text.
# Returns the cleaned tm corpus.
# Note: requires network access; readLines() fails if the stop-word URL
# cannot be reached.
limpieza <- function(x) {
  # build a corpus
  corpus <- Corpus(VectorSource(x))
  # convert to lower case; content_transformer() keeps the documents as tm
  # text documents instead of replacing them with bare character vectors
  d <- tm_map(corpus, content_transformer(tolower))
  # collapse runs of whitespace
  d <- tm_map(d, stripWhitespace)
  # remove punctuation
  d <- tm_map(d, removePunctuation)
  # remove numbers
  d <- tm_map(d, removeNumbers)
  # hand-picked words to remove
  vector <- c("ano", "columna", "anos", "refiere", "via", "torno", "nota",
              "forma", "formas", "area", "sino", "mas",
              "solo", "ello", "ser", "hacia",
              "uso", "parte", "debe", "cada",
              "deben", "sera", "dos", "asi",
              "fin", "pro", "cion", "traves",
              "ponga", "ademas", "manera")
  d <- tm_map(d, removeWords, vector)
  # load the custom stop-word file and transliterate it to ASCII; the source
  # encoding is stated explicitly so the result does not depend on the
  # native locale
  sw <- readLines("http://www.webmining.cl/wp-content/uploads/2011/03/stopwords.es.txt", encoding="UTF-8")
  sw <- iconv(sw, from = "UTF-8", to = "ASCII//TRANSLIT")
  # drop entries that failed to convert and blank lines, so they do not end
  # up in the pattern built by removeWords()
  sw <- trimws(sw[!is.na(sw)])
  sw <- sw[nzchar(sw)]
  # remove generic Spanish stop words
  d <- tm_map(d, removeWords, stopwords("spanish"))
  # remove the custom stop words
  d <- tm_map(d, removeWords, sw)
  d
}
# Compute word frequencies for a text vector.
#
# x: a character vector of text, cleaned with limpieza() before counting.
# Returns a data frame with columns `word` and `freq`, sorted by decreasing
# frequency.
matriztexto <- function(x) {
  # term-document matrix of the cleaned corpus, as a dense matrix
  terminos <- as.matrix(TermDocumentMatrix(limpieza(x)))
  # total count of each term across all documents, most frequent first
  frecuencias <- sort(rowSums(terminos), decreasing = TRUE)
  data.frame(word = names(frecuencias), freq = frecuencias)
}
#' Fit a topic model on a text vector and plot it.
#'
#' @param data a character vector of text
#' @param k the number of expected topics
#' @param n the number of words to plot per topic
#' @param title main title of the plot
#' @return a topic plot
topicModeler <- function(data, k = 10, n = 3, title = "") {
  # turn the text vector into stm's processing format, stating the language
  procesado <- textProcessor(data, language = "spanish")
  # prepare the documents from the pieces of the processed object
  preparado <- prepDocuments(
    procesado$documents,
    procesado$vocab,
    procesado$meta
  )
  # fit the topic model
  ajuste <- stm(
    preparado$documents,
    vocab = preparado$vocab,
    K = k,
    data = preparado$meta,
    init.type = "Spectral",
    max.em.its = 75,
    verbose = FALSE
  )
  # plot the fitted model
  plot(
    ajuste,
    n = n,
    main = title,
    xlim = c(0, 1),
    xlab = "Proporción esperada de tópicos"
  )
} # end of topicModeler()
# matrizartes <- pdf_text("artes.pdf") %>%
# iconv(., from="UTF-8", to="ASCII//TRANSLIT") %>%
# matriztexto(.)
# Read the text of yasna.pdf, transliterate it to ASCII and remove the
# word "iii".
txt_yasna <- pdf_text("yasna.pdf") %>%
  iconv(., from = "UTF-8", to = "ASCII//TRANSLIT") %>%
  removeWords("iii")
# Word-frequency table for the programme. The pipe is required here: without
# it matrizyasna is just the raw text and matriztexto(.) fails because `.`
# is undefined.
matrizyasna <- txt_yasna %>%
  matriztexto(.)
# For each remaining programme: read the PDF text, transliterate it to ASCII
# and build its word-frequency table.
txt_boric <- iconv(
  pdf_text("boric.pdf"),
  from = "UTF-8", to = "ASCII//TRANSLIT"
)
matrizboric <- matriztexto(txt_boric)

txt_nazi <- iconv(
  pdf_text("nazi.pdf"),
  from = "UTF-8", to = "ASCII//TRANSLIT"
)
matriznazi <- matriztexto(txt_nazi)

txt_pinera2 <- iconv(
  pdf_text("pinera2.pdf"),
  from = "UTF-8", to = "ASCII//TRANSLIT"
)
matrizpinera2 <- matriztexto(txt_pinera2)
# Word cloud of the most frequent words of one programme.
#
# matriz:    data frame with columns `word` and `freq` (as returned by
#            matriztexto()).
# subtitulo: subtitle shown above the cloud.
# caption:   optional caption; left unset by default.
# Returns a ggplot object showing the words whose frequency is at or above
# the 99th percentile, sized by frequency and coloured by word.
nube_programa <- function(matriz, subtitulo, caption = waiver()) {
  matriz %>%
    filter(freq >= quantile(freq, .99)) %>%
    ggplot() +
    aes(label = word, size = freq, color = word) +
    geom_text_wordcloud_area() +
    scale_size_area(max_size = 10) +
    labs(subtitle = subtitulo, caption = caption)
}

# Combine the four clouds with patchwork; only the last panel carries the
# caption, and plot_annotation() adds the overall title.
nube_programa(matrizyasna, "Programa Yasna") +
  nube_programa(matrizboric, "Programa Boric") +
  nube_programa(matriznazi, "Programa Nazi") +
  nube_programa(
    matrizpinera2, "Programa Aborto de Piñera",
    caption = "1% de palabras más repetidas por programa"
  ) +
  plot_annotation(title = "Nube de palabras por programa")
# Topic plot for each programme, showing five words per topic.
topicModeler(data = txt_yasna, n = 5, title = "Topicos Yasna")
topicModeler(data = txt_boric, n = 5, title = "Topicos Boric")
topicModeler(data = txt_nazi, n = 5, title = "Topicos Nazi")
topicModeler(data = txt_pinera2, n = 5, title = "Topicos Aborto de Piñera")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment