Created March 21, 2019, 06:45.
Save kbenoit/4593ca1deeb4d890077f3b12ba468888 to your computer and use it in GitHub Desktop.
Analysis code from "Text as Data: An Overview".
Note: this file may contain bidirectional Unicode text that can be interpreted or compiled differently than it appears below. To review it, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
## Load quanteda (startup messages from the original run kept as comments).
## Package name quoted for consistency with library("spacyr") below.
library("quanteda")
## Package version: 1.4.3
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# inflation: keyword-in-context view of "inflation" in the inaugural corpus,
# with a 20-token window on each side (window named rather than positional).
kwic(data_corpus_inaugural, phrase("inflation"), window = 20)
##
## [1981-Reagan, 806]
## [1985-Reagan, 468]
## [1985-Reagan, 572]
##
## born of bigotry or discrimination. Putting America back to work means putting all Americans back to work. Ending
## Government that properly belonged to States or to local governments or to the people themselves. We allowed taxes and
## free to follow their dreams. And we were right to believe that. Tax rates have been reduced,
##
## | inflation |
## | inflation |
## | inflation |
##
## means freeing all Americans from the terror of runaway living costs. All must share in the productive work of
## to rob us of our earnings and savings and watched the great industrial machine that had made us the most
## cut dramatically, and more people are employed than ever before in our history. We are creating a nation
## workflow figure
## FIX: data_corpus_sotu lives in the quanteda.corpora package; the original
## gist used it before loading it, which produced the recorded errors
## ("object 'data_corpus_sotu' not found").  Load the dataset first so the
## pipeline below runs.
data(data_corpus_sotu, package = "quanteda.corpora")
# DFM of the State of the Union corpus: punctuation removed, English
# stopwords dropped, features sorted by overall frequency.
sotu_dfm <- dfm(data_corpus_sotu, remove_punct = TRUE) %>%
  dfm_remove(stopwords("en")) %>%
  dfm_sort()
# Inspect selected documents and features (nf = number of features shown).
head(sotu_dfm[
  c("Clinton-2000", "Bush-2008", "Obama-2016", "Trump-2019"),
  c("economy", "united", "wall", "crime", "climate")
], nf = 8)
## "kind" for dictionaries | |
library("spacyr") | |
# see https://github.com/quanteda/quanteda.corpora | |
data(data_corpus_sotu, package = "quanteda.corpora") | |
# create corpus of just sentences containing "kind" | |
corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences") | |
corp_kind <- kwic(corp_sents, "kind", window = 200) %>% | |
corpus(split_context = FALSE, extract_keyword = FALSE) | |
# tag the parts of speech | |
sp <- spacyr::spacy_parse(texts(corp_kind)) | |
## Found 'spacy_condaenv'. spacyr will use this environment | |
## successfully initialized (spaCy Version: 2.1.0, language model: en) | |
## (python options: type = "condaenv", value = "spacy_condaenv") | |
# convert to quanteda tokens with pos tags | |
toks <- as.tokens(sp, include_pos = "pos") | |
# get frequencies of different variants of "kind", summarize | |
tstat <- dfm(toks, select = "kind/*") %>% | |
textstat_frequency() | |
tstat | |
## feature frequency rank docfreq group | |
## 1 kind/noun 298 1 286 all | |
## 2 kind/adj 16 2 16 all | |
## 3 kind/adv 3 3 3 all | |
## 4 kind/propn 1 4 1 all | |
sum(tstat$frequency) | |
## [1] 318 | |
tstat$frequency / sum(tstat$frequency) | |
## [1] 0.937106918 0.050314465 0.009433962 0.003144654 | |
## illustrate sparsity of a document-feature matrix
# DFM of inaugural addresses through 2019: lowercased, punctuation and
# numbers removed.
inaugdfm <- corpus_subset(data_corpus_inaugural, Year <= 2019) %>%
  dfm(remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE)
inaugdfm
## Document-feature matrix of: 58 documents, 9,273 features (91.8% sparse).
# total number of cells in the matrix
prod(dim(inaugdfm))
## [1] 537834
# hapax legomena: features that occur exactly once across the whole corpus
hapaxes <- featnames(inaugdfm)[colSums(inaugdfm) == 1]
length(hapaxes)
## [1] 3846
# hapaxes as a share of all features
length(hapaxes) / nfeat(inaugdfm)
## [1] 0.4147525
head(sort(hapaxes), 100)
## [1] "14th" "18th" "30th"
## [4] "3d" "4th" "6th"
## [7] "abate" "abdicated" "abeyance"
## [10] "abhorring" "abject" "ably"
## [13] "abode" "abodes" "abolishing"
## [16] "aborigines" "abound" "abounds"
## [19] "abridging" "absolutism" "absorb"
## [22] "absorbed" "absorbing" "absorbs"
## [25] "abstaining" "abstract" "abstractions"
## [28] "absurd" "academies" "accepts"
## [31] "accident" "accidental" "accidents"
## [34] "accommodation" "accommodations" "accompany"
## [37] "accorded" "accords" "accrue"
## [40] "accrued" "accruing" "accumulate"
## [43] "accumulated" "accurately" "accustom"
## [46] "achieving" "acknowledgment" "acquaintance"
## [49] "acquires" "acquiring" "acquit"
## [52] "acrimony" "actively" "activism"
## [55] "actuate" "acute" "adams"
## [58] "addiction" "additions" "addresses"
## [61] "adduced" "adhered" "adheres"
## [64] "adjective" "adjunct" "adjustments"
## [67] "administrated" "administration's" "administrators"
## [70] "admirably" "admissions" "admitting"
## [73] "admonishes" "admonitions" "adopting"
## [76] "adore" "adoring" "adorn"
## [79] "adorns" "adventurers" "adventurously"
## [82] "adverted" "advisers" "advisory"
## [85] "advocates" "affiliation" "affirmation"
## [88] "affirmations" "afflict" "affliction"
## [91] "afghanistan" "afield" "afloat"
## [94] "afresh" "afte" "aftermath"
## [97] "aggravated" "aggravation" "aggressive"
## [100] "aggressor"
# show one hapax in context
kwic(data_corpus_inaugural, "aborigines", window = 20)
##
## [1873-Grant, 951]
##
## a specie basis; to the elevation of labor; and, by a humane course, to bring the
##
## | aborigines |
##
## of the country under the benign influences of education and civilization. It is either this or war of extermination
## uninteresting ngrams: most frequent bigrams after stopword removal
# pad = TRUE keeps placeholder gaps so ngrams do not span removed stopwords
toks <- tokens(data_corpus_inaugural, remove_punct = TRUE) %>%
  tokens_remove(stopwords("en"), pad = TRUE) %>%
  tokens_ngrams(n = 2)
dfm(toks) %>%
  topfeatures()
## united_states let_us fellow_citizens
## 157 97 78
## american_people federal_government years_ago
## 40 32 26
## four_years general_government upon_us
## 26 25 24
## every_citizen
## 18
## tokens to text to matrix | |
txt <- c( | |
t1 = "The Social Democratic Party opposes tax cuts for the wealthy.", | |
t2 = "We are opposed to spending another 10 million on social welfare." | |
) | |
tokens(txt) | |
## tokens from 2 documents. | |
## t1 : | |
## [1] "The" "Social" "Democratic" "Party" "opposes" | |
## [6] "tax" "cuts" "for" "the" "wealthy" | |
## [11] "." | |
## | |
## t2 : | |
## [1] "We" "are" "opposed" "to" "spending" "another" | |
## [7] "10" "million" "on" "social" "welfare" "." | |
# same texts with punctuation removed
tokens(txt, remove_punct = TRUE)
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democratic" "Party" "opposes"
## [6] "tax" "cuts" "for" "the" "wealthy"
##
## t2 :
## [1] "We" "are" "opposed" "to" "spending" "another"
## [7] "10" "million" "on" "social" "welfare"
# same texts with numbers removed ("10" drops out of t2)
tokens(txt, remove_numbers = TRUE)
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democratic" "Party" "opposes"
## [6] "tax" "cuts" "for" "the" "wealthy"
## [11] "."
##
## t2 :
## [1] "We" "are" "opposed" "to" "spending" "another"
## [7] "million" "on" "social" "welfare" "."
# stemming reduces tokens to their stems (e.g. "opposes" -> "oppos")
tokens(txt) %>%
  tokens_wordstem()
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democrat" "Parti" "oppos" "tax"
## [7] "cut" "for" "the" "wealthi" "."
##
## t2 :
## [1] "We" "are" "oppos" "to" "spend" "anoth" "10"
## [8] "million" "on" "social" "welfar" "."
# stemming followed by lowercasing
tokens(txt) %>%
  tokens_wordstem() %>%
  tokens_tolower()
## tokens from 2 documents.
## t1 :
## [1] "the" "social" "democrat" "parti" "oppos" "tax"
## [7] "cut" "for" "the" "wealthi" "."
##
## t2 :
## [1] "we" "are" "oppos" "to" "spend" "anoth" "10"
## [8] "million" "on" "social" "welfar" "."
# extract noun phrases via spacyr (requires a spaCy language model)
spacy_parse(txt, nounphrase = TRUE) %>%
  nounphrase_extract()
## doc_id sentence_id nounphrase
## 1 t1 1 The_Social_Democratic_Party
## 2 t1 1 tax_cuts
## 3 t2 1 We
## 4 t2 1 social_welfare
# extract named entities via spacyr
spacy_parse(txt, entity = TRUE) %>%
  entity_extract()
## doc_id sentence_id entity entity_type
## 1 t1 1 The_Social_Democratic_Party ORG
# consolidate noun phrases into single tokens, keeping POS tags on the rest
spacy_parse(txt, nounphrase = TRUE) %>%
  nounphrase_consolidate() %>%
  as.tokens(include_pos = "pos")
## tokens from 2 documents.
## t1 :
## [1] "The_Social_Democratic_Party/nounphrase"
## [2] "opposes/VERB"
## [3] "tax_cuts/nounphrase"
## [4] "for/ADP"
## [5] "the/DET"
## [6] "wealthy/ADJ"
## [7] "./PUNCT"
##
## t2 :
## [1] "We/nounphrase" "are/VERB"
## [3] "opposed/VERB" "to/ADP"
## [5] "spending/VERB" "another/DET"
## [7] "10/NUM" "million/NUM"
## [9] "on/ADP" "social_welfare/nounphrase"
## [11] "./PUNCT"
## annotating tokens with POS tags: the three "kind"s get distinct tags
spacyr::spacy_parse("My kind of friend is kind of kind.") %>%
  as.tokens(include_pos = "pos") %>%
  tokens_select("kind/*")
## tokens from 1 document.
## text1 :
## [1] "kind/NOUN" "kind/ADV" "kind/ADJ"
# POS tags disambiguate the verb "sanctions" from the noun "sanctions"
spacyr::spacy_parse("The President sanctions the sanctions against Iran.") %>%
  as.tokens(include_pos = "pos")
## tokens from 1 document.
## text1 :
## [1] "The/DET" "President/PROPN" "sanctions/VERB" "the/DET"
## [5] "sanctions/NOUN" "against/ADP" "Iran/PROPN" "./PUNCT"
## similarity example
txt <- c(
  "Party X prioritizes economic growth, even at the cost of environmental protection.",
  "Party X prioritizes environmental protection, even at the cost of economic growth.",
  "Party Y embraces protection of citizens through universal health care."
)
# Cosine similarity of bag-of-words vectors: texts 1 and 2 contain the same
# words in a different order, so their similarity is 1 despite opposite
# meanings -- a limitation of the bag-of-words representation.
dfm(txt) %>%
  textstat_simil(method = "cosine")
## text1 text2
## text2 1.0000000
## text3 0.3223292 0.3223292
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.