Created March 21, 2019, 06:45.
Save kbenoit/4593ca1deeb4d890077f3b12ba468888 to your computer and use it in GitHub Desktop.
Analysis code from "Text as Data: An Overview".
Note: this file may contain bidirectional Unicode text that can be interpreted or compiled differently than it appears below. To review it, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
## Load quanteda (startup messages from the original run kept as comments).
## Package name quoted for consistency with library("spacyr") below.
library("quanteda")
## Package version: 1.4.3
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# inflation: keyword-in-context view of "inflation" in the inaugural corpus,
# with a 20-token window on each side (window named rather than positional).
kwic(data_corpus_inaugural, phrase("inflation"), window = 20)
##
## [1981-Reagan, 806]
## [1985-Reagan, 468]
## [1985-Reagan, 572]
##
## born of bigotry or discrimination. Putting America back to work means putting all Americans back to work. Ending
## Government that properly belonged to States or to local governments or to the people themselves. We allowed taxes and
## free to follow their dreams. And we were right to believe that. Tax rates have been reduced,
##
## | inflation |
## | inflation |
## | inflation |
##
## means freeing all Americans from the terror of runaway living costs. All must share in the productive work of
## to rob us of our earnings and savings and watched the great industrial machine that had made us the most
## cut dramatically, and more people are employed than ever before in our history. We are creating a nation
## workflow figure
## FIX: data_corpus_sotu lives in the quanteda.corpora package; the original
## gist used it before loading it, which produced the recorded errors
## ("object 'data_corpus_sotu' not found").  Load the dataset first so the
## pipeline below runs.
data(data_corpus_sotu, package = "quanteda.corpora")
# DFM of the State of the Union corpus: punctuation removed, English
# stopwords dropped, features sorted by overall frequency.
sotu_dfm <- dfm(data_corpus_sotu, remove_punct = TRUE) %>%
  dfm_remove(stopwords("en")) %>%
  dfm_sort()
# Inspect selected documents and features (nf = number of features shown).
head(sotu_dfm[
  c("Clinton-2000", "Bush-2008", "Obama-2016", "Trump-2019"),
  c("economy", "united", "wall", "crime", "climate")
], nf = 8)
## "kind" for dictionaries | |
library("spacyr") | |
# see https://github.com/quanteda/quanteda.corpora | |
data(data_corpus_sotu, package = "quanteda.corpora") | |
# create corpus of just sentences containing "kind" | |
corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences") | |
corp_kind <- kwic(corp_sents, "kind", window = 200) %>% | |
corpus(split_context = FALSE, extract_keyword = FALSE) | |
# tag the parts of speech | |
sp <- spacyr::spacy_parse(texts(corp_kind)) | |
## Found 'spacy_condaenv'. spacyr will use this environment | |
## successfully initialized (spaCy Version: 2.1.0, language model: en) | |
## (python options: type = "condaenv", value = "spacy_condaenv") | |
# convert to quanteda tokens with pos tags | |
toks <- as.tokens(sp, include_pos = "pos") | |
# get frequencies of different variants of "kind", summarize | |
tstat <- dfm(toks, select = "kind/*") %>% | |
textstat_frequency() | |
tstat | |
## feature frequency rank docfreq group | |
## 1 kind/noun 298 1 286 all | |
## 2 kind/adj 16 2 16 all | |
## 3 kind/adv 3 3 3 all | |
## 4 kind/propn 1 4 1 all | |
sum(tstat$frequency) | |
## [1] 318 | |
tstat$frequency / sum(tstat$frequency) | |
## [1] 0.937106918 0.050314465 0.009433962 0.003144654 | |
## illustrate sparsity of a document-feature matrix
# DFM of inaugural addresses through 2019: lowercased, punctuation and
# numbers removed.
inaugdfm <- corpus_subset(data_corpus_inaugural, Year <= 2019) %>%
  dfm(remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE)
inaugdfm
## Document-feature matrix of: 58 documents, 9,273 features (91.8% sparse).
# total number of cells in the matrix
prod(dim(inaugdfm))
## [1] 537834
# hapax legomena: features that occur exactly once across the whole corpus
hapaxes <- featnames(inaugdfm)[colSums(inaugdfm) == 1]
length(hapaxes)
## [1] 3846
# hapaxes as a share of all features
length(hapaxes) / nfeat(inaugdfm)
## [1] 0.4147525
head(sort(hapaxes), 100)
## [1] "14th" "18th" "30th"
## [4] "3d" "4th" "6th"
## [7] "abate" "abdicated" "abeyance"
## [10] "abhorring" "abject" "ably"
## [13] "abode" "abodes" "abolishing"
## [16] "aborigines" "abound" "abounds"
## [19] "abridging" "absolutism" "absorb"
## [22] "absorbed" "absorbing" "absorbs"
## [25] "abstaining" "abstract" "abstractions"
## [28] "absurd" "academies" "accepts"
## [31] "accident" "accidental" "accidents"
## [34] "accommodation" "accommodations" "accompany"
## [37] "accorded" "accords" "accrue"
## [40] "accrued" "accruing" "accumulate"
## [43] "accumulated" "accurately" "accustom"
## [46] "achieving" "acknowledgment" "acquaintance"
## [49] "acquires" "acquiring" "acquit"
## [52] "acrimony" "actively" "activism"
## [55] "actuate" "acute" "adams"
## [58] "addiction" "additions" "addresses"
## [61] "adduced" "adhered" "adheres"
## [64] "adjective" "adjunct" "adjustments"
## [67] "administrated" "administration's" "administrators"
## [70] "admirably" "admissions" "admitting"
## [73] "admonishes" "admonitions" "adopting"
## [76] "adore" "adoring" "adorn"
## [79] "adorns" "adventurers" "adventurously"
## [82] "adverted" "advisers" "advisory"
## [85] "advocates" "affiliation" "affirmation"
## [88] "affirmations" "afflict" "affliction"
## [91] "afghanistan" "afield" "afloat"
## [94] "afresh" "afte" "aftermath"
## [97] "aggravated" "aggravation" "aggressive"
## [100] "aggressor"
# show one hapax in context
kwic(data_corpus_inaugural, "aborigines", window = 20)
##
## [1873-Grant, 951]
##
## a specie basis; to the elevation of labor; and, by a humane course, to bring the
##
## | aborigines |
##
## of the country under the benign influences of education and civilization. It is either this or war of extermination
## uninteresting ngrams: most frequent bigrams after stopword removal
# pad = TRUE keeps placeholder gaps so ngrams do not span removed stopwords
toks <- tokens(data_corpus_inaugural, remove_punct = TRUE) %>%
  tokens_remove(stopwords("en"), pad = TRUE) %>%
  tokens_ngrams(n = 2)
dfm(toks) %>%
  topfeatures()
## united_states let_us fellow_citizens
## 157 97 78
## american_people federal_government years_ago
## 40 32 26
## four_years general_government upon_us
## 26 25 24
## every_citizen
## 18
## tokens to text to matrix | |
txt <- c( | |
t1 = "The Social Democratic Party opposes tax cuts for the wealthy.", | |
t2 = "We are opposed to spending another 10 million on social welfare." | |
) | |
tokens(txt) | |
## tokens from 2 documents. | |
## t1 : | |
## [1] "The" "Social" "Democratic" "Party" "opposes" | |
## [6] "tax" "cuts" "for" "the" "wealthy" | |
## [11] "." | |
## | |
## t2 : | |
## [1] "We" "are" "opposed" "to" "spending" "another" | |
## [7] "10" "million" "on" "social" "welfare" "." | |
# same texts with punctuation removed
tokens(txt, remove_punct = TRUE)
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democratic" "Party" "opposes"
## [6] "tax" "cuts" "for" "the" "wealthy"
##
## t2 :
## [1] "We" "are" "opposed" "to" "spending" "another"
## [7] "10" "million" "on" "social" "welfare"
# same texts with numbers removed ("10" drops out of t2)
tokens(txt, remove_numbers = TRUE)
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democratic" "Party" "opposes"
## [6] "tax" "cuts" "for" "the" "wealthy"
## [11] "."
##
## t2 :
## [1] "We" "are" "opposed" "to" "spending" "another"
## [7] "million" "on" "social" "welfare" "."
# stemming reduces tokens to their stems (e.g. "opposes" -> "oppos")
tokens(txt) %>%
  tokens_wordstem()
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democrat" "Parti" "oppos" "tax"
## [7] "cut" "for" "the" "wealthi" "."
##
## t2 :
## [1] "We" "are" "oppos" "to" "spend" "anoth" "10"
## [8] "million" "on" "social" "welfar" "."
# stemming followed by lowercasing
tokens(txt) %>%
  tokens_wordstem() %>%
  tokens_tolower()
## tokens from 2 documents.
## t1 :
## [1] "the" "social" "democrat" "parti" "oppos" "tax"
## [7] "cut" "for" "the" "wealthi" "."
##
## t2 :
## [1] "we" "are" "oppos" "to" "spend" "anoth" "10"
## [8] "million" "on" "social" "welfar" "."
# extract noun phrases via spacyr (requires a spaCy language model)
spacy_parse(txt, nounphrase = TRUE) %>%
  nounphrase_extract()
## doc_id sentence_id nounphrase
## 1 t1 1 The_Social_Democratic_Party
## 2 t1 1 tax_cuts
## 3 t2 1 We
## 4 t2 1 social_welfare
# extract named entities via spacyr
spacy_parse(txt, entity = TRUE) %>%
  entity_extract()
## doc_id sentence_id entity entity_type
## 1 t1 1 The_Social_Democratic_Party ORG
# consolidate noun phrases into single tokens, keeping POS tags on the rest
spacy_parse(txt, nounphrase = TRUE) %>%
  nounphrase_consolidate() %>%
  as.tokens(include_pos = "pos")
## tokens from 2 documents.
## t1 :
## [1] "The_Social_Democratic_Party/nounphrase"
## [2] "opposes/VERB"
## [3] "tax_cuts/nounphrase"
## [4] "for/ADP"
## [5] "the/DET"
## [6] "wealthy/ADJ"
## [7] "./PUNCT"
##
## t2 :
## [1] "We/nounphrase" "are/VERB"
## [3] "opposed/VERB" "to/ADP"
## [5] "spending/VERB" "another/DET"
## [7] "10/NUM" "million/NUM"
## [9] "on/ADP" "social_welfare/nounphrase"
## [11] "./PUNCT"
## annotating tokens with POS tags: the three "kind"s get distinct tags
spacyr::spacy_parse("My kind of friend is kind of kind.") %>%
  as.tokens(include_pos = "pos") %>%
  tokens_select("kind/*")
## tokens from 1 document.
## text1 :
## [1] "kind/NOUN" "kind/ADV" "kind/ADJ"
# POS tags disambiguate the verb "sanctions" from the noun "sanctions"
spacyr::spacy_parse("The President sanctions the sanctions against Iran.") %>%
  as.tokens(include_pos = "pos")
## tokens from 1 document.
## text1 :
## [1] "The/DET" "President/PROPN" "sanctions/VERB" "the/DET"
## [5] "sanctions/NOUN" "against/ADP" "Iran/PROPN" "./PUNCT"
## similarity example
txt <- c(
  "Party X prioritizes economic growth, even at the cost of environmental protection.",
  "Party X prioritizes environmental protection, even at the cost of economic growth.",
  "Party Y embraces protection of citizens through universal health care."
)
# Cosine similarity of bag-of-words vectors: texts 1 and 2 contain the same
# words in a different order, so their similarity is 1 despite opposite
# meanings -- a limitation of the bag-of-words representation.
dfm(txt) %>%
  textstat_simil(method = "cosine")
## text1 text2
## text2 1.0000000
## text3 0.3223292 0.3223292
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.