kbenoit · March 11, 2019 05:09
diff --git a/compare_kind.R b/compare_kind.R
 # Purpose: tabulate how often "kind" is tagged as a noun, adjective, or
 # adverb by spaCy in the sentences of data_corpus_sotu that contain it.
 library("quanteda")
 ## Package version: 1.4.2
 ## Parallel computing: 2 of 12 threads used.
 ## See https://quanteda.io for tutorials and examples.
 ## 
 ## Attaching package: 'quanteda'
 ## The following object is masked from 'package:utils':
 ## 
 ##     View
 library("spacyr")

 # see https://github.com/quanteda/quanteda.corpora
 data(data_corpus_sotu, package = "quanteda.corpora")

 # create corpus of just sentences containing "kind"
 corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
 # kwic() returns one row per match, so a sentence with several "kind" tokens
 # shows up several times. Building the corpus directly from the kwic rows
 # would parse such a sentence once per match and count every one of its
 # matches each time. Use the matches only to identify the sentences, and keep
 # each original sentence exactly once (this also hands spaCy the untouched
 # sentence text instead of re-pasted tokens).
 kw_kind <- kwic(corp_sents, "kind", window = 1)
 has_kind <- docnames(corp_sents) %in% unique(kw_kind$docname)
 corp_kind <- corpus_subset(corp_sents, has_kind)

 # tag the parts of speech
 sp <- spacyr::spacy_parse(texts(corp_kind))
 ## Found 'spacy_condaenv'. spacyr will use this environment
 ## successfully initialized (spaCy Version: 2.0.18, language model: en)
 ## (python options: type = "condaenv", value = "spacy_condaenv")

 # convert to quanteda tokens with pos tags
 toks <- as.tokens(sp, include_pos = "pos")

 # get frequencies of different variants of "kind", summarize
 tstat <- dfm(toks, select = "kind/*") %>%
   textstat_frequency()

 # NOTE: the results recorded below were produced by the earlier version of
 # this script, which built corp_kind from one document per kwic match and so
 # over-counted sentences containing "kind" more than once. Re-run the script
 # to refresh them.
 tstat
 ##     feature frequency rank docfreq group
 ## 1 kind/noun       302    1     290   all
 ## 2  kind/adj        13    2      13   all
 ## 3  kind/adv         3    3       3   all
 sum(tstat$frequency)
 ## [1] 318
 # share of each part-of-speech variant among all tagged occurrences
 tstat$frequency / sum(tstat$frequency)
 ## [1] 0.949685535 0.040880503 0.009433962
	# Purpose: tabulate how often "kind" is tagged as a noun, adjective, or
	# adverb by spaCy in the sentences of data_corpus_sotu that contain it.
	library("quanteda")
	## Package version: 1.4.2
	## Parallel computing: 2 of 12 threads used.
	## See https://quanteda.io for tutorials and examples.
	##
	## Attaching package: 'quanteda'
	## The following object is masked from 'package:utils':
	##
	##     View
	library("spacyr")

	# see https://github.com/quanteda/quanteda.corpora
	data(data_corpus_sotu, package = "quanteda.corpora")

	# create corpus of just sentences containing "kind"
	corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
	# kwic() returns one row per match, so a sentence with several "kind" tokens
	# shows up several times. Building the corpus directly from the kwic rows
	# would parse such a sentence once per match and count every one of its
	# matches each time. Use the matches only to identify the sentences, and keep
	# each original sentence exactly once (this also hands spaCy the untouched
	# sentence text instead of re-pasted tokens).
	kw_kind <- kwic(corp_sents, "kind", window = 1)
	has_kind <- docnames(corp_sents) %in% unique(kw_kind$docname)
	corp_kind <- corpus_subset(corp_sents, has_kind)

	# tag the parts of speech
	sp <- spacyr::spacy_parse(texts(corp_kind))
	## Found 'spacy_condaenv'. spacyr will use this environment
	## successfully initialized (spaCy Version: 2.0.18, language model: en)
	## (python options: type = "condaenv", value = "spacy_condaenv")

	# convert to quanteda tokens with pos tags
	toks <- as.tokens(sp, include_pos = "pos")

	# get frequencies of different variants of "kind", summarize
	tstat <- dfm(toks, select = "kind/*") %>%
	  textstat_frequency()

	# NOTE: the results recorded below were produced by the earlier version of
	# this script, which built corp_kind from one document per kwic match and so
	# over-counted sentences containing "kind" more than once. Re-run the script
	# to refresh them.
	tstat
	##     feature frequency rank docfreq group
	## 1 kind/noun       302    1     290   all
	## 2  kind/adj        13    2      13   all
	## 3  kind/adv         3    3       3   all
	sum(tstat$frequency)
	## [1] 318
	# share of each part-of-speech variant among all tagged occurrences
	tstat$frequency / sum(tstat$frequency)
	## [1] 0.949685535 0.040880503 0.009433962