Created March 11, 2019 05:09
-
-
Save kbenoit/6f069707120c64a5e4e4550d94a9fb7b to your computer and use it in GitHub Desktop.
Examples of polysemy for "kind" in the State of the Union corpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Examples of polysemy for "kind" in the State of the Union corpus.
#
# Extracts the sentences that contain the token "kind", tags them with spaCy
# parts of speech via spacyr, and tabulates how often "kind" occurs under each
# tag. Lines starting with `##` are console output recorded from the original
# run (quanteda 1.4.2, spaCy 2.0.18); they are kept for reference only.
#
# `%>%` is used without loading magrittr; the recorded output shows the script
# ran with only the two packages below attached.

library("quanteda")
## Package version: 1.4.2
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
##     View
library("spacyr")

# see https://github.com/quanteda/quanteda.corpora
data(data_corpus_sotu, package = "quanteda.corpora")

# create corpus of just sentences containing "kind"
corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
# window = 200 is presumably wide enough to capture the whole sentence on each
# side of the match; split_context = FALSE keeps the left and right context
# together as a single text per match instead of splitting them.
# NOTE(review): kwic() returns one row per match, so a sentence containing
# "kind" more than once may appear as several texts here and be counted
# repeatedly below -- confirm whether that is intended.
corp_kind <- kwic(corp_sents, "kind", window = 200) %>%
  corpus(split_context = FALSE, extract_keyword = FALSE)

# tag the parts of speech
sp <- spacyr::spacy_parse(texts(corp_kind))
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 2.0.18, language model: en)
## (python options: type = "condaenv", value = "spacy_condaenv")

# convert to quanteda tokens with pos tags
toks <- as.tokens(sp, include_pos = "pos")

# get frequencies of different variants of "kind", summarize
# ("kind/*" keeps only the features made of the token "kind" plus a POS tag)
tstat <- dfm(toks, select = "kind/*") %>%
  textstat_frequency()
tstat
##     feature frequency rank docfreq group
## 1 kind/noun       302    1     290   all
## 2  kind/adj        13    2      13   all
## 3  kind/adv         3    3       3   all

# total occurrences of "kind" across all tags
sum(tstat$frequency)
## [1] 318

# share of occurrences under each tag, in the same row order as tstat
tstat$frequency / sum(tstat$frequency)
## [1] 0.949685535 0.040880503 0.009433962
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.