# Created March 11, 2019
#> Loading required package: NLP
#> Attaching package: 'NLP'
#> The following object is masked from 'package:ggplot2':
#>     annotate

# seed the random number generator so results are reproducible
set.seed(123)

# load the USCongress data set from the RTextTools package
data(USCongress, package = "RTextTools")

# convert to a tibble and store the bill text as a character vector
congress <- mutate(as_tibble(USCongress), text = as.character(text))

# partition congress into training and test sets;
# prop = 0.7 is the proportion handed to initial_split()
congress_split <- congress %>%
  initial_split(prop = 0.7)

# pull the two partitions out of the split object
congress_train <- congress_split %>%
  training()
congress_test <- congress_split %>%
  testing()

#> Observations: 4,449
#> Variables: 6
#> $ ID       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
#> $ cong     <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum  <int> 4499, 4500, 4501, 4502, 4503, 4504, 4505, 4506, 4507, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major    <int> 18, 18, 18, 18, 5, 21, 15, 18, 18, 18, 18, 16, 18, 12, …
#> $ text     <chr> "To suspend temporarily the duty on Fast Magenta 2 Stag…
#> Observations: 3,115
#> Variables: 6
#> $ ID       <int> 1, 6, 7, 8, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22,…
#> $ cong     <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum  <int> 4499, 4504, 4505, 4506, 4508, 4509, 4510, 4511, 4514, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major    <int> 18, 21, 15, 18, 18, 18, 16, 18, 3, 3, 18, 18, 18, 18, 1…
#> $ text     <chr> "To suspend temporarily the duty on Fast Magenta 2 Stag…
#> Observations: 1,334
#> Variables: 6
#> $ ID       <int> 2, 3, 4, 5, 9, 14, 15, 25, 26, 27, 34, 35, 39, 40, 45, …
#> $ cong     <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum  <int> 4500, 4501, 4502, 4503, 4507, 4512, 4513, 4523, 4524, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major    <int> 18, 18, 18, 5, 18, 12, 2, 18, 19, 18, 18, 18, 18, 18, 9…
#> $ text     <chr> "To suspend temporarily the duty on Fast Black 286 Stag…

# document-term matrix (DTM) built from the full congress data
congress_dtm <- congress %>%
  # tokenize the text column into a `word` column
  unnest_tokens(output = word, input = text) %>%
  # drop tokens that consist solely of digits
  filter(!str_detect(word, "^[0-9]*$")) %>%
  # remove stop words; the join key is spelled out so the result does not
  # depend on automatic column matching (and no "Joining, by" message is
  # printed)
  anti_join(stop_words, by = "word") %>%
  # stem the remaining words
  mutate(word = SnowballC::wordStem(word)) %>%
  # count occurrences of each stemmed token within each document (ID)
  count(ID, word) %>%
  # cast to a document-term matrix: documents are IDs, terms are stemmed
  # words, cell values are the counts in `n`
  cast_dtm(document = ID, term = word, value = n)

# document-term matrix (DTM) built from the training set only
congress_train_dtm <- congress_train %>%
  # tokenize the text column into a `word` column
  unnest_tokens(output = word, input = text) %>%
  # drop tokens that consist solely of digits
  filter(!str_detect(word, "^[0-9]*$")) %>%
  # remove stop words; the join key is spelled out so the result does not
  # depend on automatic column matching (and no "Joining, by" message is
  # printed)
  anti_join(stop_words, by = "word") %>%
  # stem the remaining words
  mutate(word = SnowballC::wordStem(word)) %>%
  # count occurrences of each stemmed token within each training document (ID)
  count(ID, word) %>%
  # cast to a document-term matrix: documents are IDs, terms are stemmed
  # words, cell values are the counts in `n`
  cast_dtm(document = ID, term = word, value = n)

# document-term matrix (DTM) built from the test set only
# NOTE(review): this DTM is built independently of congress_train_dtm, so its
# term columns are not guaranteed to match the training vocabulary -- confirm
# this is intended before using it with a model fit on the training DTM
congress_test_dtm <- congress_test %>%
  # tokenize the text column into a `word` column
  unnest_tokens(output = word, input = text) %>%
  # drop tokens that consist solely of digits
  filter(!str_detect(word, "^[0-9]*$")) %>%
  # remove stop words; the join key is spelled out so the result does not
  # depend on automatic column matching (and no "Joining, by" message is
  # printed)
  anti_join(stop_words, by = "word") %>%
  # stem the remaining words
  mutate(word = SnowballC::wordStem(word)) %>%
  # count occurrences of each stemmed token within each test document (ID)
  count(ID, word) %>%
  # cast to a document-term matrix: documents are IDs, terms are stemmed
  # words, cell values are the counts in `n`
  cast_dtm(document = ID, term = word, value = n)

# number of unique words in dictionary
#> [1] 4902
#> [1] 4183
#> [1] 2721

