@bensoltoff
Created March 11, 2019 02:04
library(tidyverse)
library(tidytext)
library(tm)
#> Loading required package: NLP
#> 
#> Attaching package: 'NLP'
#> The following object is masked from 'package:ggplot2':
#> 
#>     annotate

set.seed(123) # random number generator seed

# get USCongress data
data(USCongress, package = "RTextTools")
congress <- as_tibble(USCongress) %>%
  mutate(text = as.character(text))

# split congress into training/test sets
library(rsample)

congress_split <- initial_split(congress, prop = 0.7)
congress_train <- training(congress_split)
congress_test <- testing(congress_split)
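
The split above is a simple random sample. As an aside, `initial_split()` also accepts a `strata` argument, so a stratified variant (a sketch, assuming the `major` topic code is the outcome of interest) would keep the topic proportions similar across the two sets:

# stratified alternative: sample within each major topic code so the
# train/test topic distributions match the full data
congress_split_strat <- initial_split(congress, prop = 0.7, strata = major)
congress_train_strat <- training(congress_split_strat)
congress_test_strat  <- testing(congress_split_strat)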

glimpse(congress)
#> Observations: 4,449
#> Variables: 6
#> $ ID       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
#> $ cong     <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum  <int> 4499, 4500, 4501, 4502, 4503, 4504, 4505, 4506, 4507, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major    <int> 18, 18, 18, 18, 5, 21, 15, 18, 18, 18, 18, 16, 18, 12, …
#> $ text     <chr> "To suspend temporarily the duty on Fast Magenta 2 Stag…
glimpse(congress_train)
#> Observations: 3,115
#> Variables: 6
#> $ ID       <int> 1, 6, 7, 8, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22,…
#> $ cong     <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum  <int> 4499, 4504, 4505, 4506, 4508, 4509, 4510, 4511, 4514, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major    <int> 18, 21, 15, 18, 18, 18, 16, 18, 3, 3, 18, 18, 18, 18, 1…
#> $ text     <chr> "To suspend temporarily the duty on Fast Magenta 2 Stag…
glimpse(congress_test)
#> Observations: 1,334
#> Variables: 6
#> $ ID       <int> 2, 3, 4, 5, 9, 14, 15, 25, 26, 27, 34, 35, 39, 40, 45, …
#> $ cong     <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum  <int> 4500, 4501, 4502, 4503, 4507, 4512, 4513, 4523, 4524, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major    <int> 18, 18, 18, 5, 18, 12, 2, 18, 19, 18, 18, 18, 18, 18, 9…
#> $ text     <chr> "To suspend temporarily the duty on Fast Black 286 Stag…

# dtm of congress
congress_dtm <- congress %>%
  # tokenize
  unnest_tokens(output = word, input = text) %>%
  # remove numbers
  filter(!str_detect(word, "^[0-9]*$")) %>%
  # remove stop words
  anti_join(stop_words) %>%
  # stem the words
  mutate(word = SnowballC::wordStem(word)) %>%
  # DTM the entire corpus
  # get count of each token in each document
  count(ID, word) %>%
  # create a document-term matrix with all features and tf weighting
  cast_dtm(document = ID, term = word, value = n)
#> Joining, by = "word"
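
Printing a `DocumentTermMatrix` shows tm's summary of dimensions and sparsity, and `tidytext::tidy()` converts it back to a one-row-per-nonzero-entry tibble. A quick sanity check, not part of the original pipeline:

congress_dtm             # tm prints document/term counts and sparsity
tidy(congress_dtm) %>%   # back to a tidy (document, term, count) tibble
  arrange(desc(count))   # most frequent document-term pairs first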

# dtm of congress_train
congress_train_dtm <- congress_train %>%
  # tokenize
  unnest_tokens(output = word, input = text) %>%
  # remove numbers
  filter(!str_detect(word, "^[0-9]*$")) %>%
  # remove stop words
  anti_join(stop_words) %>%
  # stem the words
  mutate(word = SnowballC::wordStem(word)) %>%
  # DTM the training set
  # get count of each token in each document
  count(ID, word) %>%
  # create a document-term matrix with all features and tf weighting
  cast_dtm(document = ID, term = word, value = n)
#> Joining, by = "word"

# dtm of congress_test
congress_test_dtm <- congress_test %>%
  # tokenize
  unnest_tokens(output = word, input = text) %>%
  # remove numbers
  filter(!str_detect(word, "^[0-9]*$")) %>%
  # remove stop words
  anti_join(stop_words) %>%
  # stem the words
  mutate(word = SnowballC::wordStem(word)) %>%
  # DTM the test set
  # get count of each token in each document
  count(ID, word) %>%
  # create a document-term matrix with all features and tf weighting
  cast_dtm(document = ID, term = word, value = n)
#> Joining, by = "word"
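
The three blocks above repeat the same pipeline verbatim. A small helper removes the duplication (`make_dtm()` is a hypothetical name, not from the original); passing an explicit `by = "word"` to `anti_join()` also silences the `Joining` message:

make_dtm <- function(df) {
  df %>%
    # tokenize into one word per row
    unnest_tokens(output = word, input = text) %>%
    # remove numbers
    filter(!str_detect(word, "^[0-9]*$")) %>%
    # remove stop words (explicit key avoids the Joining message)
    anti_join(stop_words, by = "word") %>%
    # stem the words
    mutate(word = SnowballC::wordStem(word)) %>%
    # count each token in each document, then cast to a DTM
    count(ID, word) %>%
    cast_dtm(document = ID, term = word, value = n)
}

congress_dtm       <- make_dtm(congress)
congress_train_dtm <- make_dtm(congress_train)
congress_test_dtm  <- make_dtm(congress_test)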

# number of unique terms (columns) in each document-term matrix
ncol(congress_dtm)
#> [1] 4902
ncol(congress_train_dtm)
#> [1] 4183
ncol(congress_test_dtm)
#> [1] 2721
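
Note that the three vocabularies differ (4,902 vs. 4,183 vs. 2,721 terms), so a model fit on congress_train_dtm cannot score congress_test_dtm directly. One sketch of a fix, using tm's Terms() to restrict the test tokens to the training vocabulary; fully matching column sets would additionally require zero-padding training terms that never appear in the test documents:

train_vocab <- Terms(congress_train_dtm)

congress_test_dtm_matched <- congress_test %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!str_detect(word, "^[0-9]*$")) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = SnowballC::wordStem(word)) %>%
  # keep only tokens the training DTM knows about
  filter(word %in% train_vocab) %>%
  count(ID, word) %>%
  cast_dtm(document = ID, term = word, value = n)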

Created on 2019-03-10 by the reprex package (v0.2.1)

Session info
devtools::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 3.5.2 (2018-12-20)
#>  os       macOS Mojave 10.14.3        
#>  system   x86_64, darwin15.6.0        
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       America/Chicago             
#>  date     2019-03-10                  
#> 
#> ─ Packages ──────────────────────────────────────────────────────────────
#>  package     * version    date       lib source                          
#>  assertthat    0.2.0      2017-04-11 [2] CRAN (R 3.5.0)                  
#>  backports     1.1.3      2018-12-14 [2] CRAN (R 3.5.0)                  
#>  broom         0.5.1      2018-12-05 [2] CRAN (R 3.5.0)                  
#>  callr         3.1.1      2018-12-21 [2] CRAN (R 3.5.0)                  
#>  cellranger    1.1.0      2016-07-27 [2] CRAN (R 3.5.0)                  
#>  cli           1.0.1      2018-09-25 [1] CRAN (R 3.5.0)                  
#>  colorspace    1.4-0      2019-01-13 [2] CRAN (R 3.5.2)                  
#>  crayon        1.3.4      2017-09-16 [2] CRAN (R 3.5.0)                  
#>  desc          1.2.0      2018-05-01 [2] CRAN (R 3.5.0)                  
#>  devtools      2.0.1      2018-10-26 [1] CRAN (R 3.5.1)                  
#>  digest        0.6.18     2018-10-10 [1] CRAN (R 3.5.0)                  
#>  dplyr       * 0.8.0.1    2019-02-15 [1] CRAN (R 3.5.2)                  
#>  evaluate      0.13       2019-02-12 [2] CRAN (R 3.5.2)                  
#>  fansi         0.4.0      2018-10-05 [2] CRAN (R 3.5.0)                  
#>  forcats     * 0.4.0      2019-02-17 [2] CRAN (R 3.5.2)                  
#>  fs            1.2.6      2018-08-23 [1] CRAN (R 3.5.0)                  
#>  generics      0.0.2      2018-11-29 [1] CRAN (R 3.5.0)                  
#>  ggplot2     * 3.1.0      2018-10-25 [1] CRAN (R 3.5.0)                  
#>  glue          1.3.0      2018-07-17 [2] CRAN (R 3.5.0)                  
#>  gtable        0.2.0      2016-02-26 [2] CRAN (R 3.5.0)                  
#>  haven         2.1.0      2019-02-19 [2] CRAN (R 3.5.2)                  
#>  highr         0.7        2018-06-09 [2] CRAN (R 3.5.0)                  
#>  hms           0.4.2      2018-03-10 [2] CRAN (R 3.5.0)                  
#>  htmltools     0.3.6      2017-04-28 [1] CRAN (R 3.5.0)                  
#>  httr          1.4.0      2018-12-11 [2] CRAN (R 3.5.0)                  
#>  janeaustenr   0.1.5      2017-06-10 [2] CRAN (R 3.5.0)                  
#>  jsonlite      1.6        2018-12-07 [2] CRAN (R 3.5.0)                  
#>  knitr         1.21       2018-12-10 [2] CRAN (R 3.5.1)                  
#>  lattice       0.20-38    2018-11-04 [2] CRAN (R 3.5.2)                  
#>  lazyeval      0.2.1      2017-10-29 [2] CRAN (R 3.5.0)                  
#>  lubridate     1.7.4      2018-04-11 [2] CRAN (R 3.5.0)                  
#>  magrittr      1.5        2014-11-22 [2] CRAN (R 3.5.0)                  
#>  Matrix        1.2-15     2018-11-01 [2] CRAN (R 3.5.2)                  
#>  memoise       1.1.0      2017-04-21 [2] CRAN (R 3.5.0)                  
#>  modelr        0.1.4      2019-02-18 [2] CRAN (R 3.5.2)                  
#>  munsell       0.5.0      2018-06-12 [2] CRAN (R 3.5.0)                  
#>  nlme          3.1-137    2018-04-07 [2] CRAN (R 3.5.2)                  
#>  NLP         * 0.2-0      2018-10-18 [2] CRAN (R 3.5.0)                  
#>  pillar        1.3.1      2018-12-15 [2] CRAN (R 3.5.0)                  
#>  pkgbuild      1.0.2      2018-10-16 [1] CRAN (R 3.5.0)                  
#>  pkgconfig     2.0.2      2018-08-16 [2] CRAN (R 3.5.1)                  
#>  pkgload       1.0.2      2018-10-29 [1] CRAN (R 3.5.0)                  
#>  plyr          1.8.4      2016-06-08 [2] CRAN (R 3.5.0)                  
#>  prettyunits   1.0.2      2015-07-13 [2] CRAN (R 3.5.0)                  
#>  processx      3.2.1      2018-12-05 [2] CRAN (R 3.5.0)                  
#>  ps            1.3.0      2018-12-21 [2] CRAN (R 3.5.0)                  
#>  purrr       * 0.3.0      2019-01-27 [2] CRAN (R 3.5.2)                  
#>  R6            2.4.0      2019-02-14 [1] CRAN (R 3.5.2)                  
#>  Rcpp          1.0.0      2018-11-07 [1] CRAN (R 3.5.0)                  
#>  readr       * 1.3.1      2018-12-21 [2] CRAN (R 3.5.0)                  
#>  readxl        1.3.0      2019-02-15 [2] CRAN (R 3.5.2)                  
#>  remotes       2.0.2      2018-10-30 [1] CRAN (R 3.5.0)                  
#>  rlang         0.3.1      2019-01-08 [1] CRAN (R 3.5.2)                  
#>  rmarkdown     1.11       2018-12-08 [2] CRAN (R 3.5.0)                  
#>  rprojroot     1.3-2      2018-01-03 [2] CRAN (R 3.5.0)                  
#>  rsample     * 0.0.4      2019-01-07 [1] CRAN (R 3.5.2)                  
#>  rvest         0.3.2      2016-06-17 [2] CRAN (R 3.5.0)                  
#>  scales        1.0.0      2018-08-09 [1] CRAN (R 3.5.0)                  
#>  sessioninfo   1.1.1      2018-11-05 [1] CRAN (R 3.5.0)                  
#>  slam          0.1-44     2018-12-21 [1] CRAN (R 3.5.0)                  
#>  SnowballC     0.6.0      2019-01-15 [2] CRAN (R 3.5.2)                  
#>  stringi       1.3.1      2019-02-13 [1] CRAN (R 3.5.2)                  
#>  stringr     * 1.4.0      2019-02-10 [1] CRAN (R 3.5.2)                  
#>  testthat      2.0.1      2018-10-13 [2] CRAN (R 3.5.0)                  
#>  tibble      * 2.0.1      2019-01-12 [2] CRAN (R 3.5.2)                  
#>  tidyr       * 0.8.2.9000 2019-02-11 [1] Github (tidyverse/tidyr@0b27690)
#>  tidyselect    0.2.5      2018-10-11 [1] CRAN (R 3.5.0)                  
#>  tidytext    * 0.2.0      2018-10-17 [1] CRAN (R 3.5.0)                  
#>  tidyverse   * 1.2.1      2017-11-14 [2] CRAN (R 3.5.0)                  
#>  tm          * 0.7-6      2018-12-21 [2] CRAN (R 3.5.0)                  
#>  tokenizers    0.2.1      2018-03-29 [2] CRAN (R 3.5.0)                  
#>  usethis       1.4.0      2018-08-14 [1] CRAN (R 3.5.0)                  
#>  utf8          1.1.4      2018-05-24 [2] CRAN (R 3.5.0)                  
#>  withr         2.1.2      2018-03-15 [2] CRAN (R 3.5.0)                  
#>  xfun          0.5        2019-02-20 [1] CRAN (R 3.5.2)                  
#>  xml2          1.2.0      2018-01-24 [2] CRAN (R 3.5.0)                  
#>  yaml          2.2.0      2018-07-25 [2] CRAN (R 3.5.0)                  
#> 
#> [1] /Users/soltoffbc/Library/R/3.5/library
#> [2] /Library/Frameworks/R.framework/Versions/3.5/Resources/library