-
-
Save amrrs/2a7a3d6d2e45217db2a9905e22fe07d7 to your computer and use it in GitHub Desktop.
find and extract emoji in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# save this to '_chat.txt' (it requires a login)
# https://www.kaggle.com/sarthaknautiyal/whatsappsample | |
library(ore) | |
library(dplyr) | |
# Emoji lookup table. It is cached in the working directory under the
# URL's own file name, so the download only happens when the file is absent.
emoji_src <- "https://raw.githubusercontent.com/laurenancona/twimoji/gh-pages/twitterEmojiProject/emoticon_conversion_noGraphic.csv"
emoji_fil <- basename(emoji_src)
if (!file.exists(emoji_fil)) {
  download.file(emoji_src, emoji_fil)
}

# Read without a header row, so the columns get the default names V1, V2, ...
emoji <- read.csv(emoji_fil, header = FALSE, stringsAsFactors = FALSE)

# A single capture group that alternates over every entry of column V2.
emoji_regex <- sprintf("(%s)", paste(emoji[["V2"]], collapse = "|"))
compiled <- ore(emoji_regex)

# The chat export is expected next to this script as "_chat.txt".
chat <- readLines("_chat.txt", encoding = "UTF-8", warn = FALSE)

# Print the indices of the chat lines matched by the pattern (byte-wise).
grep(emoji_regex, chat, useBytes = TRUE)
## [1] 8 9 10 11 13 19 20 22 23 62 65 69 73 74 75 82 83 84 87 88 90 91 | |
## [23] 92 93 94 95 107 108 114 115 117 119 122 123 124 125 130 135 139 140 141 142 143 144 | |
## [45] 146 147 150 151 153 157 159 161 162 166 169 171 174 177 178 183 184 189 191 192 195 196 | |
## [67] 199 200 202 206 207 209 220 221 223 224 225 226 228 229 234 235 238 239 242 244 246 247 | |
## [89] 248 249 250 251 253 259 260 262 263 265 274 275 280 281 282 286 287 288 291 292 293 296 | |
## [111] 302 304 305 307 334 335 343 346 348 351 354 355 356 358 361 362 382 389 390 391 396 397 | |
## [133] 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 | |
## [155] 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 442 451 452 | |
## [177] 454 459 463 465 466 469 471 472 473 474 475 479 482 484 485 486 488 490 492 493 496 503 | |
## [199] 505 506 507 509 517 518 519 525 526 527 528 531 535 540 543 545 548 549 557 558 559 560 | |
## [221] 566 567 571 572 573 574 576 577 578 580 587 589 591 592 594 597 600 601 603 608 609 625 | |
## [243] 626 627 637 638 639 640 641 643 645 749 757 764 | |
# Keep only the chat lines matched by the emoji pattern. Logical indexing is
# sufficient: grepl() yields TRUE/FALSE for every element, so the extra
# which() wrapper is unnecessary.
chat_emoji_lines <- chat[grepl(emoji_regex, chat, useBytes = TRUE)]

# Search each retained line for every occurrence (all = TRUE), not only the
# first one.
found_emoji <- ore.search(compiled, chat_emoji_lines, all = TRUE)

# Qualify matches() with its namespace: dplyr is attached after ore and
# re-exports the tidyselect helper matches(), which masks ore::matches() on
# the search path.
emoji_matches <- ore::matches(found_emoji)

# Show only the first level of the resulting structure.
str(emoji_matches, 1)
## List of 254 | |
## $ : chr [1:4] "\U0001f600" "\U0001f600" "\U0001f44d" "\U0001f44d" | |
## $ : chr "\U0001f648" | |
## $ : chr [1:2] "\U0001f44d" "\U0001f44d" | |
## $ : chr "\U0001f602" | |
## $ : chr [1:3] "\U0001f602" "\U0001f602" "\U0001f602" | |
## $ : chr [1:4] "\U0001f44c" "\U0001f44c" "\U0001f44c" "\U0001f44c" | |
## $ : chr [1:6] "\U0001f602" "\U0001f602" "\U0001f602" "\U0001f602" ... | |
## $ : chr "\U0001f600" | |
## $ : chr [1:5] "\U0001f604" "\U0001f604" "\U0001f604" "\U0001f603" ... | |
## $ : chr "\U0001f44d" | |
## ... | |
# Count how often each emoji description (V3) occurs in the chat.
#
# Every matched emoji is turned into escaped-byte text, i.e. each raw byte is
# rendered as an upper-case hex pair prefixed with "\x" and the pairs are
# concatenated. That text is the join key against column V2 of `emoji`
# (presumably V2 stores the emoji in this same form -- confirm against the
# CSV).
#
# Base unlist()/vapply() replace the purrr helpers (flatten_chr, map,
# map_chr): purrr is never attached by this script, so those calls fail with
# "could not find function". tibble() replaces the deprecated data_frame().
tibble(
  V2 = vapply(
    unlist(emoji_matches, use.names = FALSE),
    function(ch) {
      hex_bytes <- toupper(as.character(charToRaw(ch)))
      paste0(sprintf("\\x%s", hex_bytes), collapse = "")
    },
    character(1),
    USE.NAMES = FALSE
  )
) %>%
  # V2 is the only column on the left-hand side, so naming it keeps the join
  # identical while making the key explicit and silencing the "Joining, by"
  # message.
  left_join(emoji, by = "V2") %>%
  count(V3, sort = TRUE)
## # A tibble: 89 x 2 | |
## V3 n | |
## <chr> <int> | |
## 1 face with tears of joy 110 | |
## 2 smiling face with smiling eyes 50 | |
## 3 face with stuck-out tongue and winking eye 43 | |
## 4 musical note 42 | |
## 5 birthday cake 35 | |
## 6 grinning face with smiling eyes 26 | |
## 7 face with stuck-out tongue and tightly-closed eyes 24 | |
## 8 grinning face 21 | |
## 9 bouquet 17 | |
## 10 thumbs up sign 17 | |
## # ... with 79 more rows |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment