Last active
June 24, 2020 20:06
-
-
Save chenpanliao/789f7984003719ea4bcc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To the extent possible under law, Chen-Pan Liao has waived all
# copyright and related or neighboring rights to PTT-R_Language_Praising.R.
# This work is published from: Taiwan.
Sys.setlocale(locale = "C") # for Windows users
library("RCurl")
library("XML")

board <- "R_Language"

# Text content of one parsed node. A node without any text yields "" whether
# xmlValue() reports that as NULL, a zero-length vector, NA or "".
node_text <- function(node) {
  value <- xmlValue(node)
  if (length(value) == 0L || is.na(value)) "" else value
}

# Text of every <div class="class_name"> found below `tree`, one string per
# matching node.
div_text <- function(tree, class_name) {
  nodes <- xpathApply(tree, paste0("//div[@class='", class_name, "']"))
  vapply(nodes, node_text, character(1), USE.NAMES = FALSE)
}

# Text of the <a> child of a title node, or NA when the title has no link.
link_text <- function(title_node) {
  link <- xmlChildren(title_node)[["a"]]
  if (is.null(link)) NA_character_ else node_text(link)
}

# Convert push-count strings to numbers. An empty string means no count was
# shown and becomes 0. Text that is not a number becomes NA without raising
# a coercion warning.
# NOTE(review): the board presumably shows non-numeric markers for very
# high or negative counts -- confirm before mapping them to numbers.
parse_nrec <- function(value) {
  count <- suppressWarnings(as.numeric(value))
  count[!nzchar(trimws(value))] <- 0
  count
}

# Extract the five per-entry fields from one parsed index page.
#
# Returns an unnamed list holding, by position: date, author, title,
# push count (numeric) and mark flag (logical). All five have equal length.
# Entries whose title has no link are dropped from every field so the
# fields stay aligned (presumably these are deleted posts -- confirm).
parse_index_page <- function(tree) {
  title_nodes <- xpathApply(tree, "//div[@class='title']")
  fields <- list(
    div_text(tree, "date"),
    div_text(tree, "author"),
    vapply(title_nodes, link_text, character(1), USE.NAMES = FALSE),
    parse_nrec(div_text(tree, "nrec")),
    nzchar(trimws(div_text(tree, "mark")))
  )

  # Every entry is expected to carry each field exactly once; anything else
  # would silently misalign the columns later on.
  field_lengths <- vapply(fields, length, integer(1))
  if (length(unique(field_lengths)) != 1L) {
    stop(
      "Index page fields have differing lengths: ",
      paste(field_lengths, collapse = ", "),
      call. = FALSE
    )
  }

  has_link <- !is.na(fields[[3]])
  lapply(fields, function(field) field[has_link])
}

# Download index1.html, index2.html, ... of `board` until the first index
# page that does not exist, and return one parse_index_page() result per
# page. Prints a progress line for every page fetched.
scrape_board <- function(board) {
  pages <- list()
  page_no <- 1
  repeat {
    page_url <- paste0(
      "https://www.ptt.cc/bbs/", board, "/index", page_no, ".html"
    )
    if (!url.exists(page_url)) {
      break
    }
    html <- getURL(page_url)
    # Positional path into the parsed document, kept from the original
    # script; it depends on the site's current page layout.
    tree <- htmlTreeParse(html, asText = TRUE)$children$html[[2]][[2]][[2]]
    pages[[page_no]] <- parse_index_page(tree)
    cat("Praising", page_url, "\n")
    page_no <- page_no + 1
  }
  pages
}

dat <- scrape_board(board)
# Flatten the per-page results into a single data frame, one row per scraped
# entry. Each element of `dat` is a list holding, by position: date, author,
# title, push count and mark flag. The column name "nrce" (sic) is kept as
# spelled because the ranking code reads `dat$nrce`.
dat <- local({
  columns <- lapply(seq_len(5), function(k) {
    unlist(lapply(dat, function(page) page[[k]]))
  })
  names(columns) <- c("date", "author", "title", "nrce", "isMark")
  do.call(data.frame, columns)
})
# Rankings per author. head(, 10) is used instead of [1:10] so that a board
# with fewer than ten authors does not produce padded NA entries.

# Top 10 authors by number of posts.
n1 <- table(dat$author)
head(sort(n1, decreasing = TRUE), 10)

# Top 10 authors by total pushes received. Push counts that could not be
# read as numbers are NA; na.rm = TRUE keeps those authors in the ranking
# (sort() would otherwise silently drop their NA totals).
n2 <- tapply(dat$nrce, dat$author, sum, na.rm = TRUE)
head(sort(n2, decreasing = TRUE), 10)

# Top 10 authors by average pushes per post.
n3 <- n2 / n1
head(sort(n3, decreasing = TRUE), 10)

# Top 10 authors by number of marked (M) posts.
n4 <- tapply(dat$isMark, dat$author, sum)
head(sort(n4, decreasing = TRUE), 10)

# Top 10 authors by proportion of their posts that are marked.
n5 <- n4 / n1
head(sort(n5, decreasing = TRUE), 10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment