# chenpanliao · June 24, 2020 20:06
# diff --git a/PTT-R_Language_Praising.R b/PTT-R_Language_Praising.R
 # To the extent possible under law, Chen-Pan Liao has waived all 
 # copyright and related or neighboring rights to PTT-R_Language_Praising.R.
 # This work is published from: Taiwan.


 # Switch every locale category to the portable "C" locale; the original
 # author notes that this is needed for Windows users.
 Sys.setlocale(category = "LC_ALL", locale = "C")


 library("RCurl")
 library("XML")

 # Board whose index pages are scraped.
 board <- "R_Language"

 # Return the text of every node as a character vector with exactly one element
 # per node. xmlValue() can yield a zero-length result for a node without text;
 # such nodes become `default` instead of silently vanishing, so the vectors
 # taken from one page stay aligned with each other.
 node_text <- function(nodes, default = NA_character_) {
 	vapply(nodes, function(node) {
 		value <- xmlValue(node)
 		if (length(value) == 0) default else value
 	}, character(1), USE.NAMES = FALSE)
 }

 # Walk index1.html, index2.html, ... until a page URL stops existing.
 # `dat` collects one list(date, author, title, nrec, mark) per page.
 i <- 1
 st <- TRUE
 dat <- list()
 while (st) {
 	myurl <- paste0("https://www.ptt.cc/bbs/", board, "/index", i, ".html")
 	if (url.exists(myurl)) {
 		text <- getURL(myurl)
 		# NOTE(review): this fixed child path into the parsed page is fragile and
 		# presumably selects the main content container -- confirm against the
 		# live markup.
 		tree <- htmlTreeParse(text, asText = TRUE)$children$html[[2]][[2]][[2]]

 		date <- node_text(xpathApply(tree, "//div[@class='date']"))
 		author <- node_text(xpathApply(tree, "//div[@class='author']"))
 		# Read the whole title cell rather than only its <a> child: a cell without
 		# a link would otherwise contribute no title and shift every later title
 		# onto the wrong date/author.
 		title <- trimws(node_text(xpathApply(tree, "//div[@class='title']")))

 		# Empty counter cells count as 0; text that as.numeric() cannot parse
 		# becomes NA (with a coercion warning).
 		nrec <- as.numeric(node_text(xpathApply(tree, "//div[@class='nrec']"), "0"))

 		# An entry is marked when its mark cell holds any text at all.
 		mark <- vapply(
 			xpathApply(tree, "//div[@class='mark']"),
 			function(node) length(xmlValue(node)) > 0,
 			logical(1),
 			USE.NAMES = FALSE
 		)

 		# Every field must describe the same entries; refuse to continue with
 		# misaligned columns instead of letting data.frame() recycle or fail later.
 		sizes <- lengths(list(date, author, title, nrec, mark))
 		if (length(unique(sizes)) > 1) {
 			stop("Field counts differ on ", myurl, ": ",
 				paste(sizes, collapse = ", "), call. = FALSE)
 		}

 		dat[[i]] <- list(date, author, title, nrec, mark)
 		cat("Praising", myurl, "\n")
 		i <- i + 1
 	} else {
 		st <- FALSE
 	}
 }

 # Flatten the per-page lists collected in `dat` into one data frame with a row
 # per entry. Positions 1-5 of each page feed the columns date, author, title,
 # nrce and isMark, in that order (the `nrce` spelling is what the ranking code
 # further down reads, so it is kept).
 dat <- local({
 	pages <- dat
 	# Concatenate the k-th component of every page into a single vector.
 	gather <- function(k) unlist(lapply(pages, function(page) page[[k]]))
 	data.frame(
 		date = gather(1),
 		author = gather(2),
 		title = gather(3),
 		nrce = gather(4),
 		isMark = gather(5)
 	)
 })

 # Per-author rankings. head(x, 10) is used instead of x[1:10] so that fewer
 # than ten authors does not pad the output with NA entries.

 # Top 10 authors by number of posts
 n1 <- table(dat$author)
 head(sort(n1, decreasing = TRUE), 10)

 # Top 10 authors by total push count received.
 # na.rm = TRUE: a post whose counter is NA is left out of the total rather
 # than turning the author's whole sum into NA (which sort() would then drop).
 n2 <- tapply(dat$nrce, dat$author, sum, na.rm = TRUE)
 head(sort(n2, decreasing = TRUE), 10)

 # Top 10 authors by average push count per post
 n3 <- n2 / n1
 head(sort(n3, decreasing = TRUE), 10)

 # Top 10 authors by number of marked (M) posts
 n4 <- tapply(dat$isMark, dat$author, sum)
 head(sort(n4, decreasing = TRUE), 10)

 # Top 10 authors by proportion of their posts that are marked (M)
 n5 <- n4 / n1
 head(sort(n5, decreasing = TRUE), 10)
	# To the extent possible under law, Chen-Pan Liao has waived all
	# copyright and related or neighboring rights to PTT-R_Language_Praising.R.
	# This work is published from: Taiwan.


	# Switch every locale category to the portable "C" locale; the original
	# author notes that this is needed for Windows users.
	Sys.setlocale(category = "LC_ALL", locale = "C")


	library("RCurl")
	library("XML")

	# Board whose index pages are scraped.
	board <- "R_Language"

	# Return the text of every node as a character vector with exactly one element
	# per node. xmlValue() can yield a zero-length result for a node without text;
	# such nodes become `default` instead of silently vanishing, so the vectors
	# taken from one page stay aligned with each other.
	node_text <- function(nodes, default = NA_character_) {
		vapply(nodes, function(node) {
			value <- xmlValue(node)
			if (length(value) == 0) default else value
		}, character(1), USE.NAMES = FALSE)
	}

	# Walk index1.html, index2.html, ... until a page URL stops existing.
	# `dat` collects one list(date, author, title, nrec, mark) per page.
	i <- 1
	st <- TRUE
	dat <- list()
	while (st) {
		myurl <- paste0("https://www.ptt.cc/bbs/", board, "/index", i, ".html")
		if (url.exists(myurl)) {
			text <- getURL(myurl)
			# NOTE(review): this fixed child path into the parsed page is fragile and
			# presumably selects the main content container -- confirm against the
			# live markup.
			tree <- htmlTreeParse(text, asText = TRUE)$children$html[[2]][[2]][[2]]

			date <- node_text(xpathApply(tree, "//div[@class='date']"))
			author <- node_text(xpathApply(tree, "//div[@class='author']"))
			# Read the whole title cell rather than only its <a> child: a cell without
			# a link would otherwise contribute no title and shift every later title
			# onto the wrong date/author.
			title <- trimws(node_text(xpathApply(tree, "//div[@class='title']")))

			# Empty counter cells count as 0; text that as.numeric() cannot parse
			# becomes NA (with a coercion warning).
			nrec <- as.numeric(node_text(xpathApply(tree, "//div[@class='nrec']"), "0"))

			# An entry is marked when its mark cell holds any text at all.
			mark <- vapply(
				xpathApply(tree, "//div[@class='mark']"),
				function(node) length(xmlValue(node)) > 0,
				logical(1),
				USE.NAMES = FALSE
			)

			# Every field must describe the same entries; refuse to continue with
			# misaligned columns instead of letting data.frame() recycle or fail later.
			sizes <- lengths(list(date, author, title, nrec, mark))
			if (length(unique(sizes)) > 1) {
				stop("Field counts differ on ", myurl, ": ",
					paste(sizes, collapse = ", "), call. = FALSE)
			}

			dat[[i]] <- list(date, author, title, nrec, mark)
			cat("Praising", myurl, "\n")
			i <- i + 1
		} else {
			st <- FALSE
		}
	}

	# Flatten the per-page lists collected in `dat` into one data frame with a row
	# per entry. Positions 1-5 of each page feed the columns date, author, title,
	# nrce and isMark, in that order (the `nrce` spelling is what the ranking code
	# further down reads, so it is kept).
	dat <- local({
		pages <- dat
		# Concatenate the k-th component of every page into a single vector.
		gather <- function(k) unlist(lapply(pages, function(page) page[[k]]))
		data.frame(
			date = gather(1),
			author = gather(2),
			title = gather(3),
			nrce = gather(4),
			isMark = gather(5)
		)
	})

	# Per-author rankings. head(x, 10) is used instead of x[1:10] so that fewer
	# than ten authors does not pad the output with NA entries.

	# Top 10 authors by number of posts
	n1 <- table(dat$author)
	head(sort(n1, decreasing = TRUE), 10)

	# Top 10 authors by total push count received.
	# na.rm = TRUE: a post whose counter is NA is left out of the total rather
	# than turning the author's whole sum into NA (which sort() would then drop).
	n2 <- tapply(dat$nrce, dat$author, sum, na.rm = TRUE)
	head(sort(n2, decreasing = TRUE), 10)

	# Top 10 authors by average push count per post
	n3 <- n2 / n1
	head(sort(n3, decreasing = TRUE), 10)

	# Top 10 authors by number of marked (M) posts
	n4 <- tapply(dat$isMark, dat$author, sum)
	head(sort(n4, decreasing = TRUE), 10)

	# Top 10 authors by proportion of their posts that are marked (M)
	n5 <- n4 / n1
	head(sort(n5, decreasing = TRUE), 10)