Created December 11, 2012 18:05
Denver debate analysis II
# Package setup.
# The caller's workspace is deliberately left untouched: clearing it here
# would silently delete unrelated objects whenever this script is sourced.
doInstall <- TRUE # Change to FALSE if you don't want packages installed.
toInstall <- c("zoo", "tm", "ggplot2", "lme4", "arm", "Snowball")
if (doInstall) {
  # Only fetch what is missing, so re-running the script does not reinstall
  # every package each time.
  missingPackages <- setdiff(toInstall, rownames(installed.packages()))
  if (length(missingPackages) > 0) {
    # NOTE(review): `repos` is an empty string here -- confirm the intended
    # CRAN mirror before relying on automatic installation.
    install.packages(missingPackages, repos = "")
  }
}
# Attach every package; invisible() suppresses the list that lapply() returns.
invisible(lapply(toInstall, library, character.only = TRUE))
# From:
# Read the raw transcript, one element per line of text.
Transcript <- readLines("")
head(Transcript, 20)
Transcript <- data.frame(Words = Transcript, Speaker = NA, stringsAsFactors = FALSE)
# Tag every line that contains a speaker label:
# 1 = LEHRER (moderator), 2 = OBAMA, 3 = ROMNEY.
Transcript$Speaker[grepl("LEHRER: ", Transcript$Words, fixed = TRUE)] <- 1
Transcript$Speaker[grepl("OBAMA: ", Transcript$Words, fixed = TRUE)] <- 2
Transcript$Speaker[grepl("ROMNEY: ", Transcript$Words, fixed = TRUE)] <- 3
# Carry the most recent label forward onto the untagged lines that follow it.
# na.rm = FALSE keeps the result the same length as the data frame even when
# the transcript starts with lines that precede the first speaker label
# (the default drops those leading NAs, which makes this assignment fail).
Transcript$Speaker <- na.locf(Transcript$Speaker, na.rm = FALSE)
# Remove moderator, along with any leading lines that no speaker was
# attributed to (their Speaker is still NA):
Transcript <- Transcript[!is.na(Transcript$Speaker) & Transcript$Speaker != 1, ]
# Build a corpus with one document per remaining transcript line. Only the
# text column is passed in, so the numeric Speaker code never becomes part of
# a document.
myCorpus <- Corpus(VectorSource(Transcript$Words))
# NOTE(review): newer tm releases expect content_transformer(tolower) here --
# confirm against the installed tm version.
myCorpus <- tm_map(myCorpus, tolower) # Make lowercase
myCorpus <- tm_map(myCorpus, removePunctuation, preserve_intra_word_dashes = FALSE)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english")) # Remove stopwords
# Drop the speaker labels themselves so they are not counted as terms.
myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
myCorpus <- tm_map(myCorpus, stemDocument) # Stem words
docTermMatrix <- DocumentTermMatrix(myCorpus)
# Convert to a dense documents-by-terms matrix. as.matrix() is used rather
# than inspect(), which is a printing function and writes the matrix to the
# console as a side effect.
docTermMatrix <- as.matrix(docTermMatrix)
# Per-term usage totals for each candidate: one row per term (column of the
# document-term matrix), with the summed counts over that speaker's lines.
termCountFrame <- data.frame(Term = colnames(docTermMatrix))
# drop = FALSE keeps the subset a matrix even when a speaker has exactly one
# line; otherwise it collapses to a vector and colSums() fails.
termCountFrame$Obama <- colSums(docTermMatrix[Transcript$Speaker == 2, , drop = FALSE])
termCountFrame$Romney <- colSums(docTermMatrix[Transcript$Speaker == 3, , drop = FALSE])
### New ###
# Expand the per-term counts into one row per individual term use: first all
# of Obama's uses, then all of Romney's. The term text is taken directly via
# as.character(), so the result is the same whether termCountFrame$Term is
# stored as a factor or as a character vector (looking terms up by position
# only works in the factor case).
tallCountFrame <- with(termCountFrame, data.frame(
  Term = c(rep(as.character(Term), Obama),
           rep(as.character(Term), Romney)),
  stringsAsFactors = FALSE))
# 0 for rows that came from Obama's counts, 1 for rows from Romney's; the
# block lengths are the two speakers' total term counts.
tallCountFrame$isRomney <- rep(c(0, 1),
                               colSums(termCountFrame[, c("Obama", "Romney")]))
# Binomial mixed model with a random intercept per term and no fixed effects.
# glmer() is lme4's entry point for generalized (non-Gaussian) models; passing
# `family` to lmer() is deprecated.
randomInterceptModel <- glmer(isRomney ~ (1 | Term) - 1,
                              family = "binomial", data = tallCountFrame)
# Convert the fitted model to plot-able data: one row per term.
# Term is kept as plain character here so that the count lookup below goes by
# term name; indexing with a factor would use its integer codes instead.
coefficientFrame <- data.frame(Term = rownames(coef(randomInterceptModel)$Term),
                               stringsAsFactors = FALSE)
# First column of the per-term coefficient table and of its standard errors.
coefficientFrame$Estimate <- coef(randomInterceptModel)$Term[, 1]
coefficientFrame$SE <- se.coef(randomInterceptModel)$Term[, 1]
# Total occurrences of each term across all retained transcript lines.
coefficientFrame$Count <- unname(colSums(docTermMatrix)[coefficientFrame$Term])
# plogis() maps the estimates through the inverse logit onto the probability
# scale: z0 is the point estimate, z_1/z1 are -/+ 1 SE, z_2/z2 are -/+ 2 SE.
coefficientFrame$z0 <- with(coefficientFrame, plogis(Estimate))
coefficientFrame$z_1 <- with(coefficientFrame, plogis(Estimate - SE))
coefficientFrame$z1 <- with(coefficientFrame, plogis(Estimate + SE))
coefficientFrame$z_2 <- with(coefficientFrame, plogis(Estimate - 2 * SE))
coefficientFrame$z2 <- with(coefficientFrame, plogis(Estimate + 2 * SE))
# Order the factor levels by estimate so terms are plotted in sorted order.
coefficientFrame$Term <- factor(coefficientFrame$Term,
  levels = coefficientFrame$Term[order(coefficientFrame$Estimate)])
# Smallest total count among the (up to) 100 most frequent terms; terms with
# at least this many occurrences are plotted.
cutoffCount <- head(tail(sort(colSums(docTermMatrix)), 100), 1)
# Horizontal range plot: thin bar = +/- 2 SE, thick bar = +/- 1 SE, white
# square = point estimate, all on the probability scale.
zp1 <- ggplot(
  coefficientFrame[coefficientFrame$Count >= cutoffCount, ],
  aes(x = Term, y = z0, ymin = z_2, ymax = z2)
) +
  geom_linerange(size = 1/2) +
  geom_linerange(aes(ymin = z_1, ymax = z1), size = 1) +
  geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9) +
  scale_y_continuous("Romney use probability", expand = c(0, 0)) +
  coord_flip() +
  ggtitle(paste0("p(Romney Said It | Term)\nterms that occur at least ",
                 cutoffCount, " times"))
So for this line:
Transcript <- readLines("")

I'm getting this error:
Error in file(con, "r") : cannot open the connection
In addition: Warning message:
In file(con, "r") : unsupported URL scheme

I've checked the modules to ensure they are installed and loaded. I've googled and tried adjusting the URL, plus I've CURLed the URL directly from the command line to ensure it works.

Do you have any other recommendations?


R version 2.15.2 (2012-10-26) -- "Trick or Treat"
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)

OK - it looks like this is the issue:
HTTPS is only supported on Windows, and only when R is started with the --internet2 command-line option.

So I solved this problem with a manual curl download; I'll annotate the change in my fork:

