Created
August 13, 2015 11:23
-
-
Save jadianes/7a6d0f132f1b5bcd9647 to your computer and use it in GitHub Desktop.
Sentiment Analyser Shiny app
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(shiny) | |
library(tm) | |
library(SnowballC) | |
library(randomForest) | |
options(mc.cores=1) | |
#' Train a sentiment classifier restricted to the vocabulary of new data
#'
#' Builds a document-term matrix from `new_data_df$Text`, keeps only the terms
#' that also appear as columns of the global `train_dtm_df`, and fits a random
#' forest on the global training data (`train_data_df` plus the matching
#' columns of `train_dtm_df`).
#'
#' @param new_data_df Data frame with a `Text` column holding the documents
#'   to be scored.
#' @param sparsity Maximal allowed sparsity, forwarded to
#'   `removeSparseTerms()`.
#' @return A list with two elements, in this order: `model` (the fitted
#'   `randomForest` object) and `new_dtm_df` (the term counts of the new
#'   documents, limited to the shared terms). Positional access via `[[1]]`
#'   and `[[2]]` keeps working.
build_model <- function(new_data_df, sparsity) {
  # Create new data corpus and apply the text clean-up steps
  new_corpus <- Corpus(VectorSource(new_data_df$Text))
  new_corpus <- tm_map(new_corpus, content_transformer(tolower))
  new_corpus <- tm_map(new_corpus, removePunctuation)
  new_corpus <- tm_map(new_corpus, removeWords, stopwords("english"))
  new_corpus <- tm_map(new_corpus, stripWhitespace)
  new_corpus <- tm_map(new_corpus, stemDocument)

  # Create document-term matrix and drop terms sparser than `sparsity`
  new_dtm <- DocumentTermMatrix(new_corpus)
  new_dtm <- removeSparseTerms(new_dtm, sparsity)
  new_dtm_df <- as.data.frame(as.matrix(new_dtm))
  colnames(new_dtm_df) <- make.names(colnames(new_dtm_df))

  # Intersect vocabularies: the model may only use terms present in both sets
  common_names <- intersect(colnames(train_dtm_df), colnames(new_dtm_df))
  if (length(common_names) == 0) {
    # Without this guard the formula `Sentiment ~ .` would have no predictors
    stop(
      "No terms are shared between the training data and the uploaded text; ",
      "cannot train a model.",
      call. = FALSE
    )
  }

  # Standard `[` subsetting instead of subset(): no non-standard evaluation,
  # and `drop = FALSE` keeps a data frame even when a single term remains.
  keep_new <- colnames(new_dtm_df) %in% common_names
  keep_train <- colnames(train_dtm_df) %in% common_names
  new_dtm_df <- new_dtm_df[, keep_new, drop = FALSE]
  model_train_data_df <- cbind(
    train_data_df,
    train_dtm_df[, keep_train, drop = FALSE]
  )
  # The raw text must not be used as a predictor
  model_train_data_df$Text <- NULL

  # Train classifier
  model <- randomForest(Sentiment ~ ., data = model_train_data_df, ntree = 50)

  # Return value as a list (named, same element order as before)
  list(model = model, new_dtm_df = new_dtm_df)
}
# Server logic: scores the uploaded texts and exposes them as a table
# (`contents`) and a density plot (`distribution`).
shinyServer(function(input, output) {
  # Scored upload: a data frame with the uploaded `Text` and the predicted
  # probability `Prob`. NULL until a file has been chosen.
  results <- reactive({
    inFile <- input$file1
    if (is.null(inFile)) {
      return(NULL)
    }

    # Load input data: one document per line, read into a single Text column
    new_data_df <- read.csv(
      inFile$datapath,
      sep = "\t",
      header = FALSE,
      quote = "",
      stringsAsFactors = FALSE,
      col.names = c("Text")
    )

    model_and_data <- build_model(new_data_df, input$sparsity)
    pred <- predict(
      model_and_data[[1]],
      newdata = model_and_data[[2]],
      type = "prob"
    )
    # Second column of the class-probability matrix, i.e. the second level of
    # the Sentiment factor (presumably the positive class -- verify against
    # the training data encoding).
    new_data_df$Prob <- pred[, 2]

    # Return data frame
    new_data_df
  })

  output$contents <- renderTable({
    results()
  })

  output$distribution <- renderPlot({
    scored <- results()
    # density() needs at least two observations to select a bandwidth, so
    # draw nothing for a missing or single-row upload instead of erroring.
    if (is.null(scored) || nrow(scored) < 2) {
      return(NULL)
    }

    # Density of the 0/1 indicator "probability exceeds the threshold"
    d <- density(as.numeric(scored$Prob > input$threshold))
    plot(
      d,
      xlim = c(0, 1),
      main = paste0("Sentiment Distribution (Prob > ", input$threshold, ")")
    )
    polygon(d, col = "lightgrey", border = "lightgrey")
    abline(v = input$threshold, col = "blue")
  })
})
# This is the init code, that will be run when the web app loads

# Load training data: tab-separated, no header, columns Sentiment and Text
train_data_df <- read.csv(
  file = "train_data.tsv",
  sep = "\t",
  quote = "",
  header = FALSE,
  stringsAsFactors = FALSE,
  col.names = c("Sentiment", "Text")
)
# The classifier needs a factor response to perform classification
train_data_df$Sentiment <- as.factor(train_data_df$Sentiment)

# Create training corpus for later re-use
train_corpus <- Corpus(VectorSource(train_data_df$Text))
train_corpus <- tm_map(train_corpus, content_transformer(tolower))
train_corpus <- tm_map(train_corpus, removePunctuation)
train_corpus <- tm_map(train_corpus, removeWords, stopwords("english"))
train_corpus <- tm_map(train_corpus, stripWhitespace)
train_corpus <- tm_map(train_corpus, stemDocument)

# Create document-term matrix, dropping terms sparser than 0.995
train_dtm <- DocumentTermMatrix(train_corpus)
train_dtm <- removeSparseTerms(train_dtm, 0.995)
# as.data.frame() keeps the term names untouched so that make.names() below
# is the only name normalisation applied, exactly as build_model() does for
# newly uploaded data; the two vocabularies are later matched by column name.
train_dtm_df <- as.data.frame(as.matrix(train_dtm))
colnames(train_dtm_df) <- make.names(colnames(train_dtm_df))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(shiny) | |
# Page layout: title, a sidebar with the upload and tuning controls next to
# the distribution plot, and a full-width table of results underneath.
shinyUI(
  fluidPage(
    # Application title
    headerPanel(title = "Text Sentiment Analyser"),

    sidebarLayout(
      # The control panel: file upload plus the two sliders
      sidebarPanel(
        fileInput(
          inputId = "file1",
          label = "Choose text File",
          accept = c(
            "text/tsv",
            "text/tab-separated-values,text/plain",
            ".tsv"
          )
        ),
        tags$hr(),
        sliderInput(
          inputId = "threshold",
          label = "Positive sentiment threshold",
          min = 0.1,
          max = 0.99,
          value = 0.5
        ),
        tags$hr(),
        sliderInput(
          inputId = "sparsity",
          label = "Max. term sparsity",
          min = 0.1,
          max = 0.99,
          value = 0.95
        )
      ),

      # Plot of the generated distribution
      mainPanel(
        plotOutput(outputId = "distribution")
      )
    ),

    tags$hr(),

    # The results detail panel, spanning the full page width
    fluidRow(
      column(
        width = 12,
        tableOutput(outputId = "contents")
      )
    )
  )
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment