Created
December 11, 2017 07:25
-
-
Save sureshgorakala/74cf69f7ebd48e487b82e939234b990a to your computer and use it in GitHub Desktop.
Search engine using SVD in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load required packages, installing any that are missing.
# FIX: the original repeated if(!require("pkg")) install.packages("pkg")
# six times; require() is discouraged for loading (it returns FALSE instead
# of erroring). A loop with requireNamespace() + library() is equivalent but
# fails loudly if a package still cannot be attached after installation.
pkgs <- c("readtext", "tm", "stringr", "qdap", "slam")
for (pkg in pkgs) {
  # requireNamespace() checks availability without attaching the package
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # character.only = TRUE lets library() take the package name as a string
  library(pkg, character.only = TRUE)
}
# -- Load the news documents -------------------------------------------------
# NOTE(review): hard-coded absolute Windows path makes the script
# non-portable; consider a relative path or a configurable data_dir variable.
setwd("C:\\Suresh\\Blog Posts\\textsimilarity\\ML_assignment\\Problem_statement_1\\data")
# data files are uploaded at below location:
# https://github.com/sureshgorakala/machinelearning/tree/master/data
# readtext() returns a data.frame: column 1 = doc_id, column 2 = text
news_docs <- readtext("*.txt")
# genX() (qdap) strips " [...]" bracketed spans from each document's text
news_list <- lapply(news_docs[, 2], function(x) genX(x, " [", "]"))
N.docs <- length(news_list)
names(news_list) <- news_docs[, 1]
setwd("C:\\Suresh\\Blog Posts\\textsimilarity\\ML_assignment\\Problem_statement_1")
# -- Load the search queries (one query per line in query.txt) ---------------
search_queries <- readtext("query.txt", dvsep = "\n")
queries_list <- unlist(strsplit(search_queries[1, 2], "\n"))
N.query <- length(queries_list)
# seq_len() is safe even when N.query is 0 (1:N.query would yield c(1, 0))
names(queries_list) <- paste0("query", seq_len(N.query))
#preprocess data news content
#append both content and search queries together, convert the lists to VectorSource
# Documents come first, then queries: columns 1..N.docs of the resulting
# term-document matrix are documents, columns (N.docs+1)..(N.docs+N.query)
# are the queries.
newscorpus = VectorSource(c(news_list,queries_list))
newscorpus$Names = c(names(news_list),names(queries_list))
#convert to corpus format
newscorpus_preproc = Corpus(newscorpus)
#cleaning the data
# NOTE: the tm_map order matters -- tolower must run before removeWords,
# because stopwords("english") are lower-case.
newscorpus_preproc = tm_map(newscorpus_preproc,stripWhitespace)
newscorpus_preproc = tm_map(newscorpus_preproc,removePunctuation)
newscorpus_preproc = tm_map(newscorpus_preproc,content_transformer(tolower))
newscorpus_preproc = tm_map(newscorpus_preproc,removeWords,stopwords("english"))
#create tdm using weighted tfidf weightage
# normalize = FALSE: raw tf-idf weights; length normalization is done
# explicitly with scale() below instead.
tdm = TermDocumentMatrix(newscorpus_preproc,control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
tdm_mat = as.matrix(tdm)
colnames(tdm_mat) = c(names(news_list),names(queries_list))
#normalizing the term document matrix
# Divide each column by its Euclidean norm so dot products between columns
# are cosine similarities.
tfidf_mat <- scale(tdm_mat, center = FALSE,scale = sqrt(colSums(tdm_mat^2)))
# -- Truncated SVD (LSA) -----------------------------------------------------
# Split the normalized tf-idf matrix into document and query columns.
# FIX: the original hard-coded 1:9 and 10:18; use N.docs / N.query (already
# computed above, and what the later code at the bottom of the file uses) so
# the script works for any number of documents and queries.
trainsvd <- tfidf_mat[, seq_len(N.docs)]
querytfidf <- tfidf_mat[, (N.docs + 1):(N.docs + N.query)]
# irlba computes a fast truncated SVD (by default nv = 5 singular vectors)
library(irlba)
decom2 <- irlba(trainsvd, maxit = 100)
D2 <- diag(decom2$d)
U2 <- decom2$u
V2 <- decom2$v
newTerms2 <- U2 %*% D2                 # term coordinates in latent space (terms x k)
newA2 <- t(V2)                         # document coordinates (k x N.docs)
newQ2 <- t(newTerms2) %*% querytfidf   # fold queries into latent space (k x N.query)
# NOTE(review): columns are not re-normalized after projection, so these are
# inner products in latent space rather than true cosine similarities --
# confirm that is the intent before ranking on the raw values.
cosinesim2 <- t(newQ2) %*% newA2       # query-vs-document similarity (N.query x N.docs)
# -- Alternative / experimental code (not executed) ---------------------------
# The original "commented out" these blocks by wrapping them in bare string
# literals ('...'), which is fragile: the text is still parsed, autoprints in
# interactive sessions, and any unbalanced quote inside breaks the whole
# script. Converted to real comments below; known bugs are annotated.
#
# Normal (full) SVD variant:
# decom = svd(trainsvd)
# newTerms = decom$u %*% diag(decom$d)
# newA = t(decom$v)
# newQ = t(newTerms) %*% querytfidf
# cosinesim = newA %*% t(newQ)
#
# querytfidf2 = t(tdm_mat)[10:18,]
# applying svd to tdf
# nu = nrow(trainsvd)
# nv = ncol(trainsvd)
# decom = svd(trainsvd2)
# cosinesim2 = querytfidf2 %*% decom$v
#
# Earlier irlba / lsa experiment:
# if(!require("irlba")) install.packages("irlba")
# library(irlba)
# decom2 = irlba(trainsvd, maxit = 100)
# cosinesim2 = querytfidf %*% decom2$v
# svddata = data.frame(decom$v)
# names(svddata) = rownames(tfidf_mat)
#   ^ BUG: decom$v has one row per DOCUMENT, but rownames(tfidf_mat) are the
#     TERMS; colnames(tfidf_mat)[1:N.docs] was presumably intended.
# if(!require("lsa")) install.packages("lsa")
# library(lsa)
# cosineSim = cosine(t(svddata))
# View(cosineSim)
# D = diag(decom$d)
# S = diag(decom$d^0.5)
# calculating document similarity:
# DocSim = S %*% t(decom$v)
# library(corrplot)
# o = corrplot(DocSim, method = "number")  # shows document similarity visually
#   ^ NOTE: DocSim is only defined in the commented line above -- this would
#     error if re-enabled without also re-enabling that definition.
# subsetting only the queries and their document-similarity values
# x = N.docs + 1
# y = N.query + N.query
#   ^ BUG: should be N.docs + N.query (last query column index).
# searchSimilarities = DocSim[x:y, 1:x-1]
#   ^ BUG: `1:x-1` parses as (1:x) - 1, i.e. 0:(x-1), due to operator
#     precedence (`:` binds tighter than `-`); write 1:(x-1) or seq_len(x - 1).
# o = corrplot(searchSimilarities, method = "number")
#
# Separating query tdm matrix and content tdm matrix:
# query.vectors <- tfidf_mat[, (N.docs + 1):(N.docs + N.query)]
# tfidf_mat <- tfidf_mat[, 1:N.docs]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment