Created
December 11, 2017 07:25
-
-
Save sureshgorakala/74cf69f7ebd48e487b82e939234b990a to your computer and use it in GitHub Desktop.
Search engine using SVD in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load required packages, installing any that are missing.
# FIX: the original repeated if(!require("pkg")) install.packages("pkg")
# six times; require() is discouraged for loading (it returns FALSE instead
# of erroring). A loop with requireNamespace() + library() is equivalent but
# fails loudly if a package still cannot be attached after installation.
pkgs <- c("readtext", "tm", "stringr", "qdap", "slam")
for (pkg in pkgs) {
  # requireNamespace() checks availability without attaching the package
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # character.only = TRUE lets library() take the package name as a string
  library(pkg, character.only = TRUE)
}
# -- Load the news documents -------------------------------------------------
# NOTE(review): hard-coded absolute Windows path makes the script
# non-portable; consider a relative path or a configurable data_dir variable.
setwd("C:\\Suresh\\Blog Posts\\textsimilarity\\ML_assignment\\Problem_statement_1\\data")
# data files are uploaded at below location:
# https://github.com/sureshgorakala/machinelearning/tree/master/data
# readtext() returns a data.frame: column 1 = doc_id, column 2 = text
news_docs <- readtext("*.txt")
# genX() (qdap) strips " [...]" bracketed spans from each document's text
news_list <- lapply(news_docs[, 2], function(x) genX(x, " [", "]"))
N.docs <- length(news_list)
names(news_list) <- news_docs[, 1]
setwd("C:\\Suresh\\Blog Posts\\textsimilarity\\ML_assignment\\Problem_statement_1")
# -- Load the search queries (one query per line in query.txt) ---------------
search_queries <- readtext("query.txt", dvsep = "\n")
queries_list <- unlist(strsplit(search_queries[1, 2], "\n"))
N.query <- length(queries_list)
# seq_len() is safe even when N.query is 0 (1:N.query would yield c(1, 0))
names(queries_list) <- paste0("query", seq_len(N.query))
#preprocess data news content
#append both content and search queries together, convert the lists to VectorSource
# Documents come first, then queries: columns 1..N.docs of the resulting
# term-document matrix are documents, columns (N.docs+1)..(N.docs+N.query)
# are the queries.
newscorpus = VectorSource(c(news_list,queries_list))
newscorpus$Names = c(names(news_list),names(queries_list))
#convert to corpus format
newscorpus_preproc = Corpus(newscorpus)
#cleaning the data
# NOTE: the tm_map order matters -- tolower must run before removeWords,
# because stopwords("english") are lower-case.
newscorpus_preproc = tm_map(newscorpus_preproc,stripWhitespace)
newscorpus_preproc = tm_map(newscorpus_preproc,removePunctuation)
newscorpus_preproc = tm_map(newscorpus_preproc,content_transformer(tolower))
newscorpus_preproc = tm_map(newscorpus_preproc,removeWords,stopwords("english"))
#create tdm using weighted tfidf weightage
# normalize = FALSE: raw tf-idf weights; length normalization is done
# explicitly with scale() below instead.
tdm = TermDocumentMatrix(newscorpus_preproc,control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
tdm_mat = as.matrix(tdm)
colnames(tdm_mat) = c(names(news_list),names(queries_list))
#normalizing the term document matrix
# Divide each column by its Euclidean norm so dot products between columns
# are cosine similarities.
tfidf_mat <- scale(tdm_mat, center = FALSE,scale = sqrt(colSums(tdm_mat^2)))
# -- Truncated SVD (LSA) -----------------------------------------------------
# Split the normalized tf-idf matrix into document and query columns.
# FIX: the original hard-coded 1:9 and 10:18; use N.docs / N.query (already
# computed above, and what the later code at the bottom of the file uses) so
# the script works for any number of documents and queries.
trainsvd <- tfidf_mat[, seq_len(N.docs)]
querytfidf <- tfidf_mat[, (N.docs + 1):(N.docs + N.query)]
# irlba computes a fast truncated SVD (by default nv = 5 singular vectors)
library(irlba)
decom2 <- irlba(trainsvd, maxit = 100)
D2 <- diag(decom2$d)
U2 <- decom2$u
V2 <- decom2$v
newTerms2 <- U2 %*% D2                 # term coordinates in latent space (terms x k)
newA2 <- t(V2)                         # document coordinates (k x N.docs)
newQ2 <- t(newTerms2) %*% querytfidf   # fold queries into latent space (k x N.query)
# NOTE(review): columns are not re-normalized after projection, so these are
# inner products in latent space rather than true cosine similarities --
# confirm that is the intent before ranking on the raw values.
cosinesim2 <- t(newQ2) %*% newA2       # query-vs-document similarity (N.query x N.docs)
# -- Alternative / experimental code (not executed) ---------------------------
# The original "commented out" these blocks by wrapping them in bare string
# literals ('...'), which is fragile: the text is still parsed, autoprints in
# interactive sessions, and any unbalanced quote inside breaks the whole
# script. Converted to real comments below; known bugs are annotated.
#
# Normal (full) SVD variant:
# decom = svd(trainsvd)
# newTerms = decom$u %*% diag(decom$d)
# newA = t(decom$v)
# newQ = t(newTerms) %*% querytfidf
# cosinesim = newA %*% t(newQ)
#
# querytfidf2 = t(tdm_mat)[10:18,]
# applying svd to tdf
# nu = nrow(trainsvd)
# nv = ncol(trainsvd)
# decom = svd(trainsvd2)
# cosinesim2 = querytfidf2 %*% decom$v
#
# Earlier irlba / lsa experiment:
# if(!require("irlba")) install.packages("irlba")
# library(irlba)
# decom2 = irlba(trainsvd, maxit = 100)
# cosinesim2 = querytfidf %*% decom2$v
# svddata = data.frame(decom$v)
# names(svddata) = rownames(tfidf_mat)
#   ^ BUG: decom$v has one row per DOCUMENT, but rownames(tfidf_mat) are the
#     TERMS; colnames(tfidf_mat)[1:N.docs] was presumably intended.
# if(!require("lsa")) install.packages("lsa")
# library(lsa)
# cosineSim = cosine(t(svddata))
# View(cosineSim)
# D = diag(decom$d)
# S = diag(decom$d^0.5)
# calculating document similarity:
# DocSim = S %*% t(decom$v)
# library(corrplot)
# o = corrplot(DocSim, method = "number")  # shows document similarity visually
#   ^ NOTE: DocSim is only defined in the commented line above -- this would
#     error if re-enabled without also re-enabling that definition.
# subsetting only the queries and their document-similarity values
# x = N.docs + 1
# y = N.query + N.query
#   ^ BUG: should be N.docs + N.query (last query column index).
# searchSimilarities = DocSim[x:y, 1:x-1]
#   ^ BUG: `1:x-1` parses as (1:x) - 1, i.e. 0:(x-1), due to operator
#     precedence (`:` binds tighter than `-`); write 1:(x-1) or seq_len(x - 1).
# o = corrplot(searchSimilarities, method = "number")
#
# Separating query tdm matrix and content tdm matrix:
# query.vectors <- tfidf_mat[, (N.docs + 1):(N.docs + N.query)]
# tfidf_mat <- tfidf_mat[, 1:N.docs]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment