Skip to content

Instantly share code, notes, and snippets.

@yangweigbh
Created August 17, 2020 15:19
Show Gist options
  • Save yangweigbh/feb7dfe52754e4398de30280501ca8ba to your computer and use it in GitHub Desktop.
Save yangweigbh/feb7dfe52754e4398de30280501ca8ba to your computer and use it in GitHub Desktop.
simple reverse index application that parse wikipedia abstract https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz
@file:DependsOn("com.github.ajalt:clikt:2.8.0")
@file:DependsOn("org.apache.opennlp:opennlp-tools:1.9.3")
import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.options.prompt
import opennlp.tools.stemmer.PorterStemmer
import org.xml.sax.Attributes
import org.xml.sax.helpers.DefaultHandler
import java.io.File
import java.util.concurrent.atomic.AtomicInteger
import java.util.regex.Pattern
import javax.xml.parsers.SAXParser
import javax.xml.parsers.SAXParserFactory
data class Doc(var title: String = "", var url: String = "", var abstract: String = "", val id: Int)
object ReverseIndexCommand : CliktCommand(name = "reverse-index") {
private val STOP_WORDS: Set<String> = setOf("a", "and", "be", "have", "i", "in", "of", "that", "the", "to")
private val xml: String by option(help="path to xml").prompt("wiki xml path")
private val query: String by option(help="query").prompt("query")
override fun run() {
val docMap = loadDocument(xml)
val reverseIndex = buildReverseIndex(docMap)
val ids = doSearch(reverseIndex, query)
for (id in ids) {
val doc = docMap[id]
println("${doc?.id}\t${doc?.abstract}")
}
}
private fun doSearch(reverseIndex: Map<String, MutableList<Int>>, query: String): List<Int> {
var res : MutableList<Int>? = null
val queryTokens = analyze(query)
for (qToken in queryTokens) {
res = if (res == null) {
reverseIndex[qToken]
} else {
intersection(res, reverseIndex[qToken])
}
}
return res ?: listOf()
}
private fun intersection(a: MutableList<Int>, b: MutableList<Int>?): MutableList<Int> {
val res: MutableList<Int> = mutableListOf()
if (b == null) return res
var i = 0
var j = 0
while (i < a.size && j < b.size) {
if (a[i] < b[j]) {
i++
} else if (a[i] > b[j]) {
j++
} else {
res.add(a[i])
i++
j++
}
}
return res
}
private fun loadDocument(path: String): Map<Int, Doc> {
val inputFile = File(path)
val factory = SAXParserFactory.newInstance()
val saxParser: SAXParser = factory.newSAXParser()
val handler = XMLHandler()
saxParser.parse(inputFile, handler)
return handler.docMap
}
class XMLHandler : DefaultHandler() {
private var inTitle: Boolean = false
private var inUrl: Boolean = false
private var inAbstract: Boolean = false
val docMap: MutableMap<Int, Doc> = mutableMapOf()
private var currentDoc: Doc? = null
private val idGenerator: AtomicInteger = AtomicInteger(0)
override fun endElement(uri: String?, localName: String?, qName: String?) {
super.endElement(uri, localName, qName)
if (qName != null) {
if (qName == "doc") {
currentDoc?.let { docMap.put(it.id, it) }
}
}
}
override fun characters(ch: CharArray?, start: Int, length: Int) {
super.characters(ch, start, length)
if (ch != null) {
if (inTitle) {
currentDoc?.title = String(ch, start, length)
inTitle = false
} else if (inUrl) {
currentDoc?.url = String(ch, start, length)
inUrl = false
} else if (inAbstract) {
currentDoc?.abstract = String(ch, start, length)
inAbstract = false
}
}
}
override fun startElement(uri: String?, localName: String?, qName: String?, attributes: Attributes?) {
super.startElement(uri, localName, qName, attributes)
if (qName != null) {
when (qName) {
"doc" -> {
currentDoc = Doc(id = idGenerator.getAndIncrement())
}
"title" -> {
inTitle = true
}
"url" -> {
inUrl = true
}
"abstract" -> {
inAbstract = true
}
}
}
}
}
private fun buildReverseIndex(docs: Map<Int, Doc>): Map<String, MutableList<Int>> {
val res: MutableMap<String, MutableList<Int>> = mutableMapOf()
for ((_, doc) in docs) {
val tokenList = analyze(doc.abstract)
for (token in tokenList) {
val docList = res.getOrDefault(token, mutableListOf())
if (docList.isNotEmpty() && docList[docList.size - 1] == doc.id) {
continue
}
docList.add(doc.id)
res[token] = docList
}
}
return res
}
private fun analyze(abstract: String): List<String> {
var tokens = tokenize(abstract)
tokens = lowerCaseFilter(tokens)
tokens = stopWordFilter(tokens)
tokens = stemmerFilter(tokens)
return tokens
}
private fun stemmerFilter(tokens: List<String>): List<String> {
val stemmer = PorterStemmer()
return tokens.map { stemmer.stem(it) }
}
private fun stopWordFilter(tokens: List<String>): List<String> {
return tokens.filter { !STOP_WORDS.contains(it) }
}
private fun lowerCaseFilter(tokens: List<String>): List<String> {
return tokens.map { it.toLowerCase() }
}
private fun tokenize(abstract: String): List<String> {
return abstract.split(Pattern.compile("\\W"))
}
}
ReverseIndexCommand.main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment