Created
August 17, 2020 15:19
-
-
Save yangweigbh/feb7dfe52754e4398de30280501ca8ba to your computer and use it in GitHub Desktop.
simple reverse index application that parse wikipedia abstract https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@file:DependsOn("com.github.ajalt:clikt:2.8.0") | |
@file:DependsOn("org.apache.opennlp:opennlp-tools:1.9.3") | |
import com.github.ajalt.clikt.core.CliktCommand | |
import com.github.ajalt.clikt.parameters.options.option | |
import com.github.ajalt.clikt.parameters.options.prompt | |
import opennlp.tools.stemmer.PorterStemmer | |
import org.xml.sax.Attributes | |
import org.xml.sax.helpers.DefaultHandler | |
import java.io.File | |
import java.util.concurrent.atomic.AtomicInteger | |
import java.util.regex.Pattern | |
import javax.xml.parsers.SAXParser | |
import javax.xml.parsers.SAXParserFactory | |
data class Doc(var title: String = "", var url: String = "", var abstract: String = "", val id: Int) | |
object ReverseIndexCommand : CliktCommand(name = "reverse-index") { | |
private val STOP_WORDS: Set<String> = setOf("a", "and", "be", "have", "i", "in", "of", "that", "the", "to") | |
private val xml: String by option(help="path to xml").prompt("wiki xml path") | |
private val query: String by option(help="query").prompt("query") | |
override fun run() { | |
val docMap = loadDocument(xml) | |
val reverseIndex = buildReverseIndex(docMap) | |
val ids = doSearch(reverseIndex, query) | |
for (id in ids) { | |
val doc = docMap[id] | |
println("${doc?.id}\t${doc?.abstract}") | |
} | |
} | |
private fun doSearch(reverseIndex: Map<String, MutableList<Int>>, query: String): List<Int> { | |
var res : MutableList<Int>? = null | |
val queryTokens = analyze(query) | |
for (qToken in queryTokens) { | |
res = if (res == null) { | |
reverseIndex[qToken] | |
} else { | |
intersection(res, reverseIndex[qToken]) | |
} | |
} | |
return res ?: listOf() | |
} | |
private fun intersection(a: MutableList<Int>, b: MutableList<Int>?): MutableList<Int> { | |
val res: MutableList<Int> = mutableListOf() | |
if (b == null) return res | |
var i = 0 | |
var j = 0 | |
while (i < a.size && j < b.size) { | |
if (a[i] < b[j]) { | |
i++ | |
} else if (a[i] > b[j]) { | |
j++ | |
} else { | |
res.add(a[i]) | |
i++ | |
j++ | |
} | |
} | |
return res | |
} | |
private fun loadDocument(path: String): Map<Int, Doc> { | |
val inputFile = File(path) | |
val factory = SAXParserFactory.newInstance() | |
val saxParser: SAXParser = factory.newSAXParser() | |
val handler = XMLHandler() | |
saxParser.parse(inputFile, handler) | |
return handler.docMap | |
} | |
class XMLHandler : DefaultHandler() { | |
private var inTitle: Boolean = false | |
private var inUrl: Boolean = false | |
private var inAbstract: Boolean = false | |
val docMap: MutableMap<Int, Doc> = mutableMapOf() | |
private var currentDoc: Doc? = null | |
private val idGenerator: AtomicInteger = AtomicInteger(0) | |
override fun endElement(uri: String?, localName: String?, qName: String?) { | |
super.endElement(uri, localName, qName) | |
if (qName != null) { | |
if (qName == "doc") { | |
currentDoc?.let { docMap.put(it.id, it) } | |
} | |
} | |
} | |
override fun characters(ch: CharArray?, start: Int, length: Int) { | |
super.characters(ch, start, length) | |
if (ch != null) { | |
if (inTitle) { | |
currentDoc?.title = String(ch, start, length) | |
inTitle = false | |
} else if (inUrl) { | |
currentDoc?.url = String(ch, start, length) | |
inUrl = false | |
} else if (inAbstract) { | |
currentDoc?.abstract = String(ch, start, length) | |
inAbstract = false | |
} | |
} | |
} | |
override fun startElement(uri: String?, localName: String?, qName: String?, attributes: Attributes?) { | |
super.startElement(uri, localName, qName, attributes) | |
if (qName != null) { | |
when (qName) { | |
"doc" -> { | |
currentDoc = Doc(id = idGenerator.getAndIncrement()) | |
} | |
"title" -> { | |
inTitle = true | |
} | |
"url" -> { | |
inUrl = true | |
} | |
"abstract" -> { | |
inAbstract = true | |
} | |
} | |
} | |
} | |
} | |
private fun buildReverseIndex(docs: Map<Int, Doc>): Map<String, MutableList<Int>> { | |
val res: MutableMap<String, MutableList<Int>> = mutableMapOf() | |
for ((_, doc) in docs) { | |
val tokenList = analyze(doc.abstract) | |
for (token in tokenList) { | |
val docList = res.getOrDefault(token, mutableListOf()) | |
if (docList.isNotEmpty() && docList[docList.size - 1] == doc.id) { | |
continue | |
} | |
docList.add(doc.id) | |
res[token] = docList | |
} | |
} | |
return res | |
} | |
private fun analyze(abstract: String): List<String> { | |
var tokens = tokenize(abstract) | |
tokens = lowerCaseFilter(tokens) | |
tokens = stopWordFilter(tokens) | |
tokens = stemmerFilter(tokens) | |
return tokens | |
} | |
private fun stemmerFilter(tokens: List<String>): List<String> { | |
val stemmer = PorterStemmer() | |
return tokens.map { stemmer.stem(it) } | |
} | |
private fun stopWordFilter(tokens: List<String>): List<String> { | |
return tokens.filter { !STOP_WORDS.contains(it) } | |
} | |
private fun lowerCaseFilter(tokens: List<String>): List<String> { | |
return tokens.map { it.toLowerCase() } | |
} | |
private fun tokenize(abstract: String): List<String> { | |
return abstract.split(Pattern.compile("\\W")) | |
} | |
} | |
ReverseIndexCommand.main(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment