Created
November 11, 2015 02:31
-
-
Save sstults/715267e06c4d77c37e06 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python -x | |
import pysolr | |
import sys | |
from nltk.corpus import wordnet as wn | |
class Indexer:
    """Load WordNet noun synsets into Solr.

    Each noun synset becomes one Solr document holding the names of its
    hypernyms, hyponyms, member holonyms, and lemmas. Documents are buffered
    in memory and sent to Solr in batches of ``args.batch_size``.
    """

    def __init__(self, args):
        """Store the options and create the Solr client.

        Args:
            args: Object exposing ``url``, ``batch_size`` and ``delete``
                attributes (for example an ``argparse.Namespace``).
        """
        self.args = args
        self.doc_buffer = []
        self.doc_count = 0
        # The second positional parameter of pysolr.Solr is the decoder,
        # so the timeout has to be passed by keyword.
        self.solr = pysolr.Solr(args.url, timeout=100)

    def add_doc(self, solrdoc):
        """Buffer one document and flush once the batch size is reached.

        Args:
            solrdoc: Dictionary describing a single Solr document.
        """
        self.doc_buffer.append(solrdoc)
        self.doc_count += 1
        if len(self.doc_buffer) >= self.args.batch_size:
            self.post()  # post() also empties the buffer

    def post(self):
        """Send the buffered documents to Solr and report progress.

        Nothing is sent when the buffer is empty, which avoids an empty
        update request when the final flush lands on a batch boundary.
        """
        if self.doc_buffer:
            self.solr.add(self.doc_buffer)
            self.doc_buffer = []
        self.status()

    def status(self):
        """Write the running document count to stdout on a single line."""
        # The trailing carriage return lets the next update overwrite this one.
        sys.stdout.write("Added %d records\r" % self.doc_count)
        sys.stdout.flush()

    @staticmethod
    def make_doc_from_synset(synset):
        """Build the Solr document for one synset.

        Args:
            synset: WordNet synset providing ``name()``, ``hypernyms()``,
                ``hyponyms()``, ``member_holonyms()`` and ``lemmas()``.

        Returns:
            dict: The synset name under ``id`` plus lists of related names
            under ``hypernyms``, ``hyponyms``, ``holonyms`` and ``lemmas``.
        """
        return {
            'id': synset.name(),
            'hypernyms': [x.name() for x in synset.hypernyms()],
            'hyponyms': [x.name() for x in synset.hyponyms()],
            # Only member holonyms are indexed (not part/substance holonyms).
            'holonyms': [x.name() for x in synset.member_holonyms()],
            'lemmas': [x.name() for x in synset.lemmas()],
        }

    def load_all_synsets(self):
        """Index every WordNet noun synset, optionally clearing Solr first."""
        if self.args.delete:
            # delete()'s first positional parameter is a document id; the
            # match-all query must be passed as ``q`` to clear the index.
            self.solr.delete(q='*:*')
        for synset in wn.all_synsets('n'):
            self.add_doc(self.make_doc_from_synset(synset))
        self.post()  # add the remainder of docs
        # Commit explicitly: add()/delete() may not commit on their own,
        # depending on the pysolr version and client settings.
        self.solr.commit()
        print()
        print("Done")
if __name__ == "__main__":
    import argparse
    import textwrap

    # Usage example shown verbatim at the end of the --help output.
    usage_example = textwrap.dedent('''
Example: \n\n
./wordnet-indexer.py -d -b 1000
''')

    cli = argparse.ArgumentParser(
        prog='wordnet-indexer',
        description='Loads some WordNet into Solr',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_example,
    )
    cli.add_argument(
        '-u', '--url',
        action='store',
        default='http://localhost:8983/solr/gettingstarted',
        help='Solr base url',
    )
    cli.add_argument(
        '-d', '--delete',
        action='store_true',
        help='Delete all docs first',
    )
    cli.add_argument(
        '-b', '--batch_size',
        action='store',
        type=int,
        default=1000,
        help='Solr update batch size',
    )

    # Parse the command line, then index every synset with those options.
    options = cli.parse_args()
    indexer = Indexer(options)
    indexer.load_all_synsets()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment