Skip to content

Instantly share code, notes, and snippets.

@hobbes3
Last active April 18, 2019 22:44
Show Gist options
  • Save hobbes3/ad162289be8e6368efd94f751eabb212 to your computer and use it in GitHub Desktop.
Save hobbes3/ad162289be8e6368efd94f751eabb212 to your computer and use it in GitHub Desktop.
irs 990 add oneshot onboard multithread multithreading concurrent splunk4good nom on
#!/usr/bin/env python
# hobbes3
# A way to handle indexing 2+ million XML files in a single directory (synced from an S3 bucket)
import glob
import time
import logging
import logging.handlers
import subprocess
import os
from multiprocessing.dummy import Pool as ThreadPool
# Reference point for the elapsed-time report printed at the end of the script.
start_time = time.time()

# Number of worker threads handed to the thread pool.
THREADS = 16
# Glob pattern selecting the XML files to index.
FILES = "/home/splunk/irs_990/data/*.xml"
# Write-only handle on the null device; used as the stdout of every splunk
# child process so their output is discarded.
FNULL = open(os.devnull, 'w')

# Console handler whose format includes the emitting thread's name, so
# messages from the different workers can be told apart.
ch = logging.StreamHandler()
ch.setFormatter(
    logging.Formatter(fmt="[%(levelname)s] (%(threadName)-10s) %(message)s")
)

logger = logging.getLogger('logger_debug')
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)

# Expand the pattern once, up front, into the full list of paths to process.
forms = glob.glob(FILES)
def nom_on(form):
    """Run the splunk CLI on one file and wait for it to finish.

    Executes ``/opt/splunk/bin/splunk nom on <form> -index irs_990
    -sourcetype irs_990`` with the child's stdout and stderr sent to FNULL.
    NOTE(review): ``nom on`` is presumably the CLI's alias for
    ``add oneshot`` -- confirm against the Splunk version in use.

    Args:
        form: Path of the file to hand to splunk.

    Returns:
        The exit status of the splunk process (0 on success).
    """
    logger.info("Indexing %s", form)
    # Build argv as a list rather than splitting an interpolated string, so
    # a path containing whitespace stays a single argument.
    command = [
        "/opt/splunk/bin/splunk", "nom", "on", form,
        "-index", "irs_990",
        "-sourcetype", "irs_990",
    ]
    process = subprocess.Popen(command, stdout=FNULL, stderr=subprocess.STDOUT)
    returncode = process.wait()
    # The child's output is discarded, so the exit status is the only signal
    # that something went wrong; surface it instead of dropping it.
    if returncode != 0:
        logger.warning("splunk exited with status %s for %s", returncode, form)
    return returncode
# Fan the files out over a pool of worker threads; each worker blocks on its
# own splunk child process, so up to THREADS invocations run concurrently.
# http://stackoverflow.com/a/28463266/1150923
pool = ThreadPool(THREADS)
# map() blocks until every file has been handed to nom_on and processed.
results = pool.map(nom_on, forms)
pool.close()
pool.join()
# Parenthesised single-argument form: prints the same text under the Python 2
# print statement and the Python 3 print function.
print("--- %s seconds ---" % (time.time() - start_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment