@skywodd
Last active September 25, 2017 19:45
Incremental backup script for NGINX log files. Stores log entries in an SQLite3 database. Ignores existing entries using SHA-256.
"""
Incremential backup script for NGINX logs files.
"""
import argparse
import pdb
import re
import sqlite3
import sys
import traceback
from hashlib import sha256

# Nginx access log format regex (combined log format)
nginx_access_re = re.compile(r'(?P<remote_addr>[^\s]+)\s-\s(?P<remote_user>[^\s]+)\s\[(?P<time_local>[^\]]+)\]\s"(?P<request>[^\"]*)"\s(?P<status>[^\s]+)\s(?P<body_bytes_sent>[^\s]+)\s"(?P<http_referer>[^\"]*)"\s"(?P<http_user_agent>[^\"]*)"')

# Nginx local time format regex
datetime_re = re.compile(r'(?P<day>[0-9]{2})\/(?P<month>[A-Za-z]{3})\/(?P<year>[0-9]{4}):(?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2})\s(?P<timezone>[\+\-0-9]{5})')
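# Example (illustrative) of a combined-format line matched by nginx_access_re:
#   127.0.0.1 - - [25/Sep/2017:19:45:00 +0200] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0"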
# Nginx month labels
month_labels = {
'Jan': '01',
'Feb': '02',
'Mar': '03',
'Apr': '04',
'May': '05',
'Jun': '06',
'Jul': '07',
'Aug': '08',
'Sep': '09',
'Oct': '10',
'Nov': '11',
'Dec': '12',
}

def process_log(cursor, log_file, check_collision=False):
    """ Read a log file line by line and store its content in the database. """
    # Open the log file
    print('Opening "{}"...'.format(log_file))
    with open(log_file) as fi:
        counter = 0
        ignored = 0
        inserted = 0
        # Process each line
        print('Processing log entries (may take some time)...')
        for line in fi:
            # Compute the line footprint (SHA-256 digest of the raw line)
            hasher = sha256()
            hasher.update(line.encode())
            footprint = hasher.digest()
            # Parse the log line
            m = nginx_access_re.match(line)
            if m is None:
                print('Error: Cannot parse line:')
                print(line)
                input('>>> Press enter to continue')
                continue
            # Unpack values
            (
                remote_addr,
                remote_user,
                time_local,
                request,
                status,
                body_bytes_sent,
                http_referer,
                http_user_agent,
            ) = m.groups()
            # Fix missing values ('-' placeholders)
            if remote_user == '-':
                remote_user = ''
            if http_referer == '-':
                http_referer = ''
            if http_user_agent == '-':
                http_user_agent = ''
            # Unpack the request into method and path
            parts = request.split()
            if len(parts) >= 2:
                request_cmd, request_path, *_ = parts
            else:
                request_cmd = ''
                request_path = request
            # Parse the datetime into an ISO-8601 friendly format
            day, month, year, hours, minutes, seconds, timezone = datetime_re.match(time_local).groups()
            timezone = timezone[:3] + ':' + timezone[-2:]
            time_local = '%s-%s-%s %s:%s:%s%s' % (year, month_labels[month], day, hours, minutes, seconds, timezone)
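            # e.g. (illustrative) '25/Sep/2017:19:45:00 +0200' becomes '2017-09-25 19:45:00+02:00'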
            # Store everything (duplicates are skipped by the UNIQUE constraint on footprint)
            cursor.execute("INSERT OR IGNORE INTO logs VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                           (footprint, remote_addr, remote_user, time_local, request_cmd, request_path,
                            status, body_bytes_sent, http_referer, http_user_agent))
            if cursor.rowcount:
                inserted += 1
            else:
                # Detect SHA-256 collision
                if check_collision:
                    cursor.execute("SELECT * FROM logs WHERE footprint=?", (footprint,))
                    row = cursor.fetchone()
                    if row['remote_addr'] != remote_addr \
                            or row['remote_user'] != remote_user \
                            or row['time_local'] != time_local \
                            or row['request_cmd'] != request_cmd \
                            or row['request_path'] != request_path \
                            or str(row['status']) != status \
                            or str(row['body_bytes_sent']) != body_bytes_sent \
                            or row['http_referer'] != http_referer \
                            or row['http_user_agent'] != http_user_agent:
                        print('Warning: collision found!')
                        print('Expected:\n',
                              footprint, remote_addr, remote_user, time_local,
                              request_cmd, request_path, status, body_bytes_sent,
                              http_referer, http_user_agent)
                        print('Got:\n',
                              row['footprint'], row['remote_addr'], row['remote_user'], row['time_local'],
                              row['request_cmd'], row['request_path'], row['status'], row['body_bytes_sent'],
                              row['http_referer'], row['http_user_agent'])
                        input('>>> Press enter to continue')
                ignored += 1
            # Update the progress counter
            counter += 1
            if counter % 1024 == 0:
                print('.', end='', flush=True)
        # End stats
        print()
        print('Total: {} entries processed, {} inserted, {} ignored.'.format(counter, inserted, ignored))
        return counter, inserted, ignored

def process_logs(database_file, *log_files, check_collision=False):
    """ Process each log file and store its entries in the database. """
    # Open the database
    print('Opening database file...')
    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    # Create the table (the UNIQUE footprint column is what makes the backup incremental)
    print('Creating database schema...')
    c.execute(
        '''CREATE TABLE IF NOT EXISTS logs (
               footprint BLOB UNIQUE,
               remote_addr TEXT,
               remote_user TEXT,
               time_local TEXT,
               request_cmd TEXT,
               request_path TEXT,
               status INTEGER,
               body_bytes_sent INTEGER,
               http_referer TEXT,
               http_user_agent TEXT)'''
    )
    # Process each file
    counter = inserted = ignored = 0
    for log_file in log_files:
        _counter, _inserted, _ignored = process_log(c, log_file, check_collision)
        counter += _counter
        inserted += _inserted
        ignored += _ignored
    print('Processing done.')
    print('Grand total: {} entries processed, {} inserted, {} ignored.'.format(counter, inserted, ignored))
    # Save the changes
    print('Saving the database...')
    conn.commit()
    # Close the connection
    conn.close()

# Main entry point
if __name__ == '__main__':
    # CLI interface
    parser = argparse.ArgumentParser(description='Update the web logs database.',
                                     epilog='N.B. Duplicate log entries are ignored.')
    parser.add_argument('database', metavar='DATABASE',
                        help='The SQLite database file path to work with.')
    parser.add_argument('log_files', metavar='FILE', nargs='+',
                        help='The path of each log file to be processed.')
    parser.add_argument('-c', '--check-collision', action='store_true',
                        help='Enable full collision checking for existing entries in the database.')
    args = parser.parse_args()
    # Run the program, dropping into the debugger on any error
    try:
        process_logs(args.database, *args.log_files,
                     check_collision=args.check_collision)
    except Exception:
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])
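
# Example follow-up (illustrative; 'access.db' is a placeholder name): once populated,
# the database can be queried with plain SQL, e.g. to count entries per status code:
#   import sqlite3
#   conn = sqlite3.connect('access.db')
#   for status, count in conn.execute('SELECT status, COUNT(*) FROM logs GROUP BY status'):
#       print(status, count)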