Created
September 19, 2018 17:46
-
-
Save 8bitben/689175621c621ac4caea529363962f57 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import calendar | |
'''
CSV to CLF log file formatter
Ben Shepherd
Go Fish Digital
August 2018
'''
#Source CSV and destination log file, relative to the working directory
INPUT_CSV_PATH = 'client-logs.csv'
OUTPUT_LOG_PATH = 'client-logs-formatted.log'

#Placeholder written for fields the source data does not provide
MISSING_FIELD = '-'

#Offset appended to every timestamp; the source data carries no timezone
TIMESTAMP_OFFSET = '-0000'

#Combined Log Format field order: host, ident, user, time, request, status,
#bytes, referrer, user agent. The bytes field must be present (even as '-'),
#otherwise the referrer lands in the position parsers expect the size in.
LOG_LINE_TEMPLATE = (
    '{request_ip} {user_identifier} {user_id} {request_timestamp} '
    '"{http_request}" {http_status_code} {response_size} '
    '"{referrer}" "{user_agent}"\n'
)


def format_timestamp(date_value, time_value):
    '''
    Build a CLF-style timestamp such as [05/Aug/2018:12:34:56 -0000].

    date_value is split on '-' and read as year, month number, day;
    time_value is inserted unchanged.
    '''
    date_parts = date_value.split('-')
    year = date_parts[0]
    month = calendar.month_abbr[int(date_parts[1])]
    #CLF days are two digits; pad in case the source wrote e.g. 2018-8-5
    day = date_parts[2].zfill(2)
    return '[{}/{}/{}:{} {}]'.format(day, month, year, time_value, TIMESTAMP_OFFSET)


def format_request(method, uri, querystring):
    '''
    Build the quoted request portion of the log line: "METHOD URI[?QUERY]".

    The source marks "no query string" with '-'; an empty value is treated
    the same way so the request never ends in a dangling '?'.
    NOTE(review): the source rows expose no protocol column, so the request
    has no trailing HTTP version -- confirm downstream tools accept that.
    '''
    if querystring and querystring != MISSING_FIELD:
        return '{} {}?{}'.format(method, uri, querystring)
    return '{} {}'.format(method, uri)


def format_log_line(row):
    '''
    Convert one CSV row (a dict keyed by column name) into one log line.

    Reads the columns requestip, date, time, method, uri, querystring,
    status, referrer and useragent. Returns the formatted line including
    its trailing newline. Raises KeyError if a column is missing.
    '''
    log_fields = {
        'request_ip': row['requestip'],
        #No authentication is used in this project, so neither identity field applies
        'user_identifier': MISSING_FIELD,
        'user_id': MISSING_FIELD,
        #Date and time are separate columns in the source; combine and reformat
        'request_timestamp': format_timestamp(row['date'], row['time']),
        'http_request': format_request(row['method'], row['uri'], row['querystring']),
        'http_status_code': row['status'],
        #Response size is not available in the source data
        'response_size': MISSING_FIELD,
        #Referrer and user agent are Combined LF fields, not spec'd in Common LF
        'referrer': row['referrer'],
        #Spaces arrive double URL-encoded as %2520 in the user agent; restore them
        'user_agent': row['useragent'].replace('%2520', ' '),
    }
    return LOG_LINE_TEMPLATE.format(**log_fields)


def main():
    '''
    Read every row of the input CSV and write one formatted line per row.
    '''
    #Open the input first so a missing CSV does not truncate an existing output file
    with open(INPUT_CSV_PATH) as log_csv, open(OUTPUT_LOG_PATH, 'w') as log_out:
        for row in csv.DictReader(log_csv):
            log_line = format_log_line(row)
            #debug
            print(log_line)
            #write
            log_out.write(log_line)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment