Last active
August 10, 2017 23:46
-
-
Save akshaybabloo/03e998ec97456bf187b58eff26123e0d to your computer and use it in GitHub Desktop.
NeuCube form submission XML parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xmltodict | |
import pandas as pd | |
import dateutil.parser | |
def format_date(dt):
    """
    Convert a date/time string into a ``D-M-Y`` date string.

    Parameters
    ----------
    dt: str
        Date and time text, in any format ``dateutil.parser.parse`` accepts.

    Returns
    -------
    str
        Day, month and year joined by hyphens (numbers are not zero-padded).
    """
    moment = dateutil.parser.parse(dt)
    day_month_year = (moment.day, moment.month, moment.year)
    return '{}-{}-{}'.format(*day_month_year)
def _as_list(node):
    """Return *node* as a list: ``[]`` for None, ``[node]`` for a lone item."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def parse(path, file_name):
    """
    Parse the KEDRI download submission form XML and write the relevant
    data to a CSV file.

    One CSV row is produced per ``<submission>`` element. The ``Date`` column
    comes from the submission's ``time`` attribute; the ``Name``,
    ``Company/Institute``, ``Email`` and ``Phone`` columns come from the
    ``<text_q>`` children whose ``name`` attribute matches. A field that is
    absent or has no text is written as an empty value. If a field is
    repeated inside one submission, the last occurrence wins. Rows are then
    de-duplicated on ``Name`` and the index is numbered from 1.

    Parameters
    ----------
    path: str
        Path of the XML file.
    file_name: str
        File name to be saved as CSV.

    Raises
    ------
    KeyError
        If the document has no ``submissions`` root element or a submission
        has no ``time`` attribute.
    """
    fields = ('Name', 'Company/Institute', 'Email', 'Phone')

    # The context manager closes the file even when parsing raises.
    with open(path) as file:
        content = xmltodict.parse(file.read())

    # An empty <submissions/> element is parsed as None.
    root = content['submissions'] or {}

    rows = []
    # xmltodict yields a single mapping (not a list) when an element occurs
    # only once, so both levels are normalised to lists before iterating.
    for submission in _as_list(root.get('submission')):
        # Start every row with all fields set to None so the columns stay
        # aligned even when a submission omits some of the questions.
        row = dict.fromkeys(fields)
        row['Date'] = format_date(submission['@time'])
        for text_q in _as_list(submission.get('text_q')):
            # A <text_q> without attributes is parsed as a plain string (or
            # None) and therefore cannot be matched to a field.
            if not isinstance(text_q, dict):
                continue
            name = text_q.get('@name')
            if name in fields:
                row[name] = text_q.get('#text')
        rows.append(row)

    df = pd.DataFrame(
        rows, columns=['Date', 'Name', 'Company/Institute', 'Phone', 'Email'])
    df = df.drop_duplicates(subset=['Name'])
    df.index += 1
    df.to_csv(file_name)
# Script entry point: convert the submission log in the working directory.
if __name__ == '__main__':
    parse(path='submission_log.xml', file_name='Kedri.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment