Created
September 9, 2019 13:07
-
-
Save belyaev-pa/3287a384e2078bf7622cd53bd1e0dc77 to your computer and use it in GitHub Desktop.
Test task by Beliaev for Polymedia company
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import urllib.request | |
import os | |
import gzip | |
import lxml.etree | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import time | |
from collections import defaultdict | |
# Local path where the decompressed XML dump is written and later parsed.
XML_TMP = '/home/pavel/Загрузки/dtb/dblp.xml'
# Path of the PNG chart saved by plot_png.
OUTPUT_FILE = '/home/pavel/Загрузки/dtb/myplot.png'
# URL of the gzip-compressed XML dump fetched by download_gz_and_unpack.
GZ_URL = 'http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml.gz'
# URL of the DTD that DTDResolver downloads for the parser.
DTD_URL = 'http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.dtd'
def timing(f):
    """
    Decorator that reports how long the wrapped function took to run.

    After every call that returns normally, the elapsed time is printed in
    milliseconds together with the function's name. Nothing is printed when
    the call raises.

    :param f: callable to measure
    :return: wrapper that forwards all positional and keyword arguments to
        ``f`` and returns its result unchanged
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(f)  # keep __name__ / __doc__ of the decorated function
    def wrap(*args, **kwargs):
        # perf_counter is intended for measuring short intervals and is not
        # affected by system clock adjustments.
        started = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000.0
        print('{:s} Выполнение функции заняло: {:.3f} мс'.format(f.__name__, elapsed_ms))
        return ret
    return wrap
def download_gz_and_unpack(xml_tmp):
    """
    Download the gzip archive from ``GZ_URL`` and unpack it to ``xml_tmp``.

    The archive is fetched into a temporary file, decompressed into
    ``xml_tmp`` and the temporary file is then removed.

    :param xml_tmp: path where the decompressed file is written
    :return: None
    """
    import shutil  # local import: only this function needs it

    file_name, _headers = urllib.request.urlretrieve(GZ_URL)
    try:
        with gzip.open(file_name, 'rb') as gz_file, open(xml_tmp, 'wb') as out:
            # Copy in chunks instead of loading the whole decompressed
            # content into memory with a single read().
            shutil.copyfileobj(gz_file, out)
    finally:
        # Remove the temporary download even if unpacking failed.
        os.remove(file_name)
def iterate_over_xml(xml_path):
    """
    Generator over the record elements of an XML file.

    The file is parsed incrementally with DTD loading and validation
    enabled; the DTD is supplied through ``DTDResolver``. The first element
    started after the root opens a record, and the record is yielded when an
    element with the same tag ends. After the consumer is done with a
    record, it is cleared and already-processed siblings are detached from
    the parent so the partially built tree does not keep growing.

    :param xml_path: path to the XML file
    :return: generator yielding one element per record
    """
    records = lxml.etree.iterparse(xml_path, events=("start", "end"),
                                   dtd_validation=True, load_dtd=True)
    records.resolvers.add(DTDResolver())
    # Consume the first event (the root's "start") so the root itself is
    # never treated as a record.
    _, root = next(records)
    start_tag = None
    for event, element in records:
        if event == 'start' and start_tag is None:  # a new record begins
            start_tag = element.tag
        if event == 'end' and element.tag == start_tag:
            yield element
            start_tag = None
            element.clear()
            # clear() empties the element but leaves it attached to its
            # parent; drop the earlier siblings so finished records do not
            # accumulate for the rest of the parse.
            parent = element.getparent()
            if parent is not None:
                while element.getprevious() is not None:
                    del parent[0]
class DTDResolver(lxml.etree.Resolver):
    """
    Resolver that supplies the DTD to the XML parser.

    Whatever identifier is requested, the file at ``DTD_URL`` is downloaded
    and handed to the parser.
    """

    def resolve(self, system_url, public_id, context):
        """
        Download the DTD and return it as the resolved input.

        :param system_url: requested system URL (not used)
        :param public_id: requested public id (not used)
        :param context: resolver context passed through to lxml
        :return: result of ``resolve_filename`` for the downloaded file
        """
        dtd_path = urllib.request.urlretrieve(DTD_URL)[0]
        return self.resolve_filename(dtd_path, context)
def count_books_by_years(xml_path):
    """
    Count records per year.

    A record is counted when it has at least one ``booktitle`` child and a
    ``year`` child with non-empty text. If a record has several ``year``
    children, the text of the last one is used.

    :param xml_path: path to the XML file
    :return: defaultdict mapping year text -> number of counted records
    """
    per_year = defaultdict(int)
    for entry in iterate_over_xml(xml_path):
        children = [(child.tag, child.text) for child in entry]
        has_booktitle = any(tag == 'booktitle' for tag, _ in children)
        year_texts = [text for tag, text in children if tag == 'year']
        # The last <year> child wins, as in a sequential scan of the children.
        year_text = year_texts[-1] if year_texts else None
        if not (has_booktitle and year_text):
            continue
        per_year[year_text] += 1
    return per_year
def plot_png(years_dict):
    """
    Build a bar chart of counts per year and save it as a PNG.

    The table is printed to stdout and the chart is written to
    ``OUTPUT_FILE`` with a logarithmic y axis.

    :param years_dict: dictionary mapping year -> number of books
    :return: None
    """
    df = pd.DataFrame([('{} г.'.format(k), v) for k, v in years_dict.items()],
                      columns=['year', 'book_count'])
    # sort_values returns a new frame rather than sorting in place, so the
    # result has to be rebound for the bars to come out ordered by year label.
    df = df.sort_values(by=['year'])
    print(df)
    df.plot.bar(x='year', y='book_count', figsize=(9, 6), logy=True,
                yticks=[100, 1000, 10000, 100000])
    plt.savefig(OUTPUT_FILE)
@timing
def main():
    """
    Run the whole pipeline: download and unpack the dump, count the
    records per year and save the resulting chart.
    """
    download_gz_and_unpack(XML_TMP)
    books_per_year = count_books_by_years(XML_TMP)
    plot_png(books_per_year)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment