Skip to content

Instantly share code, notes, and snippets.

@sosmii
Forked from urschrei/parseml.py
Last active August 6, 2020 12:28
Show Gist options
  • Save sosmii/38fae628d8cafb7764dd806aef787429 to your computer and use it in GitHub Desktop.
Save sosmii/38fae628d8cafb7764dd806aef787429 to your computer and use it in GitHub Desktop.
オリジナル版だと2byte文字のデコードができないっぽいことがあったので一回decode_headerをかました
#!/usr/bin/env python
"""
2020 update:
- More iterators, fewer lists
- Python 3 compatible
- Processes files in parallel
(one worker process per CPU core by default, via multiprocessing.Pool)
"""
import email
import glob
import os
from email.header import decode_header
from multiprocessing import Pool
# Extension (without the leading dot) of the message files to glob for
EXTENSION = "eml"
def _decode_attachment_name(raw_name):
    """Return *raw_name* with every RFC 2047 encoded word decoded to text."""
    pieces = []
    for chunk, charset in decode_header(raw_name):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or "utf-8", errors="replace")
            except LookupError:
                # the header named a charset Python has no codec for
                chunk = chunk.decode("utf-8", errors="replace")
        pieces.append(chunk)
    # join without separators so "name" + ".txt" stays one file name
    return "".join(pieces)


def extract(filename):
    """
    Extract the attachments of one message file into the "output" directory.

    Every non-multipart part of the message that carries a file name is
    written to ``output/<decoded name>``; parts without a file name (such as
    the message body) are skipped.

    Args:
        filename: path of the message file to read.

    Returns:
        A ``(1, output_count)`` tuple: the constant 1 (one file processed) and
        the number of attachments written, so callers can sum results
        column-wise.

    Read and write errors are reported on stdout instead of being raised.
    """
    od = "output"
    # exist_ok avoids a FileExistsError when parallel workers race to create it
    os.makedirs(od, exist_ok=True)
    output_count = 0
    try:
        # parse from bytes so the locale's text encoding cannot break reading
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f)
        # walk() also copes with non-multipart messages and nested containers
        for part in msg.walk():
            if part.is_multipart():
                continue
            raw_attachment_name = part.get_filename()
            # parts without a file name are not attachments
            if not raw_attachment_name:
                continue
            decoded_attachment_name = _decode_attachment_name(raw_attachment_name)
            # the name comes from the mail itself: drop any directory part so
            # it cannot escape the output directory
            safe_name = os.path.basename(decoded_attachment_name.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue
            with open(os.path.join(od, safe_name), "wb") as of:
                of.write(part.get_payload(decode=True))
            output_count += 1
        if output_count == 0:
            print("No attachment found for file %s!" % filename)
    # this should catch read and write errors
    except IOError:
        # use filename, not f.name: f is unbound when open() itself failed
        print("Problem with %s or one of its attachments!" % filename)
    return 1, output_count
if __name__ == "__main__":
    # Run extract() over every matching file in the current directory.
    # Pool() starts one worker process per CPU core by default; map() blocks
    # until all results are in, so leaving the with-block afterwards is safe.
    with Pool() as pool:
        res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
    # Each result is (1, attachment_count); summing column-wise gives
    # [number of files, number of attachments]. With no matching files the
    # zip is empty, so fall back to zeros instead of failing in format().
    numfiles = [sum(column) for column in zip(*res)] or [0, 0]
    print("Done: Processed {} files with {} attachments.".format(*numfiles))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment