Skip to content

Instantly share code, notes, and snippets.

@drinkcat
Last active July 3, 2024 04:17
Show Gist options
  • Save drinkcat/292257bcbc9d36751d688c06304225b9 to your computer and use it in GitHub Desktop.
Save drinkcat/292257bcbc9d36751d688c06304225b9 to your computer and use it in GitHub Desktop.
Split a Gmail Takeout MBOX file per year
# Split an mbox file into one output file per year
import os
import sys
# Separator that introduces the next message: two CRLF pairs (the blank line
# ending the previous message) followed by the mbox "From " envelope marker.
MATCH = b'\r\n' * 2 + b'From '
# Bytes of the separator that still belong to the previous message
# (the two CRLF pairs), i.e. how far past the match the next message begins.
OFFSET = len(b'\r\n' * 2)
# Size of each read from the input file: 1 MiB.
BUFSIZE = 1 << 20
# Open output file objects, keyed by output file name.
outfiles = dict()
def parse_message(buffer, start, end):
    """Append one mbox message to the output file for its year.

    The message occupies ``buffer[start:end]`` and must begin with an mbox
    ``From `` envelope line terminated by CRLF. The last whitespace-separated
    token of that line is taken as the year and used as the file name inside
    the module-level ``output`` directory. Output files are opened on first
    use and cached in the module-level ``outfiles`` dict; they are not closed
    here.

    Args:
        buffer: bytes object holding the message.
        start: index of the first byte of the message in ``buffer``.
        end: index one past the last byte of the message in ``buffer``.

    Raises:
        Exception: if no CRLF-terminated envelope line is found inside the
            message, if the line does not start with ``From ``, or if its
            last token is not made of ASCII digits only.
    """
    print(f'Message {start}-{end}')
    # Restrict the search to this message so a CRLF belonging to whatever
    # follows it in the buffer is never mistaken for the end of the header.
    headend = buffer.find(b'\r\n', start, end)
    if headend < start:
        raise Exception('Can\'t find header.')
    # Undecodable bytes are replaced instead of aborting the whole split;
    # only the sender and year tokens are used, and the year is validated.
    header = buffer[start:headend].decode('utf-8', errors='replace')
    # TODO: Would be more proper to parse the date properly.
    # split() without an argument ignores trailing/repeated whitespace, so
    # the last token is never an empty string.
    headerdata = header.split()
    if not header.startswith("From ") or len(headerdata) < 2:
        raise Exception(f'Bad header /{header}/.')
    print(f"{headerdata[1]} -- {headerdata[-1]}")
    # Just use the year
    year = headerdata[-1]
    # The token comes from the mailbox content and becomes a file name:
    # accept plain digits only so it cannot contain path separators or
    # otherwise point outside the output directory.
    if any(ch not in '0123456789' for ch in year):
        raise Exception(f'Bad header /{header}/.')
    if year not in outfiles:
        outfiles[year] = open(os.path.join(output, year), "wb")
    outfiles[year].write(buffer[start:end])
# Command-line driver: split the mbox file given as the first argument into
# per-year files inside the directory given as the second argument.
if len(sys.argv) != 3:
    print("Usage: python split.py input.mbox output")
    # sys.exit() does not depend on the interactive-only exit() helper, and
    # a non-zero status tells the caller that nothing was done.
    sys.exit(1)

filename = sys.argv[1]
output = sys.argv[2]
# Create the output directory, including missing parents, if needed.
os.makedirs(output, exist_ok=True)

# Bytes read so far that belong to a message whose end has not been seen.
lastbuf = b''
try:
    # Parse input
    with open(filename, 'rb') as infile:
        while True:
            buf = infile.read(BUFSIZE)
            if not buf:
                break
            # The carried-over bytes were already searched and contain no
            # complete separator; a new one can overlap them by at most
            # len(MATCH) - 1 bytes, so resume the search just before the
            # newly read data instead of rescanning everything.
            search_from = max(0, len(lastbuf) - len(MATCH) + 1)
            buf = lastbuf + buf
            i = 0  # start of the message currently being delimited
            while True:
                # Find next
                nexti = buf.find(MATCH, search_from)
                if nexti < 0:
                    break
                # The two CRLF pairs still belong to the current message.
                nexti += OFFSET
                # Parse
                parse_message(buf, i, nexti)
                i = search_from = nexti
            # Keep the incomplete tail for the next round.
            lastbuf = buf[i:]
    # Last message
    if lastbuf:
        parse_message(lastbuf, 0, len(lastbuf))
finally:
    # Close all FD, even when parsing failed part-way through.
    for outfile in outfiles.values():
        outfile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment