Skip to content

Instantly share code, notes, and snippets.

@nudomarinero
Created February 20, 2016 16:16
Show Gist options
  • Save nudomarinero/eaee524f11de0316f19c to your computer and use it in GitHub Desktop.
Save nudomarinero/eaee524f11de0316f19c to your computer and use it in GitHub Desktop.
Transform MacJournal individual text entries to Zim
from __future__ import print_function
import locale
import datetime
import re
import os
from os.path import isfile, join
import unicodedata
from unidecode import unidecode
# Configuration
# Switch the LC_TIME category (locale-dependent date/time text such as month
# names used by strftime/strptime) to Spanish. setlocale() raises locale.Error
# when the requested locale is not available on the system.
# `loc` keeps the value returned by setlocale(); it is not referenced again in
# the visible code.
loc = locale.setlocale(locale.LC_TIME, ("es_ES.utf8", "es_ES.utf8"))
# Spanish month names as they appear in MacJournal dates, mapped to month
# numbers. Using an explicit table keeps parsing independent of whichever
# locales happen to be installed on the machine.
_SPANISH_MONTHS = {
    "enero": 1,
    "febrero": 2,
    "marzo": 3,
    "abril": 4,
    "mayo": 5,
    "junio": 6,
    "julio": 7,
    "agosto": 8,
    "septiembre": 9,
    "setiembre": 9,
    "octubre": 10,
    "noviembre": 11,
    "diciembre": 12,
}

# Raw string: "\d" inside a normal string literal is an invalid escape.
_DATE_PATTERN = re.compile(
    r"(\d{1,2}) de (.*?) de (\d{2,4}), (\d{1,2}):(\d{1,2})"
)


def parse_date(input_line):
    """
    Parse the date in the MacJournal format.

    The expected text looks like "20 de febrero de 2016, 16:16" and may be
    surrounded by other characters; the first match in the line is used.

    :param input_line: text containing the date.
    :return: a naive ``datetime.datetime`` built from the matched fields.
    :raises ValueError: if no date is found, the month name is not a known
        Spanish month, or the numeric fields do not form a valid date
        (e.g. a year that is not four digits).
    """
    match = _DATE_PATTERN.search(input_line)
    if match is None:
        raise ValueError(
            "No MacJournal date found in: {!r}".format(input_line)
        )
    day, month_name, year, hour, minute = match.groups()
    try:
        month = _SPANISH_MONTHS[month_name.strip().lower()]
    except KeyError:
        raise ValueError("Unknown month name: {!r}".format(month_name))
    # Only numeric directives are used, so the result does not depend on the
    # active locale; strptime still validates ranges and the year width.
    numeric = "{} {} {} {}:{}".format(day, month, year, hour, minute)
    return datetime.datetime.strptime(numeric, "%d %m %Y %H:%M")
def parse_file(input_file):
    """
    Parse a MacJournal txt single entry file.

    The first line must be tab-separated with "Fecha:" as its second field
    and the date text as its third field. If the second line contains
    "Tema:" and has a third tab-separated field, that field is the title;
    otherwise the title is the file's base name without a trailing ".txt".
    All remaining lines are returned as the content.

    :param input_file: path of the MacJournal text file.
    :return: tuple ``(title_entry, content, date_entry)`` where ``content``
        is the list of remaining lines (line endings preserved) and
        ``date_entry`` is the value returned by ``parse_date``.
    :raises ValueError: if the first line does not have the expected format.
    """
    with open(input_file, "r") as f:
        header = f.readline().split("\t")
        # Explicit check instead of `assert`: asserts vanish under `-O`, and
        # a short line would otherwise fail with an unrelated IndexError.
        if len(header) < 3 or header[1] != "Fecha:":
            raise ValueError("Wrong format of line 1")
        date_entry = parse_date(header[2])

        line2 = f.readline()
        fields = line2.split("\t")
        if "Tema:" in line2 and len(fields) > 2:
            title_entry = fields[2].strip()
        else:
            # NOTE(review): as before, the second line is consumed and not
            # added to the content when it is not a "Tema:" header.
            title_entry = os.path.basename(input_file).strip()
            # Drop only the trailing extension, not every ".txt" in the name.
            if title_entry.endswith(".txt"):
                title_entry = title_entry[:-len(".txt")]
        content = f.readlines()
    return title_entry, content, date_entry
def format_zim(title, content, date, creation_date=False):
    """
    Build the lines of a Zim wiki page.

    :param title: page title, placed in the top-level heading.
    :param content: iterable of body lines, appended unchanged.
    :param date: datetime used for the "Creation-Date" header and, when
        requested, for the visible "Creado" line.
    :param creation_date: when true, add a "Creado <date>" line right after
        the heading.
    :return: list of strings forming the page (header, heading, body).
    """
    page = [
        "Content-Type: text/x-zim-wiki\n",
        "Wiki-Format: zim 0.4\n",
        "".join(["Creation-Date: ", date.isoformat(), "+00:00\n\n"]),
        "====== {} ======\n".format(title),
    ]
    if creation_date:
        page.append("".join(["Creado ", date.isoformat(" "), "\n"]))
    return page + list(content)
def transform_file(input_file, output_file, creation_date=True):
    """
    Convert one MacJournal entry file into a Zim page file.

    The input is parsed and formatted completely before the output file is
    opened for writing.

    :param input_file: path of the MacJournal text entry to read.
    :param output_file: path of the Zim file to write (overwritten).
    :param creation_date: forwarded to ``format_zim``.
    """
    title, body, stamp = parse_file(input_file)
    zim_lines = format_zim(title, body, stamp, creation_date=creation_date)
    with open(output_file, "w") as handle:
        handle.writelines(zim_lines)
if __name__ == "__main__":
    # Convert every "*txt" entry under text_1/<dir> into MacJournal/<dir>.
    input_dir = "text_1"
    output_dir = "MacJournal"
    dirs = ["Blog", "LOFAR", "Personal"]
    for d in dirs:
        path = join(input_dir, d)
        entries = [
            f for f in os.listdir(path)
            if isfile(join(path, f)) and f.endswith("txt")
        ]
        target_dir = join(output_dir, d)
        if not os.path.exists(target_dir):
            # makedirs also creates output_dir itself on the first run;
            # plain mkdir fails when the parent directory is missing.
            os.makedirs(target_dir)
        for entry in entries:
            # Output name: spaces become underscores, then unidecode() is
            # applied to the result.
            name = unidecode(entry.replace(" ", "_"))
            print(entry, name)
            input_file = join(path, entry)
            output_file = join(target_dir, name)
            transform_file(input_file, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment