-
-
Save takaxp/1489536 to your computer and use it in GitHub Desktop.
Evernote から Export した HTML を Org-mode 形式に変換するスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
evnt2org
Convert HTML text that is exported from Evernote to org-mode.

Usage: evnt2org EXPORTED.html   (the org-mode text is written to stdout)
"""
import re
import sys
import urllib

try:
    # Python 2 keeps unquote directly on the urllib module.
    _unquote = urllib.unquote
except AttributeError:
    # Python 3 moved it to urllib.parse.
    from urllib.parse import unquote as _unquote

# ---------------------------------------------------------------------------
# One regular expression per kind of tag
# ---------------------------------------------------------------------------

# Tags that are dropped without a replacement.
# The <p> alternative requires whitespace, "/" or ">" right after the name so
# that it no longer swallows <pre>, which has its own rule further down.
ignores = re.compile(
    r"(</?body.*?>|</?html.*?>|<\?xml.*?>|</?meta.*?>|</?head.*?>"
    r"|</?input.*?>|</?tbody.*?>|</?dt.*?>|</?ul.*?>|<!DOCTYPE.*?>"
    r"|</?dl.*?>|</div>|</?font.*?>|</?p(?:[\s/].*?)?>|</?table.*?>"
    r"|</?span.*?>|</?ol.*?>|</?col.*?>|</?tt.*?>|</?dd.*?>)",
    re.DOTALL)
# Title
title_tag = re.compile(r"<title.*?>(?P<title>.*?)</title>", re.DOTALL)
# Headings h1..h5
h1_tag = re.compile(r"<h1>(?P<h1>.*?)</h1>", re.DOTALL)
h2_tag = re.compile(r"<h2>(?P<h2>.*?)</h2>", re.DOTALL)
h3_tag = re.compile(r"<h3>(?P<h3>.*?)</h3>", re.DOTALL)
h4_tag = re.compile(r"<h4>(?P<h4>.*?)</h4>", re.DOTALL)
h5_tag = re.compile(r"<h5>(?P<h5>.*?)</h5>", re.DOTALL)
# Links. "[^>]" keeps the attribute scan inside a single opening tag, so an
# anchor without href can no longer be merged with a later link.
a_tag = re.compile(
    r"<a\s[^>]*?href=(?P<q>\"|')(?P<href>.*?)(?P=q)[^>]*>(?P<title>.*?)</a>",
    re.DOTALL)
# Line breaks (<br>, <br/> and <br /> are all accepted)
newline = re.compile(r"(<div.*?>|<br\s*/?>|</?tr.*?>)", re.DOTALL)
# Emphasis
bold = re.compile(r"<strong>(?P<bold>.*?)</strong>", re.DOTALL)
bold2 = re.compile(r"<b>(?P<bold>.*?)</b>", re.DOTALL)
bold3 = re.compile(r"<em>(?P<em>.*?)</em>", re.DOTALL)
# Italic
italic = re.compile(r"<i>(?P<italic>.*?)</i>", re.DOTALL)
# Underline
under = re.compile(r"<u>(?P<under>.*?)</u>", re.DOTALL)
# Strike-through
strike = re.compile(r"<strike>(?P<strike>.*?)</strike>", re.DOTALL)
strike2 = re.compile(r"<s>(?P<strike>.*?)</s>", re.DOTALL)
# Code
code = re.compile(r"<code>(?P<code>.*?)</code>", re.DOTALL)
pre = re.compile(r"<pre>(?P<pre>.*?)</pre>", re.DOTALL)
# List items
li = re.compile(r"<li>(?P<li>.*?)</li>", re.DOTALL)
# Quotation
quote = re.compile(r"<blockquote>(?P<quote>.*?)</blockquote>", re.DOTALL)
# Centered text
center = re.compile(r"<center>(?P<center>.*?)</center>", re.DOTALL)
# Tables
rowh1 = re.compile(r"</th.*?><th.*?>", re.DOTALL)
rowh2 = re.compile(r"</?th.*?>", re.DOTALL)
rowd1 = re.compile(r"</td.*?><td.*?>", re.DOTALL)
rowd2 = re.compile(r"</?td.*?>", re.DOTALL)
# Images
img = re.compile(r"<img .*?src=\"(?P<img>.*?)\".*?/>", re.DOTALL)

# (pattern, replacement template) pairs, applied strictly in this order.
# The templates are raw strings: the re module itself expands "\g<name>" and
# "\n", so no invalid Python string escapes are involved.
_RULES = (
    (ignores, ""),
    (title_tag, r"* \g<title>\n"),
    (h1_tag, r"** \g<h1>\n"),
    (h2_tag, r"*** \g<h2>\n"),
    (h3_tag, r"**** \g<h3>\n"),
    (h4_tag, r"***** \g<h4>\n"),
    (h5_tag, r"****** \g<h5>\n"),
    # Org links are [[target][description]]; both closing brackets are needed.
    (a_tag, r"[[\g<href>][\g<title>]]"),
    (newline, "\n"),
    (bold, r"*\g<bold>*"),
    (bold2, r"*\g<bold>*"),
    (bold3, r"*\g<em>*"),
    (italic, r"/\g<italic>/"),
    (under, r"_\g<under>_"),
    (strike, r"+\g<strike>+"),
    (strike2, r"+\g<strike>+"),
    (code, r"#+BEGIN_SRC text\n\g<code>\n#+END_SRC\n"),
    (pre, r"#+BEGIN_SRC text\n\g<pre>\n#+END_SRC\n"),
    (li, r"- \g<li>\n"),
    (quote, r"#+BEGIN_QUOTE \n\g<quote>\n#+END_QUOTE\n"),
    (center, r"#+BEGIN_CENTER \n\g<center>\n#+END_CENTER\n"),
    (rowh1, "|"),
    (rowh2, "|"),
    (rowd1, "|"),
    (rowd2, "|"),
    (img, r"[img:\g<img>]"),
)


def convert(html):
    """Return the org-mode rendering of an Evernote HTML export.

    Every rule in ``_RULES`` is applied once, in order, to the whole text;
    afterwards percent-escapes (``%XX``) anywhere in the result are decoded.

    Args:
        html: The exported HTML document as a string.

    Returns:
        The converted org-mode text.
    """
    text = html
    for pattern, replacement in _RULES:
        text = pattern.sub(replacement, text)
    # NOTE: decoding is applied to the complete output, not only to link
    # targets, so a literal "%XX" sequence in the note body is decoded too.
    return _unquote(text)


def _read_text(path):
    """Read the whole file at *path*, closing it again afterwards."""
    if sys.version_info[0] >= 3:
        # Decode explicitly instead of relying on the locale's encoding.
        with open(path, encoding="utf-8") as handle:
            return handle.read()
    # Python 2: keep working on the raw bytes and print them back unchanged.
    with open(path) as handle:
        return handle.read()


def main():
    """Convert the file named by the first command-line argument.

    Returns:
        0 on success, 2 when no input file was given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s EXPORTED.html\n" % sys.argv[0])
        return 2
    print(convert(_read_text(sys.argv[1])))
    return 0


if __name__ == "__main__":
    sys.exit(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
#!c:\python27\python.exe | |
# coding: utf-8 | |
u"""======================================== | |
正規表現を利用してHTMLタグの抽出 | |
========================================""" | |
import re | |
# HTML skeleton held as sample text; ${title} and ${body} stay as literal
# placeholders. Lines are joined with "\n" and there is no trailing newline.
text = u"\n".join((
    u'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
    u'<html lang="ja">',
    u'<head>',
    u'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />',
    u'<title>${title}</title>',
    u'</head>',
    u'<body>',
    u'${body}',
    u'</body>',
    u'</html>',
))
def tag_all(str):
    """Return every tag in *str* as a list, in order of appearance.

    A tag is any ``<...>`` span, matched non-greedily. ``.`` does not match
    a newline in this pattern, so a tag split across lines is not reported.

    Args:
        str: The text to scan. The name shadows the builtin; it is kept so
            that existing keyword callers keep working.

    Returns:
        A list of the matched tags, angle brackets included.
    """
    # A plain raw string replaces the Python-2-only ``ur`` prefix, which is a
    # syntax error on Python 3.
    tag_pattern = re.compile(r"<(.*?)>")
    # group(0) is the whole match, i.e. the tag including "<" and ">".
    return [match.group(0) for match in tag_pattern.finditer(str)]
def tag_end(str):
    """Return every closing tag in *str* as a list, in order of appearance.

    A closing tag is any ``</...>`` span, matched non-greedily. ``.`` does
    not match a newline in this pattern, so a tag split across lines is not
    reported.

    Args:
        str: The text to scan. The name shadows the builtin; it is kept so
            that existing keyword callers keep working.

    Returns:
        A list of the matched closing tags, angle brackets included.
    """
    # Pattern for closing tags only. A plain raw string replaces the
    # Python-2-only ``ur`` prefix, which is a syntax error on Python 3.
    closing_pattern = re.compile(r"</(.*?)>")
    # group(0) is the whole match, i.e. the tag including "</" and ">".
    return [match.group(0) for match in closing_pattern.finditer(str)]
def tag_start(str):
    """Return the distinct non-closing tags of *str* in order of appearance.

    Every tag reported by ``tag_all`` that is not also reported by
    ``tag_end`` is kept, so declarations such as ``<!DOCTYPE ...>`` and
    self-closing tags are included. Each distinct tag appears once.

    Args:
        str: The text to scan. The name shadows the builtin; it is kept so
            that existing keyword callers keep working.

    Returns:
        A list of tags, angle brackets included, ordered by first occurrence.
    """
    closing_tags = set(tag_end(str))
    already_listed = set()
    start_tags = []
    # Walk the tags in document order instead of iterating over a set
    # difference, whose order is arbitrary; the elements are the same.
    for tag in tag_all(str):
        if tag in closing_tags or tag in already_listed:
            continue
        already_listed.add(tag)
        start_tags.append(tag)
    return start_tags
def main():
    """Print the sample text followed by a numbered list of its start tags."""
    start_tags = tag_start(text)
    # Each print(...) call takes one argument, so the same lines run on
    # Python 2 (parenthesised expression) and Python 3 (function call).
    print(u"[元のテキスト]")
    print(text)
    print("-" * 40)
    print(u"[開始タグ一覧]")
    for number, tag in enumerate(start_tags, 1):
        print(u"%s : %s" % (number, tag))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment