Last active
December 19, 2023 06:44
-
-
Save BjornFJohansson/c340f1040c3f722513901a52a0fc1d5c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import urllib | |
import re, os | |
from urllib.parse import urlparse | |
from string import punctuation | |
def rename_tilde_pages(folder=Path('.')):
    """Rename ``name~suffix.md`` pages in *folder* to ``name.md``.

    Sometimes wp creates pages with ~ (tilde) in the file name.  Everything
    from the first tilde onwards is dropped and ``.md`` is appended.  A page
    is left untouched when the target name is already taken, so no existing
    page is overwritten.

    Args:
        folder: directory holding the ``*.md`` pages (defaults to the
            current working directory).

    Returns:
        The sorted list of ``*.md`` paths present in *folder* after the
        renames, so the caller never holds paths that no longer exist.
    """
    for tildepage in sorted(folder.glob('*.md')):
        # Look at the file name only: a tilde in a parent directory must
        # not trigger (or corrupt) a rename.
        if "~" not in tildepage.name:
            continue
        base, _ = tildepage.name.split("~", maxsplit=1)
        target = tildepage.with_name(f"{base}.md")
        if not target.exists():
            tildepage.rename(target)
    # Re-list the directory: the paths collected before renaming are stale.
    return sorted(folder.glob('*.md'))


# Sometimes wp creates pages with ~ (tilde). Rename these
pages = rename_tilde_pages()
# Some wp files have unicode and some punctuation quoted.
# subpages are renamed with a pipe | character
def unquote_page_names(folder=Path('.')):
    """Rename percent-quoted ``*.md`` pages in *folder* to their plain names.

    Each file name is percent-decoded using cp1252.  A ``/`` produced by the
    decoding (a subpage separator) is replaced with ``|`` so the result is
    still a single file name.  A page is left untouched when the decoded name
    is already taken, so no existing page is overwritten.

    Args:
        folder: directory holding the ``*.md`` pages (defaults to the
            current working directory).

    Returns:
        The sorted list of ``*.md`` paths present in *folder* after the
        renames.
    """
    # List the directory afresh instead of trusting an earlier listing whose
    # entries may have been renamed in the meantime.
    for page in sorted(folder.glob('*.md')):
        unquoted = urllib.parse.unquote(page.name, encoding='cp1252')
        if unquoted == page.name:
            continue
        target = page.with_name(unquoted.replace("/", "|"))
        if not target.exists():
            page.rename(target)
    return sorted(folder.glob('*.md'))


# Later steps must see the new names, not the ones that were renamed away.
pages = unquote_page_names()
# Strip page name from first line, remove if only punctuation remains
def strip_title_from_first_line(text, title):
    """Return *text* with *title* removed from its first line.

    A leading byte-order mark is dropped if present.  The first line has
    surrounding ``#`` and space characters stripped and every occurrence of
    *title* removed; if nothing but punctuation is left, the line is blanked.

    Args:
        text: full page text.
        title: the page name to remove (the file stem).

    Returns:
        The rewritten page text, or ``None`` when the first line needs no
        change (this includes empty text).
    """
    # Only strip a BOM that is really there; slicing off the first character
    # unconditionally would eat real content on BOM-less files.
    text = text.lstrip("\ufeff")
    # ``or [""]`` keeps the unpacking valid for an empty page.
    firstline, *rest = text.splitlines() or [""]
    newfirstline = firstline.strip("# ").replace(title, "")
    if not set(newfirstline) - set(punctuation):
        newfirstline = ""
    if firstline == newfirstline:
        return None
    return newfirstline + "\n" + "\n".join(rest)


i = 0  # number of pages rewritten
for page in pages:
    newpagetext = strip_title_from_first_line(
        page.read_text(encoding='utf-8'), page.stem
    )
    if newpagetext is not None:
        # Write with the same encoding used for reading.
        page.write_text(newpagetext, encoding='utf-8')
        i += 1
# 7175 pages
# remove [alias:...] and replace with obsidian alias:
# https://help.obsidian.md/Linking+notes+and+files/Aliases
i = 0  # number of pages rewritten
# Stop at the first closing bracket: a greedy ".+" would run on to the last
# "]" of the line and swallow any text and links that follow the alias tag.
regxal = re.compile(r"\[alias:([^\]\n]+)\]")


def move_aliases_to_front_matter(pagetext):
    """Replace the first ``[alias:a; b]`` tag with an ``aliases:`` header.

    The tag is cut out of the text and a ``---`` delimited block listing the
    aliases (comma separated) is prepended.

    Args:
        pagetext: full page text.

    Returns:
        The rewritten text, or ``None`` when the page has no alias tag.
    """
    matchobj = regxal.search(pagetext)
    if matchobj is None:
        return None
    # Accept ";" with or without surrounding blanks and skip empty entries.
    aliases = [a.strip() for a in matchobj.group(1).split(";") if a.strip()]
    nb = f"---\naliases: {', '.join(aliases)}\n---\n\n"
    return nb + pagetext[:matchobj.start()] + pagetext[matchobj.end():]


# NOTE(review): the first page is skipped here, as in the original script;
# the reason is not visible in this file -- confirm it is intentional.
for page in pages[1:]:
    # Drop a BOM only when present instead of blindly cutting one character.
    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
    newpagetext = move_aliases_to_front_matter(pagetext)
    if newpagetext is not None:
        page.write_text(newpagetext, encoding='utf-8')
        i += 1
# Replace absolute file links with obsidian md style links:
# file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md
# [file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md]
# >>>====>
# [yeast-colony-pcr.md](<file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md>)
# This could be improved to handle images by prepending a ! for some links
#
# The surrounding brackets are optional and are consumed when present.  The
# character after a bare link is NOT consumed, so the whitespace following it
# survives the substitution and a link at the very end of the text matches
# too.  The lookbehind skips targets already wrapped as "<file:...>", which
# keeps a second run from mangling converted links.
regxfl = re.compile(r"\[?(?<!<)(file:/[^\s\]]+)\]?")
i = 0  # number of pages rewritten by the loop that follows


def repl_file_links(matchobj):
    """Build the markdown link for one ``regxfl`` match.

    Args:
        matchobj: a match whose group 1 is the ``file:/...`` URL.

    Returns:
        ``[<basename>](<<url>>)`` -- the URL's last path component as link
        text and the URL, in angle brackets, as link target.
    """
    url = matchobj.group(1)
    fn = os.path.basename(url)
    return f"[{fn}](<{url}>)"
# Apply the file-link rewrite to every page; only pages that actually
# contain a link are written back.
for page in pages:
    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
    newpagetext, r = regxfl.subn(repl_file_links, pagetext)
    if r:
        # Write with the same encoding used for reading; the platform
        # default may not be able to encode the page's characters.
        page.write_text(newpagetext, encoding='utf-8')
        i += 1
# Search and replace all *defined* CamelCase and [wikiwords]
# NOTE: only the bracketed [wikiword] form is handled below.
from tqdm import tqdm

# Names of all existing pages.  A set lookup replaces one compiled regex per
# page, which also avoids interpreting characters such as "|", "(" or "+" in
# a page name as regex syntax.
stems = {p.stem for p in pages}
# A single-bracketed word; the lookarounds skip words that are already
# double-bracketed so that running the script again does not nest brackets.
regxww = re.compile(r"(?<!\[)\[([^\[\]\n]+)\](?!\])")


def link_wikiword(matchobj):
    """Turn ``[word]`` into ``[[word]]`` when a page named *word* exists.

    Args:
        matchobj: a ``regxww`` match whose group 1 is the bracketed word.

    Returns:
        The double-bracketed link, or the matched text unchanged when no
        page has that name.
    """
    word = matchobj.group(1)
    if word in stems:
        return f"[[{word}]]"
    return matchobj.group(0)


for page in tqdm(pages):
    pagetext = page.read_text(encoding='utf-8')
    newpagetext = regxww.sub(link_wikiword, pagetext)
    if newpagetext != pagetext:
        # Write with the same encoding used for reading.
        page.write_text(newpagetext, encoding='utf-8')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment