BjornFJohansson · December 19, 2023 06:44
diff --git a/wikidpad-to-obsidian.py b/wikidpad-to-obsidian.py
 from pathlib import Path
 import urllib
 import re, os
 from urllib.parse import urlparse
 from string import punctuation
 pages = sorted(Path('.').glob('*.md'))

 # Sometimes wp (WikidPad) creates pages with ~ (tilde) in the file name.
 # Rename these to "<text before the first tilde>.md", unless a page with
 # that name already exists; in that case the tilde file is left alone.
 tildepages = [p for p in pages if "~" in str(p)]
 for tildepage in tildepages:
     # Only the part before the first tilde is used for the new name.
     a = str(tildepage).partition("~")[0]
     np = Path(f"{a}.md")
     if not np.exists():
         tildepage.rename(np)

 # The renames above leave paths in `pages` that no longer exist.  List the
 # directory again so the following steps only see files that are there.
 pages = sorted(Path('.').glob('*.md'))

 # Some wp files have unicode and some punctuation percent-quoted in the
 # file name.  Unquote the name (cp1252); subpages, whose unquoted name
 # contains "/", are renamed with a pipe | character instead.
 # The directory is listed afresh because earlier renames may have left
 # entries in `pages` that no longer exist.
 for page in sorted(Path('.').glob('*.md')):
     uq = urllib.parse.unquote(str(page), encoding='cp1252')
     if str(page) != uq:
         nm = Path(uq.replace("/", "|"))
         # Do not overwrite a page that already has the unquoted name.
         if not nm.exists():
             page.rename(nm)

 # Make the new file names visible to the steps that follow.
 pages = sorted(Path('.').glob('*.md'))


 # Strip page name from first line, remove if only punctuation remains
 i = 0  # number of pages rewritten by this step
 for page in pages:
     # Remove a leading byte-order mark only if one is present, so pages
     # without a BOM do not lose their first character.
     lines = page.read_text(encoding='utf-8').lstrip("\ufeff").splitlines()
     if not lines:
         continue  # empty page: there is no first line to clean
     firstline, *rest = lines
     # Drop heading markers/spaces at both ends, then the page's own name.
     newfirstline = firstline.strip("# ").replace(page.stem, "")
     # Nothing but punctuation (or nothing at all) left: blank the line.
     if not set(newfirstline) - set(punctuation):
         newfirstline = ""
     if firstline != newfirstline:
         newpagetext = newfirstline + "\n" + "\n".join(rest)
         # Write with the same encoding the page was read with.
         page.write_text(newpagetext, encoding='utf-8')
         i += 1
 # 7175 pages



 # remove [alias:...] and replace with obsidian alias:
 # https://help.obsidian.md/Linking+notes+and+files/Aliases
 # The first [alias:...] tag of a page is cut out of the text and its
 # "; "-separated names are written as an "aliases:" entry in a front
 # matter block at the top of the page.
 i = 0  # number of pages rewritten by this step
 # The names stop at the first "]" on the same line, so later brackets on
 # that line are not swallowed into the alias list.
 regxal = re.compile(r"\[alias:([^\]\n]+)\]")
 # NOTE(review): the first page is skipped here, as in the original script
 # -- confirm this is intended.
 for page in pages[1:]:
     # Remove a leading byte-order mark only if one is present.
     pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
     matchobj = regxal.search(pagetext)
     if matchobj:
         aliases = matchobj.group(1).strip().split("; ")
         nb = f"---\naliases: {', '.join(aliases)}\n---\n\n"
         newpagetext = nb + pagetext[:matchobj.start()] + pagetext[matchobj.end():]
         # Write with the same encoding the page was read with.
         page.write_text(newpagetext, encoding='utf-8')
         i += 1


 # Replace absolute file links with obsidian md style links:
 # file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md
 # [file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md]
 # >>>====>
 # [yeast-colony-pcr.md](<file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md>)
 # This could be improved to handle images by prepending a ! for some links
 #
 # The URL runs up to the next whitespace, "]" or the end of the text.  Only
 # the optional surrounding brackets belong to the match, so whitespace that
 # follows a bare link stays in the page, and a link at the very end of the
 # text is converted too.
 regxfl = re.compile(r"\[?(file:/[^\s\]]+)\]?")
 i=0
 def repl_file_links(matchobj):
     """Build the markdown link for one ``regxfl`` match.

     matchobj: match object whose group 1 holds the ``file:/...`` URL.
     Returns the text ``[<last path component>](<URL>)`` with the URL
     wrapped in angle brackets.
     """
     url = matchobj.group(1)
     fn = os.path.basename(url)
     return f"[{fn}](<{url}>)"
 # Run the file-link replacement over every page.  A page is written back
 # only when at least one link was replaced.
 for page in pages:
     pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
     newpagetext, r = regxfl.subn(repl_file_links, pagetext)
     if r:
         # Write as UTF-8, matching the read above, instead of relying on
         # the platform's default encoding.
         page.write_text(newpagetext, encoding='utf-8')
         i+=1

 # Search and replace all *defined* CamelCase and [wikiwords]
 # A bracketed word [Name] becomes [[Name]] when a page called Name exists.
 # Words that are not in brackets are not touched by this step.
 stems = {p.stem for p in pages}
 # One bracketed word that is not already part of a [[...]] link, so running
 # the step a second time does not add further brackets.
 wikiword = re.compile(r"(?<!\[)\[([^\[\]\n]+)\](?!\])")

 def _link_defined(matchobj):
     """Return ``[[name]]`` for a defined page name, else the text unchanged."""
     name = matchobj.group(1)
     # Page names are compared literally, so characters such as "." or "|"
     # in a file name cannot act as regex syntax.
     return f"[[{name}]]" if name in stems else matchobj.group(0)

 from tqdm import tqdm
 for page in tqdm(pages):
     pagetext = page.read_text(encoding='utf-8')
     # A single pass per page instead of one substitution per known page.
     newpagetext = wikiword.sub(_link_defined, pagetext)
     if newpagetext != pagetext:
         page.write_text(newpagetext, encoding='utf-8')
	from pathlib import Path
	import urllib
	import re, os
	from urllib.parse import urlparse
	from string import punctuation
	pages = sorted(Path('.').glob('*.md'))

	# Sometimes wp (WikidPad) creates pages with ~ (tilde) in the file name.
	# Rename these to "<text before the first tilde>.md", unless a page with
	# that name already exists; in that case the tilde file is left alone.
	tildepages = [p for p in pages if "~" in str(p)]
	for tildepage in tildepages:
	    # Only the part before the first tilde is used for the new name.
	    a = str(tildepage).partition("~")[0]
	    np = Path(f"{a}.md")
	    if not np.exists():
	        tildepage.rename(np)

	# The renames above leave paths in `pages` that no longer exist.  List the
	# directory again so the following steps only see files that are there.
	pages = sorted(Path('.').glob('*.md'))

	# Some wp files have unicode and some punctuation percent-quoted in the
	# file name.  Unquote the name (cp1252); subpages, whose unquoted name
	# contains "/", are renamed with a pipe | character instead.
	# The directory is listed afresh because earlier renames may have left
	# entries in `pages` that no longer exist.
	for page in sorted(Path('.').glob('*.md')):
	    uq = urllib.parse.unquote(str(page), encoding='cp1252')
	    if str(page) != uq:
	        nm = Path(uq.replace("/", "|"))
	        # Do not overwrite a page that already has the unquoted name.
	        if not nm.exists():
	            page.rename(nm)

	# Make the new file names visible to the steps that follow.
	pages = sorted(Path('.').glob('*.md'))


	# Strip page name from first line, remove if only punctuation remains
	i = 0  # number of pages rewritten by this step
	for page in pages:
	    # Remove a leading byte-order mark only if one is present, so pages
	    # without a BOM do not lose their first character.
	    lines = page.read_text(encoding='utf-8').lstrip("\ufeff").splitlines()
	    if not lines:
	        continue  # empty page: there is no first line to clean
	    firstline, *rest = lines
	    # Drop heading markers/spaces at both ends, then the page's own name.
	    newfirstline = firstline.strip("# ").replace(page.stem, "")
	    # Nothing but punctuation (or nothing at all) left: blank the line.
	    if not set(newfirstline) - set(punctuation):
	        newfirstline = ""
	    if firstline != newfirstline:
	        newpagetext = newfirstline + "\n" + "\n".join(rest)
	        # Write with the same encoding the page was read with.
	        page.write_text(newpagetext, encoding='utf-8')
	        i += 1
	# 7175 pages



	# remove [alias:...] and replace with obsidian alias:
	# https://help.obsidian.md/Linking+notes+and+files/Aliases
	# The first [alias:...] tag of a page is cut out of the text and its
	# "; "-separated names are written as an "aliases:" entry in a front
	# matter block at the top of the page.
	i = 0  # number of pages rewritten by this step
	# The names stop at the first "]" on the same line, so later brackets on
	# that line are not swallowed into the alias list.
	regxal = re.compile(r"\[alias:([^\]\n]+)\]")
	# NOTE(review): the first page is skipped here, as in the original script
	# -- confirm this is intended.
	for page in pages[1:]:
	    # Remove a leading byte-order mark only if one is present.
	    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
	    matchobj = regxal.search(pagetext)
	    if matchobj:
	        aliases = matchobj.group(1).strip().split("; ")
	        nb = f"---\naliases: {', '.join(aliases)}\n---\n\n"
	        newpagetext = nb + pagetext[:matchobj.start()] + pagetext[matchobj.end():]
	        # Write with the same encoding the page was read with.
	        page.write_text(newpagetext, encoding='utf-8')
	        i += 1


	# Replace absolute file links with obsidian md style links:
	# file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md
	# [file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md]
	# >>>====>
	# [yeast-colony-pcr.md](<file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md>)
	# This could be improved to handle images by prepending a ! for some links
	#
	# The URL runs up to the next whitespace, "]" or the end of the text.  Only
	# the optional surrounding brackets belong to the match, so whitespace that
	# follows a bare link stays in the page, and a link at the very end of the
	# text is converted too.
	regxfl = re.compile(r"\[?(file:/[^\s\]]+)\]?")
	i=0
	def repl_file_links(matchobj):
	    """Build the markdown link for one ``regxfl`` match.

	    matchobj: match object whose group 1 holds the ``file:/...`` URL.
	    Returns the text ``[<last path component>](<URL>)`` with the URL
	    wrapped in angle brackets.
	    """
	    url = matchobj.group(1)
	    fn = os.path.basename(url)
	    return f"[{fn}](<{url}>)"
	# Run the file-link replacement over every page.  A page is written back
	# only when at least one link was replaced.
	for page in pages:
	    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
	    newpagetext, r = regxfl.subn(repl_file_links, pagetext)
	    if r:
	        # Write as UTF-8, matching the read above, instead of relying on
	        # the platform's default encoding.
	        page.write_text(newpagetext, encoding='utf-8')
	        i+=1

	# Search and replace all *defined* CamelCase and [wikiwords]
	# A bracketed word [Name] becomes [[Name]] when a page called Name exists.
	# Words that are not in brackets are not touched by this step.
	stems = {p.stem for p in pages}
	# One bracketed word that is not already part of a [[...]] link, so running
	# the step a second time does not add further brackets.
	wikiword = re.compile(r"(?<!\[)\[([^\[\]\n]+)\](?!\])")

	def _link_defined(matchobj):
	    """Return ``[[name]]`` for a defined page name, else the text unchanged."""
	    name = matchobj.group(1)
	    # Page names are compared literally, so characters such as "." or "|"
	    # in a file name cannot act as regex syntax.
	    return f"[[{name}]]" if name in stems else matchobj.group(0)

	from tqdm import tqdm
	for page in tqdm(pages):
	    pagetext = page.read_text(encoding='utf-8')
	    # A single pass per page instead of one substitution per known page.
	    newpagetext = wikiword.sub(_link_defined, pagetext)
	    if newpagetext != pagetext:
	        page.write_text(newpagetext, encoding='utf-8')