Skip to content

Instantly share code, notes, and snippets.

@kimadactyl
Last active December 17, 2021 21:24
Show Gist options
  • Save kimadactyl/a284f6aef0025aed3f0debe94cd612d9 to your computer and use it in GitHub Desktop.
Save kimadactyl/a284f6aef0025aed3f0debe94cd612d9 to your computer and use it in GitHub Desktop.
Migrate Blogspot to Hugo page bundles
# To grab site, do something like:
# wget --mirror --convert-links --adjust-extension --page-requisites --span-hosts --domains 1.bp.blogspot.com,2.bp.blogspot.com,3.bp.blogspot.com,4.bp.blogspot.com,myblog.blogspot.com, http://myblog.blogspot.com/ -P pass_01
require 'date'
require 'fileutils'
require 'kramdown'
require 'logger'
require 'nokogiri'
require 'rake'
require 'sanitize'
# Root of the mirrored blog (as produced by the wget command above) and the
# directory the Hugo page bundles are written to.
INPUT_DIR = './scrape/pass_01/myblog.blogspot.com/'
OUTPUT_DIR = './output/pass_01'
# Every image that cannot be located or copied is recorded in this log.
missing_files = Logger.new('missing_files.log')
# Delete and remake the output directory.
# rm_rf is a no-op when the directory does not exist yet (first run), and
# mkdir_p also creates any missing parent such as ./output.
FileUtils.rm_rf(OUTPUT_DIR)
FileUtils.mkdir_p(OUTPUT_DIR)
# Get all HTML files in any directory
# Note this will skip txt files e.g. robots.txt
# INPUT_DIR already ends in "/", so the pattern (and the paths it is expected
# to return) contains "//" directly after the input directory.
files = Dir.glob("#{INPUT_DIR}/**/*.html")
# Allow-list handed to Sanitize before the HTML is converted to markdown.
# It never changes between posts, so it is built once instead of per file.
SANITIZE_CONFIG = {
  elements: %w[
    b em i strong u a abbr blockquote br cite code dd dfn dl dt kbd li mark ol
    p pre q s samp small strike sub sup time ul var img table iframe
  ],
  attributes: {
    'a' => %w[href],
    'abbr' => %w[title],
    'blockquote' => %w[cite],
    'dfn' => %w[title],
    'q' => %w[cite],
    'time' => %w[datetime pubdate],
    'img' => %w[alt src],
    'iframe' => %w[alt src]
  },
  protocols: {
    'a' => { 'href' => ['ftp', 'http', 'https', 'mailto', :relative] },
    'blockquote' => { 'cite' => ['http', 'https', :relative] },
    'q' => { 'cite' => ['http', 'https', :relative] },
    'img' => { 'src' => ['http', 'https', :relative] },
    'iframe' => { 'src' => ['http', 'https', :relative] }
  }
}.freeze

# Matches the "<digit>.bp.blogspot.com/<path>" tail of an image URL. That tail
# is also where the mirrored copy of the image lives, relative to the parent
# of INPUT_DIR.
BLOGSPOT_CDN_PATH = %r{\d\.bp\.blogspot\.com/.+}.freeze

# Returns the `content` attribute of the <meta property="..."> tag named by
# +property+, or "FIXME" when the tag or its attribute is missing.
def meta_content(html, property)
  html.at_css("meta[property='#{property}']")&.[]('content') || 'FIXME'
end

# Returns [date, dir]: the raw publication date string and the page-bundle
# directory (OUTPUT_DIR/<year>/<slug>). When the date is missing or cannot be
# parsed the date is "FIXME" and the bundle goes under OUTPUT_DIR/no_year.
def publication_info(html, slug)
  date = html.at_css("abbr[itemprop='datePublished']")&.[]('title')
  year = Date.parse(date).year
  [date, "#{OUTPUT_DIR}/#{year}/#{slug}"]
rescue ArgumentError, TypeError
  # TypeError: no date element/attribute (nil); ArgumentError: unparsable date.
  ['FIXME', "#{OUTPUT_DIR}/no_year/#{slug}"]
end

# Copies the Blogspot-hosted file referenced by +attr+ on each of +nodes+ into
# +dir+ and rewrites the attribute to the bare file name, so the reference
# resolves inside the page bundle.
#
# nodes  - Nokogiri nodes to process (<img> or <a imageanchor="1">)
# attr   - attribute holding the URL ("src" or "href")
# dir    - page-bundle directory to copy into
# logger - Logger that receives unresolved references
# marker - progress character printed for each rewritten node
def bundle_images(nodes, attr, dir, logger, marker)
  nodes.each do |node|
    src = node[attr].to_s
    cdn_path = src[BLOGSPOT_CDN_PATH]
    unless cdn_path
      # Not served from the Blogspot CDN (or no URL at all): leave the node
      # untouched and carry on with the remaining nodes.
      print 'M'
      logger.error("Missing file: #{src}")
      next
    end

    filename = File.basename(cdn_path)
    begin
      FileUtils.cp("#{INPUT_DIR}../#{cdn_path}", "#{dir}/#{filename}")
    rescue StandardError
      # Best effort: record the failure but still rewrite the reference.
      logger.error("Couldn't resolve file: #{src}")
    end
    node[attr] = filename
    print marker
  end
end

# Convert every mirrored HTML page into a Hugo page bundle:
# OUTPUT_DIR/<year>/<page name>/index.md plus the images the post references.
# Progress output: T = <img> rewritten, I = image link rewritten,
# M = reference not on the Blogspot CDN, S = page skipped, P = page written.
files.each do |file|
  # Block form closes the file handle as soon as parsing is done.
  html = File.open(file) { |io| Nokogiri::HTML(io) }

  # Grab the main post body; a page without one has nothing to convert.
  content = html.at_css('div.post-body')
  unless content
    print 'S'
    missing_files.warn("No post body: #{file}")
    next
  end

  title = meta_content(html, 'og:title')
  image = meta_content(html, 'og:image')

  # The publication year decides which directory the bundle lands in.
  slug = File.basename(file, '.html')
  date, dir = publication_info(html, slug)
  FileUtils.mkdir_p(dir)

  # ".//" keeps the search inside the post body; a leading "//" would select
  # matching nodes from the whole document.
  bundle_images(content.xpath('.//img'), 'src', dir, missing_files, 'T')
  bundle_images(content.xpath('.//a[@imageanchor="1"]'), 'href', dir, missing_files, 'I')

  # Convert the content to markdown
  clean_html = Sanitize.fragment(content.to_s, SANITIZE_CONFIG)
  markdown = Kramdown::Document.new(clean_html, html_to_native: true).to_kramdown

  # The alias is the page's path relative to the mirrored site root.
  alias_path = file.sub(%r{\A#{Regexp.escape(INPUT_DIR)}/*}, '')

  # Create output file
  output = <<~HEREDOC
    ---
    title: "#{title.gsub('"', '\"')}"
    image: "#{File.basename(image)}"
    date: #{date}
    aliases: "/#{alias_path}"
    ---
    #{markdown}
  HEREDOC

  # Write into the bundle directory chosen above (which may be no_year).
  File.write("#{dir}/index.md", output)
  print 'P'
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment