Last active
August 29, 2015 14:24
-
-
Save tomelm/6f309818553367c28502 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'date'
require 'fileutils'
require 'nokogiri'
require 'rest-client'
require 'reverse_markdown'
# Match the opening `[caption <stuff>]` shortcode (only when another `[`
# follows later on the line) or the closing `[/caption]` shortcode.
# example: http://rubular.com/r/r2FH3QSOpL
CAPTION_REGEX = /\[caption.*\](?=.*\[)|\[\/caption\]/

# Match the entirety of an img html tag (non-greedy, so one tag per match)
# example: http://rubular.com/r/xU3ZUF1vvY
IMG_TAG_REGEX = /(<img.*?>)/

# Match a double-quoted `src="..."` attribute, including the attribute name
IMG_SRC_REGEX = /src=".*?"/

# Capture (group 1) the value of a double-quoted `src` attribute of an img
# tag. The leading `.*` is greedy, so when several `src="` occur in the
# matched text the last one is captured.
IMG_SRC_GROUP_REGEX = /<img.*src="(.*?)".*\/?>/

# Regex's for gist and info extraction
# Match a `[gist ...]` shortcode
GIST_REGEX = /\[gist.*\]/
# Capture (group 1) the gist id; `id` must be the first attribute
GIST_ID_REGEX = /\[gist id="(.*?)".*\]/
# Capture (group 1) the gist file name; `file` must be the last attribute
GIST_FILE_REGEX = /\[gist .* file="(.*)"\]/

# Base url for all the data; prepended to image paths that are not absolute
# URLs. Frozen so the shared constant cannot be mutated in place.
BLOG_BASE_URL = 'http://engineeringblog.yelp.com'.freeze
# Read an XML file from disk and hand its contents to Nokogiri.
#
# path - location of the XML file to load
#
# Returns the document produced by Nokogiri::XML
def open_xml_file(path)
  raw_xml = File.read(path)
  Nokogiri::XML(raw_xml)
end
# Collects every `wp:author` entry of a WordPress XML export.
#
# xml - a parsed, Nokogiri XML object
#
# Examples
#
#   extract_authors(xml)
#   # => { 'darwin' => 'Darwin S., Software engineer' }
#
# Returns a hash mapping each author's login to their display name
def extract_authors(xml)
  xml.xpath('//wp:author').each_with_object({}) do |node, names_by_login|
    login = node.xpath('wp:author_login').first.text
    names_by_login[login] = node.xpath('wp:author_display_name').first.text
  end
end
# Wraps one `<item>` node of a WordPress export and renders it as a Jekyll
# post (file name, YAML front matter and converted body).
#
# NOTE: `images` is only filled in as a side effect of calling `content`.
class Post
  # Original `src` values of the images found while converting the content.
  attr_accessor :images

  # xml     - the node for a single `<item>`; must respond to `xpath`
  # authors - a hash of author login => display name
  def initialize(xml, authors = {})
    @xml = xml
    @images = []
    @author = author(authors)
  end

  # Returns the text of the item's `title` element.
  def title
    @title ||= @xml.xpath('title').text
  end

  # authors - a hash of author login => display name; only consulted the
  #           first time the author is resolved
  #
  # Returns the display name registered for the `dc:creator` login, or the
  # login itself when `authors` has no entry for it.
  def author(authors = {})
    @author ||= begin
      login = @xml.xpath('dc:creator').text
      authors[login] || login
    end
  end

  # Returns the item's `pubDate` parsed into a DateTime.
  def date
    @date ||= DateTime.parse(@xml.xpath('pubDate').text)
  end

  # Returns the text of the item's `wp:post_name` element.
  def post_name
    @post_name ||= @xml.xpath('wp:post_name').text
  end

  # Returns the file name of the post: `YYYY-MM-DD-<post_name>.markdown`.
  def file_name
    "#{date.strftime('%Y-%m-%d')}-#{post_name}.markdown"
  end

  # Returns the YAML front matter block followed by one blank line.
  #
  # The blank line used to be stripped together with the indentation of the
  # heredoc this method was built from; the lines are now assembled directly.
  # The title is escaped so quotes or backslashes in it keep the YAML valid.
  def front_matter
    [
      '---',
      'layout: post',
      %(title: "#{yaml_escape(title)}"),
      "author: #{author}",
      "date: #{date}",
      'published: true',
      '---',
      ''
    ].join("\n") + "\n"
  end

  # Converts the `content:encoded` body line by line through ReverseMarkdown.
  #
  # The result is memoized: converting again would re-register every image
  # in `images`.
  #
  # Returns the converted content as a single String.
  def content
    @content ||= @xml.xpath('content:encoded').text.split("\n").map do |line|
      ReverseMarkdown.convert(clean_line(line))
    end.join
  end

  private

  # Prepares one raw line for ReverseMarkdown: points image tags at the
  # local image directory, replaces caption/gist shortcodes, wraps plain
  # lines in <p> tags and normalizes curly apostrophes.
  #
  # Returns a new String; `line` is left untouched.
  def clean_line(line)
    image_path = nil
    # Handle each <img> tag on its own so several images on one line each
    # keep their own path and `src` attributes of other tags are not touched.
    line = line.gsub(IMG_TAG_REGEX) do |tag|
      image_url = tag[IMG_SRC_GROUP_REGEX, 1]
      # No double-quoted src to work with: leave the tag as it is.
      next tag if image_url.nil? || image_url.empty?

      @images << image_url
      image_path = "/images/posts/#{post_name}/#{File.basename(image_url)}"
      # Block form so backslashes in the path are not read as back-references.
      tag.sub(IMG_SRC_REGEX) { %(src="#{image_path}") }
    end

    cleaned_line = extract_caption(line, image_path) if line =~ CAPTION_REGEX
    cleaned_line = extract_gist(line) if line =~ GIST_REGEX # gist wins over caption
    cleaned_line ||= "<p>#{line}</p>"
    cleaned_line.tr("\u2019", "'") # fix unicode apostrophe issues
  end

  # Builds the image include tag for a `[caption]` line.
  #
  # line      - the raw line containing the caption shortcode
  # image_url - the local path of the image on that line (may be nil)
  def extract_caption(line, image_url)
    # TODO figure out how I want to handle images and captions later
    # ref: http://stackoverflow.com/questions/19331362/using-an-image-caption-in-markdown-jekyll
    caption = Nokogiri::HTML(line).text
                      .strip
                      .sub(CAPTION_REGEX, '')
                      .sub(CAPTION_REGEX, '') # remove closing caption
    "{% include post/image.html image=\"#{image_url}\" caption=\"#{caption}\" %}"
  end

  # Builds the gist tag from the id and file of a `[gist ...]` shortcode.
  def extract_gist(line)
    "{{ gist #{line[GIST_ID_REGEX, 1]} #{line[GIST_FILE_REGEX, 1]} }}"
  end

  # Escapes backslashes and double quotes for a double-quoted YAML scalar.
  def yaml_escape(text)
    text.gsub(/["\\]/) { |char| "\\#{char}" }
  end
end
# Script body: converts every <item> of ./blog.xml into a post under _posts/
# and downloads the images each post references into images/posts/<post_name>/.
xml = open_xml_file('./blog.xml')
authors = extract_authors(xml)
raw_posts = xml.xpath('//item')

# mkdir_p creates missing parents and does nothing for directories that
# already exist, so the script can be run again over earlier output.
FileUtils.mkdir_p('_posts')
FileUtils.mkdir_p('images/posts')

puts 'Converting posts'
posts = raw_posts.map { |item| Post.new(item, authors) }

puts 'Processing and writing posts, images'
posts.each do |post|
  puts post.file_name
  # `content` must run before `images` is read: it is what collects them.
  File.write("_posts/#{post.file_name}", post.front_matter + post.content)

  image_dir = "images/posts/#{post.post_name}"
  FileUtils.mkdir_p(image_dir)

  post.images.each do |image|
    next if image.nil?

    # Only a leading "http" marks an absolute URL; anything else is treated
    # as a path on the blog host.
    image = BLOG_BASE_URL + image unless image.start_with?('http')
    puts "|--> #{image}"
    begin
      # Download first so a failed request does not leave an empty file.
      data = RestClient.get(image).body
      File.binwrite(File.join(image_dir, File.basename(image)), data)
    rescue StandardError => e
      # Best effort: report the failure and carry on with the next image.
      puts "failed to download #{image} - #{e.message}"
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment