kimadactyl · August 10, 2021 17:09
diff --git a/1-trim-lines.rb b/1-trim-lines.rb
 # First grab the remote site
 # wget --mirror --convert-links --adjust-extension --page-requisites --no-parent http://yourdomain.co.uk
 # If you have images hosted on another site then do
 # wget --mirror --convert-links --adjust-extension --page-requisites --span-hosts --domains mydomain.com,imagehostingcdn.com https://mydomain.com
 # Then have a look how many lines to trim from beginning and end

 # Number of lines discarded from the top and the bottom of every page.
 # Change numbers to how many lines you want to trim.
 TRIM_HEAD = 450
 TRIM_TAIL = 72

 # Returns +lines+ without its first +head+ and its last +tail+ elements.
 # The result is empty when there are not enough lines to keep any.
 def trim_lines(lines, head, tail)
   body = lines.drop(head)
   body.take([body.size - tail, 0].max)
 end

 # Rewrites, in place, every *.html file found recursively under +root+,
 # dropping +head+ lines from the start and +tail+ lines from the end.
 def trim_html_files(root, head: TRIM_HEAD, tail: TRIM_TAIL)
   Dir.glob('**/*.html', base: root).each do |relative|
     path = File.join(root, relative)
     File.write(path, trim_lines(File.readlines(path), head, tail).join)
   end
 end

 # Usage: ruby 1-trim-lines.rb [ROOT]
 # ROOT defaults to the current directory. The previous pattern '/**/*.html'
 # was anchored at the filesystem root, so this destructive rewrite touched
 # every HTML file on the machine; pass '/' explicitly if that is intended.
 trim_html_files(ARGV.fetch(0, '.')) if __FILE__ == $PROGRAM_NAME
diff --git a/2-add-frontmatter.rb b/2-add-frontmatter.rb
 # Add frontmatter to all your HTML files

 # Front matter block written at the top of every page. The trailing blank
 # line separates it from the HTML that follows.
 FRONTMATTER = "---\ntitle: \n---\n\n".freeze

 # Returns +html+ with the front matter block prepended. Text that already
 # begins with a front matter delimiter is returned unchanged, so running
 # the script twice does not stack a second block on top of the first.
 def add_frontmatter(html)
   return html if html.start_with?("---\n")

   FRONTMATTER + html
 end

 # Prepends the front matter, in place, to every *.html file found
 # recursively under +root+.
 def add_frontmatter_to_files(root)
   Dir.glob('**/*.html', base: root).each do |relative|
     path = File.join(root, relative)
     File.write(path, add_frontmatter(File.read(path)))
   end
 end

 # Usage: ruby 2-add-frontmatter.rb [ROOT]
 # ROOT defaults to the current directory. The previous pattern '/**/*.html'
 # was anchored at the filesystem root and rewrote every HTML file on the
 # machine; pass '/' explicitly if that is intended.
 add_frontmatter_to_files(ARGV.fetch(0, '.')) if __FILE__ == $PROGRAM_NAME
diff --git a/3-get-files.rb b/3-get-files.rb
 # Extract links on external CDNs

 # Matches one CDN asset URL that is immediately followed by a closing
 # double quote. The path part may not contain a quote or a newline, so the
 # match stays inside a single attribute value; the previous greedy `.*`
 # ran from the first URL on a line to the last matching extension on it.
 CDN_LINK = %r{https?://cdn[0-9]\.yourdomain\.co\.uk/[^"\n]*\.(?:jpg|png|gif|jpeg|pdf)(?=")}

 # Returns every CDN asset URL found in +html+, in document order.
 def cdn_links(html)
   html.scan(CDN_LINK)
 end

 # Prints, one per line, the CDN asset URLs referenced by every *.html file
 # found recursively under +root+.
 def print_cdn_links(root)
   Dir.glob('**/*.html', base: root).each do |relative|
     cdn_links(File.read(File.join(root, relative))).each { |link| puts link }
   end
 end

 # Usage: ruby 3-get-files.rb [ROOT] > file-manifest
 # ROOT defaults to the current directory. The previous pattern '/**/*.html'
 # was anchored at the filesystem root and scanned every HTML file on the
 # machine; pass '/' explicitly if that is intended.
 print_cdn_links(ARGV.fetch(0, '.')) if __FILE__ == $PROGRAM_NAME

 # This isn't perfect but you can then save this to a file and run wget.
 # wget -i file-manifest -m
diff --git a/4-delete-params.sh b/4-delete-params.sh
 # If you're importing from Dropbox it might help to run the following to clean up file extensions

 # Strip each Dropbox query-string suffix from the names of everything
 # under the current directory, one suffix at a time.
 for suffix in '?raw=1' '?dl=0'; do
   find . -exec rename -s "$suffix" '' {} +
 done
	# First grab the remote site
	# wget --mirror --convert-links --adjust-extension --page-requisites --no-parent http://yourdomain.co.uk
	# If you have images hosted on another site then do
	# wget --mirror --convert-links --adjust-extension --page-requisites --span-hosts --domains mydomain.com,imagehostingcdn.com https://mydomain.com
	# Then have a look how many lines to trim from beginning and end

	# Number of lines discarded from the top and the bottom of every page.
	# Change numbers to how many lines you want to trim.
	TRIM_HEAD = 450
	TRIM_TAIL = 72

	# Returns +lines+ without its first +head+ and its last +tail+ elements.
	# The result is empty when there are not enough lines to keep any.
	def trim_lines(lines, head, tail)
	  body = lines.drop(head)
	  body.take([body.size - tail, 0].max)
	end

	# Rewrites, in place, every *.html file found recursively under +root+,
	# dropping +head+ lines from the start and +tail+ lines from the end.
	def trim_html_files(root, head: TRIM_HEAD, tail: TRIM_TAIL)
	  Dir.glob('**/*.html', base: root).each do |relative|
	    path = File.join(root, relative)
	    File.write(path, trim_lines(File.readlines(path), head, tail).join)
	  end
	end

	# ROOT is the optional first command-line argument and defaults to the
	# current directory. A glob anchored at the filesystem root would apply
	# this destructive rewrite to every HTML file on the machine; pass '/'
	# explicitly if that is intended.
	trim_html_files(ARGV.fetch(0, '.')) if __FILE__ == $PROGRAM_NAME
	# Add frontmatter to all your HTML files

	# Front matter block written at the top of every page. The trailing blank
	# line separates it from the HTML that follows.
	FRONTMATTER = "---\ntitle: \n---\n\n".freeze

	# Returns +html+ with the front matter block prepended. Text that already
	# begins with a front matter delimiter is returned unchanged, so running
	# the script twice does not stack a second block on top of the first.
	def add_frontmatter(html)
	  return html if html.start_with?("---\n")

	  FRONTMATTER + html
	end

	# Prepends the front matter, in place, to every *.html file found
	# recursively under +root+.
	def add_frontmatter_to_files(root)
	  Dir.glob('**/*.html', base: root).each do |relative|
	    path = File.join(root, relative)
	    File.write(path, add_frontmatter(File.read(path)))
	  end
	end

	# ROOT is the optional first command-line argument and defaults to the
	# current directory. A glob anchored at the filesystem root would rewrite
	# every HTML file on the machine; pass '/' explicitly if that is intended.
	add_frontmatter_to_files(ARGV.fetch(0, '.')) if __FILE__ == $PROGRAM_NAME
	# Extract links on external CDNs

	# Matches one CDN asset URL that is immediately followed by a closing
	# double quote. The path part may not contain a quote or a newline, so
	# the match stays inside a single attribute value instead of running from
	# the first URL on a line to the last matching extension on it.
	CDN_LINK = %r{https?://cdn[0-9]\.yourdomain\.co\.uk/[^"\n]*\.(?:jpg|png|gif|jpeg|pdf)(?=")}

	# Returns every CDN asset URL found in +html+, in document order.
	def cdn_links(html)
	  html.scan(CDN_LINK)
	end

	# Prints, one per line, the CDN asset URLs referenced by every *.html
	# file found recursively under +root+.
	def print_cdn_links(root)
	  Dir.glob('**/*.html', base: root).each do |relative|
	    cdn_links(File.read(File.join(root, relative))).each { |link| puts link }
	  end
	end

	# ROOT is the optional first command-line argument and defaults to the
	# current directory. A glob anchored at the filesystem root would scan
	# every HTML file on the machine; pass '/' explicitly if that is intended.
	print_cdn_links(ARGV.fetch(0, '.')) if __FILE__ == $PROGRAM_NAME

	# This isn't perfect but you can then save this to a file and run wget.
	# wget -i file-manifest -m