Convert HTML to plain text (uses the locally installed html-to-text npm package):
cat all.html | node node_modules/html-to-text/bin/cli.js > all.txt
HTML to txt:
cat all.html | node node_modules/html-to-text/bin/cli.js > all.txt
# Combine the downloaded articles into one HTML document (all.html).
# For each file under articles/, the element matched by the 'table:last'
# selector is extracted; files without a match are skipped.
require 'nokogiri'

# Markup inserted between two consecutive articles in the combined file.
separator = "\n\n<h1><br><hr/><br></h1>\n\n"

# Sort the paths so the article order in all.html does not depend on the
# order in which the filesystem happens to list the directory.
elements = Dir['articles/*'].sort.map do |path|
  Nokogiri::HTML(File.read(path)).at_css('table:last')
end.compact

File.write('all.html', elements.map(&:to_html).join(separator))
require 'digest/sha1'
require 'fileutils'
require 'open-uri'

require 'nokogiri'
# Build `hrefs`, the list of absolute article URLs. The list is cached in
# cache.marshal so the index pages are only crawled on the first run.
cachefile = 'cache.marshal'
if File.exist?(cachefile) # File.exists? was removed in Ruby 3.2
  # NOTE: Marshal.load can instantiate arbitrary objects. This is only
  # acceptable because the cache file is written by this script itself; never
  # point it at a file from an untrusted source.
  # Marshal output is binary, so it is read (and written below) in binary mode.
  hrefs = Marshal.load(File.binread(cachefile))
else
  # The main index page plus the numbered pages ind_1.html .. ind_23.html.
  urls = ['http://www.paulgraham.com/ind.html']
  23.times { |i| urls << "http://www.paulgraham.com/ind_#{i + 1}.html" }

  # Kernel#open no longer accepts URLs on Ruby 3.0+; open-uri provides
  # URI.open. The block form closes the response stream after reading.
  pages = urls.map { |url| URI.open(url, &:read) }

  # Collect the href of every link found inside a table cell.
  hrefs = pages.flat_map do |page|
    Nokogiri::HTML(page).css('table tr td a').map { |link| link['href'] }
  end
  # Anchors without an href attribute yield nil, which would break both the
  # sort and the string concatenation below, so drop them first.
  hrefs = hrefs.compact.sort.uniq
  hrefs = hrefs.map { |href| 'http://www.paulgraham.com/' + href }

  File.binwrite(cachefile, Marshal.dump(hrefs))
end
# Download every URL in `hrefs` into the articles/ directory, ten URLs at a
# time with one thread per URL. One progress character is printed per URL:
#   '|' the file already exists and the download is skipped
#   '.' the page was fetched and written
#   'x' fetching or writing failed

# Make sure the target directory exists; without it every write fails and the
# only visible symptom would be a row of 'x' characters.
FileUtils.mkdir_p('articles')

hrefs.each_slice(10) do |urls_slice|
  threads = urls_slice.map do |url|
    # The URL is handed to the thread as an argument; the block parameter has
    # its own name so it does not shadow the outer `url`.
    Thread.new(url) do |article_url|
      # File name: SHA1 hex digest of the URL, a dash, then the URL with every
      # non-alphanumeric character replaced by '-'.
      name = 'articles/' + Digest::SHA1.hexdigest(article_url) + '-' + article_url.gsub(/[^a-zA-Z0-9]/, '-')
      if File.exist?(name) # File.exists? was removed in Ruby 3.2
        print '|'
      else
        begin
          # Kernel#open no longer accepts URLs on Ruby 3.0+; use URI.open from
          # open-uri. The block form closes the response stream after reading.
          File.write(name, URI.open(article_url, &:read))
          print '.'
        rescue StandardError
          # Best effort: a failed article is reported and the run continues.
          print 'x'
        end
      end
    end
  end
  # Wait for the whole slice to finish before starting the next one.
  threads.each(&:join)
end