-
-
Save seba--/6294697 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Git diff driver named "word": converts Word documents to plain text
# before diffing them. Activate it for matching files with a .gitattributes
# entry such as `*.docx diff=word`.
[diff "word"]
	# Treat the underlying files as binary.
	binary = true
	# Command git runs to turn a file into text; the file path is appended
	# as the last argument. -t selects the converter's text-only output.
	# NOTE(review): presumably docx-to-txt.rb must be executable and on PATH.
	textconv = docx-to-txt.rb -t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env ruby
# Simplistic DOCX to plain text converter, loosely based on the
# Simplistic OpenDocument Text (.odt) to plain text converter.
# Author: Jason Rogers <https://github.com/jacaetevha>
#
# Changed 2013-08-21 by Sebastian Erdweg <https://github.com/seba-->
#   Put a newline after any closing paragraph to retain the
#   paragraph structure of the docx document.
#
# Assumes that you have the unzip and tidy commands available for your system
#
# Usage: docx-to-txt.rb [options] file
#   (default)        print word/document.xml re-indented by tidy
#   -t, --text-only  print only the text, one paragraph per line
#   -h, --help       print the option summary and exit
#
# Exits with status 1 when no filename is given or the file does not exist.
require 'optparse'
require 'shellwords'

options = { text_only: false }

optparse = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename __FILE__} [options] file"

  opts.on('-t', '--text-only', 'Output less information') do
    options[:text_only] = true
  end

  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end
# parse! removes the recognised options from ARGV, leaving the filename first.
optparse.parse!

filename = ARGV[0]

# Diagnostics go to stderr so they never get mixed into the converted text
# that callers (e.g. git's textconv) read from stdout.
if filename.nil?
  warn "No filename given!\n"
  warn "Usage: #{File.basename __FILE__} filename\n"
  exit 1
end

unless File.exist?(filename)
  warn "File does not exist!\n"
  warn "Usage: #{File.basename __FILE__} filename\n"
  exit 1
end

# Shell-escape the filename: wrapping it in hand-written single quotes breaks
# (and allows command injection) as soon as the path itself contains a quote.
command = "unzip -qq -p #{Shellwords.escape(filename)} word/document.xml"
# In full mode the raw XML is piped through tidy for readable indentation.
command += ' | tidy -utf8 -xml -w 255 -i -c -q -asxml' unless options[:text_only]
content = `#{command}`

if options[:text_only]
  # Each gsub! is a separate statement: gsub! returns nil when nothing
  # matched, so the calls must not be chained.
  content.gsub!(%r{</w:p>}, "\n")   # replace end-of-paragraph tag </w:p> by newline
  content.gsub!(/<[^>]+>/, '')      # remove all XML tags
  content.gsub!(/\n{2,}/, "\n\n")   # remove multiple blank lines
  content.gsub!(/\A\n+/, '')        # remove leading blank lines
end

puts content
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment