panayi · October 27, 2012 08:02
diff --git a/plain_text_extractor.rb b/plain_text_extractor.rb
 require "rubygems"
 require "nokogiri"

 # SAX handler that collects the character data found between a pair of
 # marker comments, e.g. <!-- someComment --> ... <!-- /someComment -->.
 #
 # Only the +characters+ callback appends to the buffer, so the tags inside
 # the marked region contribute nothing themselves; just their text does.
 class PlainTextExtractor < Nokogiri::XML::SAX::Document

  # The text collected so far (a String; empty until a marked region is seen).
  attr_reader :plaintext

  # Set up an empty buffer and start outside any marked region.
  #
  # marker - name used by the delimiting comments. A comment whose stripped
  #          text starts with +marker+ opens the region; one that starts
  #          with "/" followed by +marker+ closes it. Defaults to the
  #          original hard-coded "someComment".
  def initialize(marker = "someComment")
    @interesting = false
    # Unary plus yields an unfrozen string, so the in-place << in
    # #characters keeps working even if string literals are frozen.
    @plaintext = +""

    escaped = Regexp.escape(marker)
    # \A anchors at the very start of the comment text. The previous ^
    # matched at the start of ANY line, so a multi-line comment containing
    # a line that began with the marker toggled extraction by accident.
    @opening = /\A#{escaped}/
    @closing = %r{\A/#{escaped}}
  end

  # SAX callback, invoked for every comment with the comment's text.
  # Switches extraction on at the opening marker and off at the closing
  # marker; any other comment leaves the current state untouched.
  def comment(string)
    case string.strip       # ignore the padding inside <!-- ... -->
    when @opening           # opening marker comment
      @interesting = true
    when @closing           # closing marker comment
      @interesting = false
    end
  end

  # SAX callback, invoked with each run of character data. The text is
  # appended to the buffer only while inside a marked region.
  def characters(string)
    @plaintext << string if @interesting
  end
 end

 # Command-line entry point: parse the HTML file named by the first
 # argument and print the text found inside the marked region.
 input_path = ARGV[0]
 # Stop with a usage hint (on stderr, non-zero exit) rather than handing a
 # nil path to the parser when no file argument was given.
 abort "Usage: ruby #{File.basename($PROGRAM_NAME)} FILE.html" if input_path.nil?

 pte = PlainTextExtractor.new
 parser = Nokogiri::HTML::SAX::Parser.new(pte)
 parser.parse_file input_path
 puts pte.plaintext
diff --git a/sample_page.html b/sample_page.html
 <html>
  <head>
    <title>Some Title</title>
  </head>
  <body>
     <h2>Here goes some heading we are not interested in.</h2>
     
      <!-- someComment -->
        Here it goes. We are interested in this text. <br>
        But <b>some</b> words are wrapped with HTML-Tags we are <i>not</i>
        interested in.
        <a href="bar">Or links,..</a>
        <table>
          <tr>
            <td>Or a Table,...</td>
          </tr>
        </table>
      <!-- /someComment -->
     
      But we do NOT care about this.
      
      <!-- foo -->
        Even if it is wrapped in another comment.
      <!-- /foo -->
  </body>
 </html>
	require "rubygems"
	require "nokogiri"

	# SAX handler that collects the character data found between a pair of
	# marker comments, e.g. <!-- someComment --> ... <!-- /someComment -->.
	#
	# Only the +characters+ callback appends to the buffer, so the tags inside
	# the marked region contribute nothing themselves; just their text does.
	class PlainTextExtractor < Nokogiri::XML::SAX::Document

	  # The text collected so far (a String; empty until a marked region is seen).
	  attr_reader :plaintext

	  # Set up an empty buffer and start outside any marked region.
	  #
	  # marker - name used by the delimiting comments. A comment whose stripped
	  #          text starts with +marker+ opens the region; one that starts
	  #          with "/" followed by +marker+ closes it. Defaults to the
	  #          original hard-coded "someComment".
	  def initialize(marker = "someComment")
	    @interesting = false
	    # Unary plus yields an unfrozen string, so the in-place << in
	    # #characters keeps working even if string literals are frozen.
	    @plaintext = +""

	    escaped = Regexp.escape(marker)
	    # \A anchors at the very start of the comment text. The previous ^
	    # matched at the start of ANY line, so a multi-line comment containing
	    # a line that began with the marker toggled extraction by accident.
	    @opening = /\A#{escaped}/
	    @closing = %r{\A/#{escaped}}
	  end

	  # SAX callback, invoked for every comment with the comment's text.
	  # Switches extraction on at the opening marker and off at the closing
	  # marker; any other comment leaves the current state untouched.
	  def comment(string)
	    case string.strip # ignore the padding inside <!-- ... -->
	    when @opening # opening marker comment
	      @interesting = true
	    when @closing # closing marker comment
	      @interesting = false
	    end
	  end

	  # SAX callback, invoked with each run of character data. The text is
	  # appended to the buffer only while inside a marked region.
	  def characters(string)
	    @plaintext << string if @interesting
	  end
	end

	# Command-line entry point: parse the HTML file named by the first
	# argument and print the text found inside the marked region.
	input_path = ARGV[0]
	# Stop with a usage hint (on stderr, non-zero exit) rather than handing a
	# nil path to the parser when no file argument was given.
	abort "Usage: ruby #{File.basename($PROGRAM_NAME)} FILE.html" if input_path.nil?

	pte = PlainTextExtractor.new
	parser = Nokogiri::HTML::SAX::Parser.new(pte)
	parser.parse_file input_path
	puts pte.plaintext
	<html>
	<head>
	<title>Some Title</title>
	</head>
	<body>
	<h2>Here goes some heading we are not interested in.</h2>

	<!-- someComment -->
	Here it goes. We are interested in this text. <br>
	But <b>some</b> words are wrapped with HTML-Tags we are <i>not</i>
	interested in.
	<a href="bar">Or links,..</a>
	<table>
	<tr>
	<td>Or a Table,...</td>
	</tr>
	</table>
	<!-- /someComment -->

	But we do NOT care about this.

	<!-- foo -->
	Even if it is wrapped in another comment.
	<!-- /foo -->
	</body>
	</html>