kingcu · October 28, 2011 18:05
diff --git a/tyler.rb b/tyler.rb
 #run this file by typing 'ruby tyler.rb' on your command line.
 #make sure the tyler_garbage_data.txt file is in the same directory,
 #and that you are also in that directory on your command line.
 #
 #A project for you would be to figure out how to pass the ruby
 #script the garbage data file on the command line, so you can
 #have it named whatever.
 #
 #Finally, to capture the output of this script, you'll want to
 #execute it with something like
 #   'ruby tyler.rb > output.txt'
 #otherwise the stuff will just print out on the screen and won't make
 #it into a file.  That's another good exercise, figure out how to use
 #File.write() to write this data to a particular file.



 #bring in the library for parsing JSON data - JSON is just another
 #way to transfer data back and forth between computers, like XML.
 #
 #if you don't have it installed (there is an error) you'll need
 #to do something like 'gem install json' on your command line
 require 'json'

 #collector for every line of the file that turns out to be valid JSON
 json_data = []

 #open the dump and go through it line by line.  the "r:binary" mode
 #is kept from the original script, whose author notes that the file
 #contains non-text data and that ruby errors on it without this mode.
 #
 #http://www.ruby-doc.org/core-1.9.2/File.html#method-c-open
 #
 File.open("./tyler_garbage_data.txt", "r:binary") do |dump|
   #true only when the line we just looked at was a "Response-body"
   #marker, meaning the line we are about to look at is the payload
   armed = false

   dump.each_line do |raw|
     #remember what the previous line told us before we overwrite it
     was_armed = armed

     #=~ gives back nil when there is no match, so this ends up true
     #exactly when the current line is a marker line
     armed = !(raw =~ /Response-body:<<--EOF/i).nil?

     #a marker line is never payload itself, and a line that does not
     #come right after a marker is not payload either
     next if armed || !was_armed

     begin
       #JSON.parse raises when the line is not JSON, and in that case
       #the append below never happens
       json_data << JSON.parse(raw)
     rescue StandardError
       #not JSON - dropped on purpose, nothing to do
     end
   end
 end


 #two buckets for the records we care about, one for products and one
 #for categories
 categories = []
 products = []

 #walk every parsed record, stored as 'js', and drop it into the
 #matching bucket
 #
 #http://www.ruby-doc.org/core-1.9.2/Array.html#method-i-each
 json_data.each do |js|
   #JSON.parse does not always hand back a Hash - a line holding a JSON
   #array parses fine too (and, depending on the json version, so may a
   #bare number or null).  Looking up a string key on an Array or on nil
   #raises instead of returning nil, which would kill the whole script,
   #so anything that is not a Hash is skipped here.
   next unless js.is_a?(Hash)

   #only records that carry a 'name' attribute are of interest
   next unless js["name"]

   #sort on the 'type' attribute; every other type is ignored
   case js["type"]
   when "Product"
     products << js
   when "Category"
     categories << js
   end
 end

 #categories are dumped first, then products.  gluing the two arrays
 #together lets a single loop handle both without copying any code.
 (categories + products).each do |record|
   #each record is a Hash.  leave out the "images" entry (there are a
   #lot of them and they are ugly), then turn every remaining key/value
   #pair into one string with three tabs between key and value
   printable = record.reject { |key, _val| key == "images" }
   rows = printable.map { |key, val| "#{key}:\t\t\t#{val}" }

   #print the pairs one per line
   #
   #http://www.ruby-doc.org/core-1.9.2/Array.html#method-i-join
   puts rows.join("\n")

   #blank line after each record so they are easy to tell apart; a
   #dashed line would work here as well
   puts "\n"
 end
	#run this file by typing 'ruby tyler.rb' on your command line.
	#make sure the tyler_garbage_data.txt file is in the same directory,
	#and that you are also in that directory on your command line.
	#
	#A project for you would be to figure out how to pass the ruby
	#script the garbage data file on the command line, so you can
	#have it named whatever.
	#
	#Finally, to capture the output of this script, you'll want to
	#execute it with something like
	# 'ruby tyler.rb > output.txt'
	#otherwise the stuff will just print out on the screen and won't make
	#it into a file. That's another good exercise, figure out how to use
	#File.write() to write this data to a particular file.



	#bring in the library for parsing JSON data - JSON is just another
	#way to transfer data back and forth between computers, like XML.
	#
	#if you don't have it installed (there is an error) you'll need
	#to do something like 'gem install json' on your command line
	require 'json'

	#make an array to hold the objects we get from the file
	json_data = []

	#open the file and go through it one line at a time.
	#notice the second argument of "r:binary" - the original author notes
	#that the file contains data other than just text, and that ruby
	#errors on the non-text parts unless it is opened this way.
	#
	#http://www.ruby-doc.org/core-1.9.2/File.html#method-c-open
	#
	File.open("./tyler_garbage_data.txt", "r:binary") do |file|
	  #state flag - each data line comes right after a distinct marker
	  #line, so this is true when "the line we are about to read is data"
	  next_is_data = false

	  #loop through each line (text followed by a newline character)
	  file.each do |line|
	    #a marker line only arms the flag; 'next' skips the rest of the
	    #loop body so the flag is not reset further down
	    if line =~ /Response-body:<<--EOF/i
	      next_is_data = true
	      next
	    end

	    #the previous line was a marker, so try to parse this one
	    if next_is_data
	      begin
	        #JSON.parse raises when the line is not JSON
	        js = JSON.parse(line)
	        #parsing did not raise, so keep the result
	        json_data << js
	      rescue StandardError
	        #not JSON, garbage data - deliberately ignored
	      end
	    end

	    #the line has been handled either way, so disarm the flag
	    next_is_data = false
	  end
	end


	#here are two data stores (arrays) for the two kinds of items,
	#one for products and one for categories
	categories = []
	products = []

	#loop through the data array and run some code for each item,
	#which is stored as 'js'
	#
	#http://www.ruby-doc.org/core-1.9.2/Array.html#method-i-each
	json_data.each do |js|
	  #JSON.parse does not always hand back a Hash - a line holding a JSON
	  #array parses fine too (and, depending on the json version, so may a
	  #bare number or null).  Looking up a string key on an Array or on nil
	  #raises instead of returning nil, so anything that is not a Hash is
	  #skipped here.
	  next unless js.is_a?(Hash)

	  #only items with a 'name' attribute are of interest
	  if js["name"]
	    #file the item by its 'type'; any other type is ignored
	    if js["type"] == "Product"
	      products << js
	    elsif js["type"] == "Category"
	      categories << js
	    end
	  end
	end

	#this is tricky looking, but pretty simple. Just make an array
	#of the two arrays of data, and loop through it. This way the same
	#code handles both arrays of items without being copied.
	[categories, products].each do |arr|
	  arr.each do |js|
	    #strings to print for this item, one per key/value pair
	    str = []

	    #each item is a Hash.  each_pair() goes through every key/value
	    #combo and gives you the key and the value
	    #
	    #http://www.ruby-doc.org/core-1.9.2/Hash.html#method-i-each_pair
	    js.each_pair do |key, val|
	      #leave images out of the output, there are a lot and it's ugly
	      next if key == "images"

	      #store the key/value string, tab separated (\t is a tab)
	      #
	      #http://www.ruby-doc.org/core-1.9.2/Array.html#method-i-3C-3C
	      str << "#{key}:\t\t\t#{val}"
	    end

	    #print all the saved strings, separated by newlines
	    #
	    #http://www.ruby-doc.org/core-1.9.2/Array.html#method-i-join
	    puts str.join("\n")

	    #final newline between item dumps so they can be told apart; a
	    #dashed line would work here as well
	    puts "\n"
	  end
	end