hyoshiok · April 16, 2012 09:39 · hyoshiok · Apr 16, 2012
diff --git a/count_utf8.rb b/count_utf8.rb
 # encoding: utf-8
 =begin :rdoc

 = count_utf8.rb

 This script guesses the encoding of each message in a csv file and prints the ratio of utf8 characters to the size of the message

 The format of the csv file is the following
 id, replied_to_id, thread_id, timestamp, group, recipients, private, user_id, user, email_address, body, url, attachment_ids

 = usage 

  ruby count_utf8.rb <csv-file> 

 = output 

 [+group+]  group is created
 [+joined+] member joined
 [+ASCII+]  message is written in ASCII
 [+UTF8+]   message is written in UTF8. It is likely to be written in Japanese

 Author :: hirotaka.yoshioka@mail.rakuten.com
 Date   :: 4/16/2012 created 

 = history
 4/16/2012  fork from code_utf8.rb
 =end

 require 'nkf'
 require 'csv'

 # Maps each encoding constant NKF exposes to the label used as a key in the
 # tally that is printed at the end of the run.
 CODES = {
   NKF::UNKNOWN => "UNKNOWN(ASCII)",
   NKF::JIS => "JIS",
   NKF::EUC => "EUC",
   NKF::SJIS => "SJIS",
   NKF::BINARY => "BINARY",
   NKF::ASCII => "ASCII",
   NKF::UTF8 => "UTF8"
 }.freeze

 # Counts the characters of +body+ whose first byte is below 33 or above 127,
 # i.e. control characters, whitespace and non-ASCII characters.
 #
 # The check is done on the first byte of each character. Testing
 # <tt>b[0].to_i</tt> does not work on Ruby >= 1.9: <tt>b[0]</tt> is a
 # one-character String there, so +to_i+ is a decimal parse that yields 0..9
 # and every single character would be counted.
 #
 # body:: String to inspect
 #
 # Returns the number of matching characters as an Integer.
 def count_non_printable_ascii(body)
   body.each_char.count do |ch|
     byte = ch.getbyte(0)
     byte < 33 || byte > 127
   end
 end

 # Returns the tally label for the encoding NKF guesses for +body+.
 #
 # A guess that has no entry in CODES is labelled with the guess's own
 # +to_s+ instead of raising KeyError, so one unexpected guess cannot abort
 # the scan of the remaining rows.
 def encoding_label(body)
   guessed = NKF.guess(body)
   CODES.fetch(guessed) { guessed.to_s }
 end

 # Reads the csv file at +path+ and adds one count per data row to +hash+.
 #
 # Column 10 (body) of each row decides the key:
 # * no body at all             -> "empty"
 # * body matches /\[Group/     -> "group"
 # * body matches /\[Tag.*joined\]/ -> "joined"
 # * anything else              -> label of the guessed encoding
 #
 # For every message whose label is not "ASCII" a line with the counted
 # characters, the body size and their ratio (percent) is printed.
 #
 # path:: path of the csv file
 # hash:: Hash with a default of 0; it is updated in place
 #
 # Returns +hash+. Errors from opening or parsing the file propagate to the
 # caller; the block form of CSV.open closes the file in either case.
 def tally_messages(path, hash)
   CSV.open(path, 'r') do |csv|
     csv.shift # the first line is header, so it is omitted
     csv.each do |row|
       body = row[10]
       case body
       when nil
         # Rows without a body are tallied separately; they have nothing to
         # guess an encoding from and nothing to count characters in.
         hash["empty"] += 1
       when /\[Group/
         hash["group"] += 1
       when /\[Tag.*joined\]/
         hash["joined"] += 1
       else
         code = encoding_label(body)
         hash[code] += 1
         next if code == "ASCII"

         u = count_non_printable_ascii(body)
         ratio = body.size.zero? ? 0.0 : u * 100.0 / body.size
         print " utf8.size, body.size, ratio ", u, ", ", body.size, ", ", ratio, "\n"
       end
     end
   end
   hash
 end

 # Prints the share of ASCII messages among the ASCII and UTF8 messages
 # (percent), followed by one "key<TAB>count" line per tally entry.
 #
 # The ratio is computed over the real number of ASCII + UTF8 messages and is
 # 0.0 when there are none, rather than padding the denominator with 1.
 def print_summary(hash)
   ascii = hash.fetch("ASCII", 0)
   counted = ascii + hash.fetch("UTF8", 0)
   ratio = counted.zero? ? 0.0 : ascii * 100.0 / counted
   print "English/All=", "\t", ratio, "\n"
   hash.each do |key, value|
     print key, "\t", value, "\n"
   end
 end

 hash = Hash.new(0)

 begin
   raise ArgumentError, "usage: ruby count_utf8.rb <csv-file>" if ARGV[0].nil?

   tally_messages(ARGV[0], hash)
 rescue StandardError => e
   # Report the failure instead of swallowing it; whatever was tallied before
   # the error is still printed by the ensure clause below.
   warn "count_utf8.rb: #{e.message}"
 ensure
   print_summary(hash)
 end
	# encoding: utf-8
	=begin :rdoc

	= count_utf8.rb

	This script guesses the encoding of each message in a csv file and prints the ratio of utf8 characters to the size of the message

	The format of the csv file is the following
	id, replied_to_id, thread_id, timestamp, group, recipients, private, user_id, user, email_address, body, url, attachment_ids

	= usage

	ruby count_utf8.rb <csv-file>

	= output

	[+group+] group is created
	[+joined+] member joined
	[+ASCII+] message is written in ASCII
	[+UTF8+] message is written in UTF8. It is likely to be written in Japanese

	Author :: hirotaka.yoshioka@mail.rakuten.com
	Date :: 4/16/2012 created

	= history
	4/16/2012 fork from code_utf8.rb
	=end

	require 'nkf'
	require 'csv'

	# Maps each encoding constant NKF exposes to the label used as a key in the
	# tally that is printed at the end of the run.
	CODES = {
	  NKF::UNKNOWN => "UNKNOWN(ASCII)",
	  NKF::JIS => "JIS",
	  NKF::EUC => "EUC",
	  NKF::SJIS => "SJIS",
	  NKF::BINARY => "BINARY",
	  NKF::ASCII => "ASCII",
	  NKF::UTF8 => "UTF8"
	}.freeze

	# Counts the characters of +body+ whose first byte is below 33 or above 127,
	# i.e. control characters, whitespace and non-ASCII characters.
	#
	# The check is done on the first byte of each character. Testing
	# <tt>b[0].to_i</tt> does not work on Ruby >= 1.9: <tt>b[0]</tt> is a
	# one-character String there, so +to_i+ is a decimal parse that yields 0..9
	# and every single character would be counted.
	#
	# body:: String to inspect
	#
	# Returns the number of matching characters as an Integer.
	def count_non_printable_ascii(body)
	  body.each_char.count do |ch|
	    byte = ch.getbyte(0)
	    byte < 33 || byte > 127
	  end
	end

	# Returns the tally label for the encoding NKF guesses for +body+.
	#
	# A guess that has no entry in CODES is labelled with the guess's own
	# +to_s+ instead of raising KeyError, so one unexpected guess cannot abort
	# the scan of the remaining rows.
	def encoding_label(body)
	  guessed = NKF.guess(body)
	  CODES.fetch(guessed) { guessed.to_s }
	end

	# Reads the csv file at +path+ and adds one count per data row to +hash+.
	#
	# Column 10 (body) of each row decides the key:
	# * no body at all             -> "empty"
	# * body matches /\[Group/     -> "group"
	# * body matches /\[Tag.*joined\]/ -> "joined"
	# * anything else              -> label of the guessed encoding
	#
	# For every message whose label is not "ASCII" a line with the counted
	# characters, the body size and their ratio (percent) is printed.
	#
	# path:: path of the csv file
	# hash:: Hash with a default of 0; it is updated in place
	#
	# Returns +hash+. Errors from opening or parsing the file propagate to the
	# caller; the block form of CSV.open closes the file in either case.
	def tally_messages(path, hash)
	  CSV.open(path, 'r') do |csv|
	    csv.shift # the first line is header, so it is omitted
	    csv.each do |row|
	      body = row[10]
	      case body
	      when nil
	        # Rows without a body are tallied separately; they have nothing to
	        # guess an encoding from and nothing to count characters in.
	        hash["empty"] += 1
	      when /\[Group/
	        hash["group"] += 1
	      when /\[Tag.*joined\]/
	        hash["joined"] += 1
	      else
	        code = encoding_label(body)
	        hash[code] += 1
	        next if code == "ASCII"

	        u = count_non_printable_ascii(body)
	        ratio = body.size.zero? ? 0.0 : u * 100.0 / body.size
	        print " utf8.size, body.size, ratio ", u, ", ", body.size, ", ", ratio, "\n"
	      end
	    end
	  end
	  hash
	end

	# Prints the share of ASCII messages among the ASCII and UTF8 messages
	# (percent), followed by one "key<TAB>count" line per tally entry.
	#
	# The ratio is computed over the real number of ASCII + UTF8 messages and is
	# 0.0 when there are none, rather than padding the denominator with 1.
	def print_summary(hash)
	  ascii = hash.fetch("ASCII", 0)
	  counted = ascii + hash.fetch("UTF8", 0)
	  ratio = counted.zero? ? 0.0 : ascii * 100.0 / counted
	  print "English/All=", "\t", ratio, "\n"
	  hash.each do |key, value|
	    print key, "\t", value, "\n"
	  end
	end

	hash = Hash.new(0)

	begin
	  raise ArgumentError, "usage: ruby count_utf8.rb <csv-file>" if ARGV[0].nil?

	  tally_messages(ARGV[0], hash)
	rescue StandardError => e
	  # Report the failure instead of swallowing it; whatever was tallied before
	  # the error is still printed by the ensure clause below.
	  warn "count_utf8.rb: #{e.message}"
	ensure
	  print_summary(hash)
	end