Last active
October 30, 2017 12:51
-
-
Save drobune/694b77b4964f1f2e9ea8087ae806f043 to your computer and use it in GitHub Desktop.
twitterのarchive dataをscrapbox形式に変換する。ハッシュタグも適当に振る
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'uri'
require 'time'
require 'natto'
require 'json'

# Converts a Twitter archive (tweets.csv in the current directory) into
# Scrapbox import JSON, printed to stdout.
#
# Consecutive tweets that share a calendar day become one page titled
# "tweets YYYY-MM-DD". For every tweet the page receives, in order: an empty
# separator line, the tweet text with line breaks and URLs removed, the
# expanded URLs wrapped in brackets (when the row has any), and a line of
# hashtags built from the nouns MeCab finds in the text.

# MeCab feature strings containing this marker are treated as nouns.
NOUN_FEATURE = /名詞/

# Hand-maintained list of frequent mis-tokenized or meaningless words that
# must not become hashtags. Most alternatives are unanchored on purpose, so
# any surface *containing* them is rejected as well.
NOISE_WORD = /http|:\/\/|^in$|^rt$|^on$|\$|and|^amp$|^of$|\_|-|\(|\)|\.|~|at|by|to|co|gt|now|もの|こと|こちら|すか|とこ|もん|ぶり|なに|ここ|ちゃん|たち|やつ|いま|とき|ほう|ため|それ|さん|そう|こと|よう|これ|みたい|うち/i

# Returns the tweet text with line breaks deleted and every URL that
# URI.extract finds in it cut out (the in-body URLs are shortened ones; the
# expanded forms are emitted separately). A nil text becomes "".
def strip_urls(text)
  body = text.to_s.gsub(/(\r\n?|\n)/, '')
  URI.extract(body).each { |url| body.slice!(url) }
  body
end

# Returns the surfaces of all noun tokens in +text+ that are longer than one
# character and do not match NOISE_WORD, in the order MeCab yields them.
def noun_tags(mecab, text)
  tags = []
  mecab.parse(text) do |node|
    next unless NOUN_FEATURE.match?(node.feature)

    word = node.surface
    next if word.length <= 1 || NOISE_WORD.match?(word)

    tags << word
  end
  tags
end

mecab = Natto::MeCab.new
word_counts = Hash.new(0) # hashtag word => number of occurrences
pages = []
current_day = nil

CSV.foreach('tweets.csv', headers: true) do |row|
  day = Time.parse(row['timestamp']).to_date

  # Open a new page as soon as the day changes. Appending the page up front
  # (instead of flushing the previous one on the next boundary) guarantees
  # that the final day of the archive is exported too.
  if day != current_day
    current_day = day
    pages << { title: "tweets #{day.strftime('%Y-%m-%d')}", lines: [] }
  end

  body = strip_urls(row['text'])
  tags = noun_tags(mecab, body)
  tags.each { |word| word_counts[word] += 1 }

  # CSV yields nil for an unquoted empty field, so normalize before testing.
  expanded_urls = row['expanded_urls'].to_s

  lines = pages.last[:lines]
  lines << '' # blank separator line between tweets
  lines << body
  lines << expanded_urls.split(',').map { |url| "[#{url}]" }.join(',') unless expanded_urls.empty?
  lines << tags.map { |tag| "##{tag} " }.join
end

# Ranking of hashtag words by number of uses (enable to tune NOISE_WORD):
# word_ranking = word_counts.sort_by { |_, count| -count }
# puts word_ranking.first(200)

# Export in Scrapbox import format.
puts JSON.generate(pages: pages)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment