Last active
August 29, 2015 14:16
-
-
Save QB/6a18ec6692922b6710fe to your computer and use it in GitHub Desktop.
早稲田大学の研究者データベースをスクレイピングするよ。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "mechanize" | |
require "pp" | |
# Scrapes the paginated researcher list served at {URI} and turns each
# table row into a plain Hash.
class Employees
  # Listing endpoint; used for both the initial GET and the paged POSTs.
  URI = "https://www.wnp7.waseda.jp/Rdb/app/ip/ipi0203.html?lang_kbn=0".freeze

  # Step between consecutive `next` offsets sent to the server.
  # NOTE(review): presumably the number of rows per result page - confirm.
  PER_PAGE = 15

  # Raised when the pager text does not have the expected layout.
  class ParseError < StandardError; end

  def initialize
    @agent = Mechanize.new
  end

  # Fetches every result page and concatenates the parsed rows.
  #
  # @return [Array<Hash>] one Hash per row, in page order (see #parse_list)
  def get_employees
    max = get_max
    # Ceiling division so that a partially filled last page is fetched too.
    # NOTE(review): assumes get_max is the total number of records - confirm.
    page_count = (max + PER_PAGE - 1) / PER_PAGE
    page_count.times.flat_map { |i| get_list(i * PER_PAGE) }
  end

  # Reads the leading integer from the pager text on the first page.
  #
  # @return [Integer] the number found in the pager text
  # @raise [ParseError] if the pager cell or its leading digits are missing
  def get_max
    page = @agent.get(URI)
    cell = page.search("center").search("td")[1]
    # The fifth tab-separated chunk of the cell text holds the pager figure.
    page_number = cell && cell.children.text.split("\t")[4]
    digits = page_number.to_s[/^(\d+)/, 1]
    raise ParseError, "could not read the record count from #{URI}" unless digits

    digits.to_i
  end

  # Requests the result page starting at offset +n+ and parses its rows.
  #
  # @param n [Integer] value sent as the `next` form field
  # @return [Array<Hash>] parsed rows of that page
  def get_list(n=0)
    page = @agent.post(URI, next: n)
    page.encoding = "eucjp-ms"
    list = page.search("table")[2].search("tr")
    list.shift # drop the first row (presumably the header row)
    parse_list(list)
  end

  # Converts table rows into Hashes.
  #
  # @param list [Enumerable] row nodes; each must answer `children.children`
  # @return [Array<Hash>] hashes with the keys :name, :number, :reading,
  #   :belonging, :title and :specialty
  def parse_list(list)
    list.map do |r|
      cells = r.children.children
      link = cells[1]
      {
        name: link.children[0].text,
        # Keep only the digits of the link target.
        number: link.attributes["href"].value.gsub(/\D/, ''),
        reading: cells[2].text.strip,
        belonging: cells[3].text.strip,
        title: cells[4].text.strip,
        specialty: cells[5].text.strip,
      }
    end
  end

  # Debugging aid: prints a separator line.
  def bar
    puts "="*20
  end
end
# Driver: scrape every listed employee and pretty-print the result.
scraper = Employees.new
all_emps = scraper.get_employees
pp all_emps
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment