Last active
August 29, 2015 14:03
-
-
Save AlexEne/67f50c133d58738d106e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import matplotlib.pyplot as plt | |
import matplotlib.cm as cm | |
import bs4 | |
from lxml import etree | |
import codecs | |
import time | |
import pandas as pd | |
def scrap_page(start):
    """Scrape one IMDb search-results page and append its movies to ``movies.csv``.

    The page is requested from the IMDb title search, sorted by number of
    votes (descending) and restricted to feature films, beginning at result
    index ``start``. Each movie that has a genre is appended to
    ``movies.csv`` as one tab-separated line:
    name, year, rating, duration, number of votes, ``|``-joined genres.

    Args:
        start: Index of the first search result to request (sent as the
            ``start`` query parameter).

    Returns:
        The number of title cells processed on this page, including movies
        that were skipped because they have no genre. Returns 0 when the
        page contains no usable rows.
    """
    url = 'http://www.imdb.com/search/title'
    payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'}

    # Keep polling (one second apart) until the server answers 200 and the
    # results table is actually present in the returned document.
    while True:
        req = requests.get(url, params=payload)
        if req.status_code == 200:
            soup = bs4.BeautifulSoup(req.content)
            results = soup.findChild('table', {'class': 'results'})
            if results is not None:
                break
        time.sleep(1)  # try again

    # The final 'title' cell is left out, as in the original implementation.
    # NOTE(review): presumably it is not a regular movie row -- confirm.
    rows = results.findChildren('td', 'title')[:-1]

    # Open the output once per page instead of once per movie.
    with codecs.open('movies.csv', 'a', 'utf-8') as out:
        for title in rows:
            genre_span = title.findChild('span', {'class': 'genre'})
            if genre_span is None:
                continue  # no genre for this movie, just skip it.

            name = title.findAll('a', href=True)[0].text
            year = title.find('span', 'year_type').text
            rating = title.select('.value')[0].text

            runtime_span = title.findChild('span', {'class': 'runtime'})
            duration = runtime_span.text if runtime_span is not None else '0'

            genre = '|'.join(link.text for link in genre_span.findChildren('a'))

            # The vote count lives in a sibling cell of the same table row.
            num_votes = title.parent.findChild('td', {'class': 'sort_col'}).text
            num_votes = num_votes.replace(',', '')

            line = '\t'.join([name, year, rating, duration, num_votes, genre])
            out.write(line + '\n')

    # len(rows) equals the original ``i + 1`` but is also defined (0) when
    # the page yielded no rows, where ``i`` would have been unbound.
    return len(rows)
"""def scrap_page(start): | |
payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'} | |
req = requests.get('http://www.imdb.com/search/title', params=payload) | |
while req.status_code != 200: | |
time.sleep(1) | |
req = requests.get('http://www.imdb.com/search/title', params=payload) | |
parser = etree.Htmlparser() | |
tree = etree.parse(req.text, parser) | |
tree.#main > table > tbody > tr:nth-child(2) > td.title > a""" | |
def process_data(csv_file):
    """Load the scraped tab-separated movie file into a cleaned DataFrame.

    The file is read without a header row using the columns
    ``name, year, score, duration, votes, genre``; rows with any missing
    field are dropped. Afterwards:

    * ``duration`` becomes a float taken from the text before the first
      space (e.g. ``"142 mins."`` -> ``142.0``, ``"0"`` -> ``0.0``).
    * ``year`` becomes a float taken from the first four-digit run in the
      field, so both ``"(1994)"`` and disambiguated forms such as
      ``"(2004/I)"`` are accepted. A field without four consecutive digits
      becomes NaN.
    * One boolean column is added per distinct genre found in the
      ``|``-separated ``genre`` field, in alphabetical order.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        The processed ``pandas.DataFrame``.

    Raises:
        ValueError: If a duration field does not start with a number.
    """
    import re  # local import: only this function needs it

    columns = ['name', 'year', 'score', 'duration', 'votes', 'genre']
    data = pd.read_csv(csv_file, names=columns, delimiter='\t', encoding='utf-8',
                       engine='python').dropna()

    def parse_year(text):
        # str() guards against pandas having inferred a numeric column.
        match = re.search(r'\d{4}', str(text))
        return float(match.group()) if match else float('nan')

    # str() also covers the case where every duration is the placeholder '0'
    # and pandas therefore parsed the whole column as integers.
    data['duration'] = [float(str(d).split(' ')[0]) for d in data.duration]
    data['year'] = [parse_year(y) for y in data.year]

    # Split each genre string once and reuse the result for every column.
    genre_lists = [str(g).split('|') for g in data.genre]
    genres = sorted(set().union(*genre_lists))
    for genre in genres:
        data[genre] = [genre in movie_genres for movie_genres in genre_lists]
    return data
def plot_axis(ax, data_x, data_y, title='', show_x=False, show_y=False):
    """Draw a translucent scatter plot on ``ax`` with a minimal frame.

    Args:
        ax: Axes object to draw on.
        data_x: Values for the horizontal axis.
        data_y: Values for the vertical axis.
        title: Optional title; nothing is set when it is empty.
        show_x: Whether the x axis (ticks and labels) is visible.
        show_y: Whether the y axis (ticks and labels) is visible.
    """
    point_colour = "#3F5D7D"
    ax.scatter(data_x, data_y, alpha=0.3, color=point_colour, edgecolor=point_colour)

    x_axis = ax.axes.get_xaxis()
    y_axis = ax.axes.get_yaxis()
    x_axis.set_visible(show_x)
    y_axis.set_visible(show_y)

    # Hide every frame line except the bottom one, which stays visible.
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)

    if title:
        ax.set_title(title)

    # Ticks only on the bottom and left edges.
    x_axis.tick_bottom()
    y_axis.tick_left()
def main():
    """Load ``movies.csv`` and show score-versus-year scatter plots.

    The first panel of a 5x5 grid shows all movies; each following panel
    shows one genre and shares its axes with the first panel. Genres from
    the fixed list below that do not occur in the data are skipped.
    """
    # Scraping step used to build movies.csv (kept disabled):
    # total_scraped = 0
    # while total_scraped < 10000:
    #     total_scraped += scrap_page(total_scraped + 1)
    #     print 'Scraped {0}'.format(total_scraped)
    data = process_data('movies.csv')

    grid_rows, grid_cols = 5, 5
    fig = plt.figure(facecolor='white', figsize=(9, 9))
    genres = ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
              'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
              'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
    # process_data only creates a column for genres present in the file, so
    # drop the ones that are missing instead of failing with a KeyError.
    genres = [genre for genre in genres if genre in data.columns]
    n_plots = len(genres) + 1  # genre panels plus the overall panel

    ax1 = fig.add_subplot(grid_rows, grid_cols, 1)
    plot_axis(ax1, data.year, data.score, title='Overall score/year distribution', show_y=True)

    for i, genre in enumerate(genres):
        subset = data[data[genre]]
        position = i + 2  # panel 1 is the overall plot
        ax = fig.add_subplot(grid_rows, grid_cols, position, sharex=ax1, sharey=ax1, title=genre)
        plot_axis(ax, subset.year, subset.score)

        # Show the x axis on panels that have no panel directly below them
        # (with all 22 genres this is the original ``i > 16``).
        if position + grid_cols > n_plots:
            ax.axes.get_xaxis().set_visible(True)
            for label in ax.xaxis.get_ticklabels():
                label.set_rotation(45)

        # Show the y axis on the left-most column of the grid.
        if position % grid_cols == 1:
            ax.axes.get_yaxis().set_visible(True)

    # Axes are shared, so limiting the current one limits every panel.
    plt.xlim(1908, 2015)
    plt.show()
# Run the analysis only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment