AlexEne · August 29, 2015 14:03
diff --git a/more plot stuff b/more plot stuff
 import requests
 import matplotlib.pyplot as plt
 import matplotlib.cm as cm
 import bs4
 from lxml import etree
 import codecs
 import time
 import pandas as pd


 def scrap_page(start):
    """Scrape one IMDb search-results page and append its movies to ``movies.csv``.

    Requests the feature-film listing sorted by number of votes, beginning at
    result number ``start``. The request is repeated once per second, without
    limit, until the response has status 200 and contains the results table.

    Every movie that lists at least one genre is appended to ``movies.csv``
    as one tab-separated line: name, year, rating, duration, votes and the
    genres joined with ``|``. A movie without a runtime gets duration ``'0'``.

    Args:
        start: Value sent as the ``start`` query parameter of the search.

    Returns:
        The number of title cells examined on the page, including movies
        skipped for having no genre; 0 when the page holds none.
    """
    url = 'http://www.imdb.com/search/title'
    payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'}
    while True:
        req = requests.get(url, params=payload)
        results = bs4.BeautifulSoup(req.content).findChild('table', {'class': 'results'})
        if req.status_code == 200 and results:
            break
        time.sleep(1)  # try again

    # NOTE(review): the last matching cell is deliberately left out;
    # presumably it is not a movie row -- confirm against the page markup.
    titles = results.findChildren('td', 'title')[:-1]

    lines = []
    for title in titles:
        name = title.findAll('a', href=True)[0].text
        year = title.find('span', 'year_type').text
        rating = title.select('.value')[0].text
        d = title.findChild('span', {'class': 'runtime'})
        duration = d.text if d else '0'
        g = title.findChild('span', {'class': 'genre'})
        if not g:
            continue  # no genre for this movie, just skip it.
        genre = '|'.join(gen.text for gen in g.findChildren('a'))
        # The vote count lives in a sibling cell of the same table row.
        num_votes = title.parent.findChild('td', {'class': 'sort_col'}).text
        num_votes = num_votes.replace(',', '')
        lines.append('\t'.join([name, year, rating, duration, num_votes, genre]) + '\n')

    # Open the output once per page instead of once per movie.
    if lines:
        with codecs.open('movies.csv', 'a', 'utf-8') as f:
            f.write(''.join(lines))
    # Counting the cells (rather than reusing the loop index) also works
    # when the page has no rows at all.
    return len(titles)


 """def scrap_page(start):
    payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'}
    req = requests.get('http://www.imdb.com/search/title', params=payload)
    while req.status_code != 200:
        time.sleep(1)
        req = requests.get('http://www.imdb.com/search/title', params=payload)
    parser = etree.Htmlparser()
    tree = etree.parse(req.text, parser)
    tree.#main > table > tbody > tr:nth-child(2) > td.title > a"""


 def process_data(csv_file):
    """Load the scraped movie table and derive numeric and per-genre columns.

    The input is read as a header-less, tab-separated file with the columns
    name, year, score, duration, votes and genre. Rows with missing values
    are dropped.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame in which ``duration`` and ``year`` are floats and every
        genre found in the data has its own boolean column telling whether
        the movie belongs to it.
    """
    import csv  # local import: only the QUOTE_NONE constant is needed

    columns = ['name', 'year', 'score', 'duration', 'votes', 'genre']
    # The file holds raw tab-joined fields with no quoting, so a quote
    # character in a title must be read as ordinary text.
    data = pd.read_csv(csv_file, names=columns, delimiter='\t', encoding='utf-8',
                       engine='python', quoting=csv.QUOTE_NONE).dropna()
    # Keep the leading token of the duration (presumably '142 mins.' style
    # text, or the bare '0' placeholder). str() covers a column that pandas
    # inferred as purely numeric.
    data['duration'] = [float(str(r).split(' ')[0]) for r in data.duration]
    # Drop the first and last character of the year (presumably parentheses,
    # as in '(1994)').
    data['year'] = [float(y[1:-1]) for y in data.year]

    # Split each genre string once and reuse the result for every column.
    genre_sets = [set(movie.split('|')) for movie in data.genre]
    for genre in sorted(set().union(*genre_sets)):
        data[genre] = [genre in movie_genres for movie_genres in genre_sets]
    return data


 def plot_axis(ax, data_x, data_y, title='', show_x=False, show_y=False):
    """Draw a translucent scatter plot on ``ax`` with a stripped-down frame.

    Args:
        ax: Axes object to draw on.
        data_x: Values plotted along the x axis.
        data_y: Values plotted along the y axis.
        title: Title set on the axes; nothing is set when it is empty.
        show_x: Whether the x axis is visible.
        show_y: Whether the y axis is visible.
    """
    point_colour = "#3F5D7D"
    ax.scatter(data_x, data_y, alpha=0.3, color=point_colour, edgecolor=point_colour)

    # Axis visibility is left to the caller.
    axes = ax.axes
    axes.get_xaxis().set_visible(show_x)
    axes.get_yaxis().set_visible(show_y)

    # Hide every frame line except the bottom one.
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)

    if title:
        ax.set_title(title)

    # Ticks only along the bottom and the left.
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

 def main():
    """Plot score against release year, overall and for each genre.

    Loads ``movies.csv`` through ``process_data`` and shows a 5x5 grid of
    scatter plots that share both axes: the first panel holds every movie
    and each of the following 22 panels holds one genre.
    """
    # To (re)build movies.csv, enable the scraping loop first:
    # total_scraped = 0
    # while total_scraped < 10000:
    #     total_scraped += scrap_page(total_scraped + 1)
    #     print('Scraped {0}'.format(total_scraped))
    data = process_data('movies.csv')

    fig = plt.figure(facecolor='white', figsize=(9, 9))

    genres = ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
              'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance',
              'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']

    ax1 = fig.add_subplot(5, 5, 1)
    plot_axis(ax1, data.year, data.score, title='Overall score/year distribution', show_y=True)

    for i, genre in enumerate(genres):
        if genre in data.columns:
            d = data[data[genre].astype(bool)]
        else:
            # Genre absent from this data set: draw an empty panel so the
            # grid layout stays the same.
            d = data.iloc[0:0]
        ax = fig.add_subplot(5, 5, i + 2, sharex=ax1, sharey=ax1, title=genre)
        plot_axis(ax, d.year, d.score)
        if i > 16:
            # Grid positions 19-23 have no panel beneath them, so they
            # carry the x axis.
            ax.axes.get_xaxis().set_visible(True)
            for label in ax.xaxis.get_ticklabels():
                label.set_rotation(45)
        if (i + 1) % 5 == 0:
            # Grid position i + 2 falls in the left-most column.
            ax.axes.get_yaxis().set_visible(True)

    plt.xlim(1908, 2015)
    plt.show()

 # Run the plotting pipeline only when the file is executed as a script.
 if __name__ == '__main__':
    main()
	import requests
	import matplotlib.pyplot as plt
	import matplotlib.cm as cm
	import bs4
	from lxml import etree
	import codecs
	import time
	import pandas as pd


	def scrap_page(start):
	    """Scrape one IMDb search-results page and append its movies to ``movies.csv``.

	    Requests the feature-film listing sorted by number of votes, beginning at
	    result number ``start``. The request is repeated once per second, without
	    limit, until the response has status 200 and contains the results table.

	    Every movie that lists at least one genre is appended to ``movies.csv``
	    as one tab-separated line: name, year, rating, duration, votes and the
	    genres joined with ``|``. A movie without a runtime gets duration ``'0'``.

	    Args:
	        start: Value sent as the ``start`` query parameter of the search.

	    Returns:
	        The number of title cells examined on the page, including movies
	        skipped for having no genre; 0 when the page holds none.
	    """
	    url = 'http://www.imdb.com/search/title'
	    payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'}
	    while True:
	        req = requests.get(url, params=payload)
	        results = bs4.BeautifulSoup(req.content).findChild('table', {'class': 'results'})
	        if req.status_code == 200 and results:
	            break
	        time.sleep(1)  # try again

	    # NOTE(review): the last matching cell is deliberately left out;
	    # presumably it is not a movie row -- confirm against the page markup.
	    titles = results.findChildren('td', 'title')[:-1]

	    lines = []
	    for title in titles:
	        name = title.findAll('a', href=True)[0].text
	        year = title.find('span', 'year_type').text
	        rating = title.select('.value')[0].text
	        d = title.findChild('span', {'class': 'runtime'})
	        duration = d.text if d else '0'
	        g = title.findChild('span', {'class': 'genre'})
	        if not g:
	            continue  # no genre for this movie, just skip it.
	        # Plain '|' separator; the backslash in the pasted text was a
	        # markdown-escaping artifact.
	        genre = '|'.join(gen.text for gen in g.findChildren('a'))
	        # The vote count lives in a sibling cell of the same table row.
	        num_votes = title.parent.findChild('td', {'class': 'sort_col'}).text
	        num_votes = num_votes.replace(',', '')
	        lines.append('\t'.join([name, year, rating, duration, num_votes, genre]) + '\n')

	    # Open the output once per page instead of once per movie.
	    if lines:
	        with codecs.open('movies.csv', 'a', 'utf-8') as f:
	            f.write(''.join(lines))
	    # Counting the cells (rather than reusing the loop index) also works
	    # when the page has no rows at all.
	    return len(titles)


	"""def scrap_page(start):
	payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'}
	req = requests.get('http://www.imdb.com/search/title', params=payload)
	while req.status_code != 200:
	time.sleep(1)
	req = requests.get('http://www.imdb.com/search/title', params=payload)
	parser = etree.Htmlparser()
	tree = etree.parse(req.text, parser)
	tree.#main > table > tbody > tr:nth-child(2) > td.title > a"""


	def process_data(csv_file):
	    """Load the scraped movie table and derive numeric and per-genre columns.

	    The input is read as a header-less, tab-separated file with the columns
	    name, year, score, duration, votes and genre. Rows with missing values
	    are dropped.

	    Args:
	        csv_file: Path or file-like object accepted by ``pandas.read_csv``.

	    Returns:
	        A DataFrame in which ``duration`` and ``year`` are floats and every
	        genre found in the data has its own boolean column telling whether
	        the movie belongs to it.
	    """
	    import csv  # local import: only the QUOTE_NONE constant is needed

	    columns = ['name', 'year', 'score', 'duration', 'votes', 'genre']
	    # The file holds raw tab-joined fields with no quoting, so a quote
	    # character in a title must be read as ordinary text.
	    data = pd.read_csv(csv_file, names=columns, delimiter='\t', encoding='utf-8',
	                       engine='python', quoting=csv.QUOTE_NONE).dropna()
	    # Keep the leading token of the duration (presumably '142 mins.' style
	    # text, or the bare '0' placeholder). str() covers a column that pandas
	    # inferred as purely numeric.
	    data['duration'] = [float(str(r).split(' ')[0]) for r in data.duration]
	    # Drop the first and last character of the year (presumably parentheses,
	    # as in '(1994)').
	    data['year'] = [float(y[1:-1]) for y in data.year]

	    # Genres are separated by a plain '|'; the backslash in the pasted text
	    # was a markdown-escaping artifact. Split each string once and reuse it.
	    genre_sets = [set(movie.split('|')) for movie in data.genre]
	    for genre in sorted(set().union(*genre_sets)):
	        data[genre] = [genre in movie_genres for movie_genres in genre_sets]
	    return data


	def plot_axis(ax, data_x, data_y, title='', show_x=False, show_y=False):
	    """Draw a translucent scatter plot on ``ax`` with a stripped-down frame.

	    Args:
	        ax: Axes object to draw on.
	        data_x: Values plotted along the x axis.
	        data_y: Values plotted along the y axis.
	        title: Title set on the axes; nothing is set when it is empty.
	        show_x: Whether the x axis is visible.
	        show_y: Whether the y axis is visible.
	    """
	    point_colour = "#3F5D7D"
	    ax.scatter(data_x, data_y, alpha=0.3, color=point_colour, edgecolor=point_colour)

	    # Axis visibility is left to the caller.
	    ax.axes.get_xaxis().set_visible(show_x)
	    ax.axes.get_yaxis().set_visible(show_y)

	    # Hide every frame line except the bottom one.
	    for side in ('right', 'top', 'left'):
	        ax.spines[side].set_visible(False)

	    if title:
	        ax.set_title(title)

	    # Ticks only along the bottom and the left.
	    ax.get_xaxis().tick_bottom()
	    ax.get_yaxis().tick_left()


	def main():
	    """Plot score against release year, overall and for each genre.

	    Loads ``movies.csv`` through ``process_data`` and shows a 5x5 grid of
	    scatter plots that share both axes: the first panel holds every movie
	    and each of the following 22 panels holds one genre.
	    """
	    # To (re)build movies.csv, enable the scraping loop first:
	    # total_scraped = 0
	    # while total_scraped < 10000:
	    #     total_scraped += scrap_page(total_scraped + 1)
	    #     print('Scraped {0}'.format(total_scraped))
	    data = process_data('movies.csv')

	    fig = plt.figure(facecolor='white', figsize=(9, 9))

	    genres = ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
	              'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance',
	              'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']

	    ax1 = fig.add_subplot(5, 5, 1)
	    plot_axis(ax1, data.year, data.score, title='Overall score/year distribution', show_y=True)

	    for i, genre in enumerate(genres):
	        if genre in data.columns:
	            d = data[data[genre].astype(bool)]
	        else:
	            # Genre absent from this data set: draw an empty panel so the
	            # grid layout stays the same.
	            d = data.iloc[0:0]
	        ax = fig.add_subplot(5, 5, i + 2, sharex=ax1, sharey=ax1, title=genre)
	        plot_axis(ax, d.year, d.score)
	        if i > 16:
	            # Grid positions 19-23 have no panel beneath them, so they
	            # carry the x axis.
	            ax.axes.get_xaxis().set_visible(True)
	            for label in ax.xaxis.get_ticklabels():
	                label.set_rotation(45)
	        if (i + 1) % 5 == 0:
	            # Grid position i + 2 falls in the left-most column.
	            ax.axes.get_yaxis().set_visible(True)

	    plt.xlim(1908, 2015)
	    plt.show()


	# Run the plotting pipeline only when the file is executed as a script.
	if __name__ == '__main__':
	    main()