Simple Python 3.5.2 Marktplaats scraping example with Beautiful Soup 4
import requests
import urllib.parse
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
def crawlmp(base_url, parameters, add_to_url=''):
    """Crawl Marktplaats search results.

    The parameters dictionary can contain:
    query[str],
    categoryId[int],
    searchOnTitleAndDescription[bool],
    startDateFrom[str]
    """
    default = {
        'query': '',
        'searchOnTitleAndDescription': "true",
        'startDateFrom': 'always'
    }
    # merge the default args and the provided ones
    args = {**default, **parameters}
    urlargs = urllib.parse.urlencode(args)
    # build the search URL and request the first page
    url = "http://www.marktplaats.nl/{}?{}{}".format(base_url, urlargs, add_to_url)
    r = requests.get(url)
    # get content of page
    data = r.text
    # produce DOM tree
    soup = BeautifulSoup(data, 'html.parser')
    # list to collect results in the loop below
    price = []
    # price = np.array([])
    # find how many pages this query has
    last_page = soup.find('span', class_="last")
    if last_page is None:
        pages = 1
    else:
        pages = int(last_page.text)
    amount_of_pages = range(0, pages)
    for i in amount_of_pages:
        current_page = {'currentPage': str(i + 1)}
        # Marktplaats needs an additional argument for the last page
        if i == amount_of_pages[-1]:
            current_page['lastPage'] = 'true'
        r = requests.get(url + '&' + urllib.parse.urlencode(current_page))
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        # progress indicator
        print("req nr:", i)
        for ad in soup.find_all('article', class_='search-result'):
            try:
                price_tag = ad.find('span', class_='price-new').text.replace('.', '')
                if price_tag.find(',') != -1:
                    price.append({
                        'price': float(price_tag.split('€').pop().strip().replace(',', '.')),
                        'date': ad.find('div', class_='date').getText().strip(),
                        'seller': ad.find('div', class_='seller-name ellipsis')['title'],
                        'title': ad.find('span', class_='mp-listing-title')['title']
                    })
            except (AttributeError, KeyError, ValueError):
                # skip ads without a numeric price or with missing fields
                print("Non-numeric price")
                # print("Unexpected error:", sys.exc_info()[0])
    return price
# Example: distribution plot of all Apple laptop prices
MB = crawlmp(
    'z/computers-en-software/laptops-apple.html',
    {
        'categoryId': '325',
        'query': '',
        'searchOnTitleAndDescription': "true",
        'startDateFrom': 'always'
    }
)
# Get a list of all the prices
MB_prices = np.array([d['price'] for d in MB])
# Get the average price of the results
print("Average price: {}".format(np.average(MB_prices)))
# Plot a histogram of the prices in €50 bins
n, bins, patches = plt.hist(MB_prices, bins=range(0, 3000, 50))
plt.show()
@willwillems (Author) commented:
Example result for all 13-inch MacBook Pros:

X-axis: price in €
Y-axis: number of ads per bin
Bin size: €50
[histogram of 13-inch MacBook Pro prices]
Data collected on 17-02-17
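
A minimal sketch of how such a plot could be produced with the crawlmp function above; the query string is an assumption (the comment does not state it), and only categoryId '325' comes from the example in the gist itself. It assumes the imports and function definition earlier in this gist are already loaded.

# Sketch: reproduce a 13-inch MacBook Pro price histogram.
# 'macbook pro 13' is an assumed query, not taken from the gist.
MBP13 = crawlmp(
    'z/computers-en-software/laptops-apple.html',
    {
        'categoryId': '325',
        'query': 'macbook pro 13',
        'searchOnTitleAndDescription': "true",
        'startDateFrom': 'always'
    }
)
MBP13_prices = np.array([d['price'] for d in MBP13])
plt.hist(MBP13_prices, bins=range(0, 3000, 50))  # €50 bins, matching the plot
plt.show()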
