Simple Python 3.5.2 Marktplaats scraping example with Beautiful Soup 4
import requests
import urllib.parse
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
def crawlmp(base_url, parameters, add_to_url=''):
    """Crawl Marktplaats search results.

    The parameters dictionary can contain:
    query[str],
    categoryId[int],
    searchOnTitleAndDescription[bool],
    startDateFrom[str]
    """
    default = {
        'query': '',
        'searchOnTitleAndDescription': "true",
        'startDateFrom': 'always'
    }
    # merge the default args and the provided ones
    args = {**default, **parameters}
    urlargs = urllib.parse.urlencode(args)
    # build the search URL and request the first page
    url = "http://www.marktplaats.nl/{}?{}{}".format(base_url, urlargs, add_to_url)
    r = requests.get(url)
    # get content of page
    data = r.text
    # produce DOM tree
    soup = BeautifulSoup(data, 'html.parser')
    # list to collect results in the loop below
    price = []
    # price = np.array([])
    # find how many pages this query has
    last_page = soup.find('span', class_="last")
    if last_page is None:
        pages = 1
    else:
        pages = int(last_page.text)
    amount_of_pages = range(0, pages)
    for i in amount_of_pages:
        current_page = {'currentPage': str(i + 1)}
        # Marktplaats needs an additional argument for the last page
        if i == amount_of_pages[-1]:
            current_page['lastPage'] = 'true'
        r = requests.get(url + '&' + urllib.parse.urlencode(current_page))
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        # progress indicator
        print("req nr:", i)
        for ad in soup.find_all('article', class_='search-result'):
            try:
                price_tag = ad.find('span', class_='price-new').text.replace('.', '')
                if price_tag.find(',') != -1:
                    price.append({
                        'price': float(price_tag.split('€').pop().strip().replace(',', '.')),
                        'date': ad.find('div', class_='date').getText().strip(),
                        'seller': ad.find('div', class_='seller-name ellipsis')['title'],
                        'title': ad.find('span', class_='mp-listing-title')['title']
                    })
            except (AttributeError, KeyError, ValueError):
                # skip ads without a numeric price or with missing fields
                print("Non-numeric price")
                # print("Unexpected error:", sys.exc_info()[0])
    return price
# Example: distribution plot of all Apple laptop prices
MB = crawlmp(
    'z/computers-en-software/laptops-apple.html',
    {
        'categoryId': '325',
        'query': '',
        'searchOnTitleAndDescription': "true",
        'startDateFrom': 'always'
    }
)
# Get a list of all the prices
MB_prices = np.array([d['price'] for d in MB])
# Get the average price of the results
print("Average price: {}".format(np.average(MB_prices)))
# Plot a histogram of the prices in €50 bins
n, bins, patches = plt.hist(MB_prices, bins=range(0, 3000, 50))
plt.show()
@willwillems (Author) commented:
Example result for all 13-inch MacBook Pros:

X-axis: price in €
Y-axis: number of ads per bin
Bin size: €50
[histogram of 13-inch MacBook Pro prices]
Data collected on 17-02-17
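
A minimal sketch of how such a plot could be produced with the crawlmp function above; the query string is an assumption (the comment does not state it), and only categoryId '325' comes from the example in the gist itself. It assumes the imports and function definition earlier in this gist are already loaded.

# Sketch: reproduce a 13-inch MacBook Pro price histogram.
# 'macbook pro 13' is an assumed query, not taken from the gist.
MBP13 = crawlmp(
    'z/computers-en-software/laptops-apple.html',
    {
        'categoryId': '325',
        'query': 'macbook pro 13',
        'searchOnTitleAndDescription': "true",
        'startDateFrom': 'always'
    }
)
MBP13_prices = np.array([d['price'] for d in MBP13])
plt.hist(MBP13_prices, bins=range(0, 3000, 50))  # €50 bins, matching the plot
plt.show()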
