Created
November 20, 2014 16:13
-
-
Save nubela/f4452e34a4adfd38ecf6 to your computer and use it in GitHub Desktop.
PropertyGuru scraper because why the f*ck do websites block copy paste?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from BeautifulSoup import BeautifulSoup | |
import requests | |
# Output file for the scraped results (a single JSON array).
FILE_NAME = "result.txt"
# Listing hrefs on the results page are relative; this is prepended to them.
BASE_URL = "http://www.propertyguru.com.sg/"
# Search-results URL template; %d is the 1-based results-page number.
# FIX: "&center_lat" / "&center_long" had been mangled into "¢er_lat" /
# "¢er_long" by HTML-entity decoding ("&cent;" -> U+00A2); restored here.
URL = "http://www.propertyguru.com.sg/singapore-property-listing/property-for-rent/%d?property_type=H" \
      "&property_type_code[]=HDB&minprice=1500&maxprice=2500&minsize=1000&distance=0.5&center_lat=1" \
      ".39126455055&center_long=103.89543056488&latitude=1.39126455055&longitude=103.89543056488"
def get_listings(page=1):
    """Return the absolute URLs of every listing on one search-results page.

    page: 1-based results-page number, substituted into URL's %d slot.
    Returns a list of full listing URLs (BASE_URL + relative href).
    """
    response = requests.get(URL % page)
    soup = BeautifulSoup(response.text)
    # The trailing space in the class string matches the site's markup exactly
    # — do not "fix" it or findAll returns nothing.
    anchors = soup.findAll("a", {"class": "infotitle listing_action clearfix "})
    return [BASE_URL + a["href"] for a in anchors]
def process_listings(url):
    """Scrape a single listing page and return its details as a dict.

    Returns keys: agent_name, no (phone number), price (int, S$/month),
    size (int, sqft — presumably; verify against the site), address (str).
    Raises AttributeError/IndexError if the page layout differs from the
    expected markup (e.g. the request was blocked or the site changed).
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text)

    agent_info = soup.find("div", {"class": "agent_info"})
    name = agent_info.h3.string.strip()
    # The phone number is the first line of the agent <div>'s text.
    no = agent_info.div.string.strip().split("\r")[0]

    info_all = soup.find("div", {"class": "info1"}).findAll("p")
    # e.g. "S$ 2,200 / month" -> 2200
    price = int(info_all[0].span.string.strip()
                .replace("S$ ", "").replace(" / month", "").replace(",", ""))
    # e.g. "1,200 sqft" -> 1200
    size = int(info_all[2].string.strip().split(" ")[0].replace(",", ""))

    # Normalise whitespace in the address: tabs and CRLFs become spaces,
    # then runs of spaces collapse to one.
    # FIX: the loop must replace DOUBLE spaces with single ones; as rendered
    # it replaced a space with a space and would never terminate.
    address = info_all[3].string.replace("\t", " ").replace("\r\n", " ")
    while "  " in address:
        address = address.replace("  ", " ")

    return {
        "agent_name": name,
        "no": no,
        "price": price,
        "size": size,
        "address": address,
    }
# Scrape the first 5 results pages and write everything as one JSON array.
# FIX: previously `results` was reset and FILE_NAME rewritten on every
# iteration of the page loop, so only the final page's listings survived.
# NOTE(review): get_listings defaults to page=1 but range(5) starts at 0 —
# page 0 may duplicate page 1; confirm against the site's pagination.
results = []
for page in range(5):
    for listing_url in get_listings(page):
        results.append(process_listings(listing_url))

with open(FILE_NAME, "w") as f:
    f.write(json.dumps(results))
@ganeshraj is the script still working?
I tested the code, but unfortunately it no longer works: the site rejects
requests made by the `requests` library.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this code. I'm trying to filter PropertyGuru for places with no live-in landlords, but the site won't let me apply that filter.