Created
August 7, 2023 13:25
-
-
Save jvangael/25f017952f367f6248b3dcceead7ad48 to your computer and use it in GitHub Desktop.
Quick hack to crawl the planning website of the South Cambridgeshire District Council
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import csv
import logging
import os
import random
import re
import sys
import time
from datetime import date, timedelta

import requests
from bs4 import BeautifulSoup

# Disable SSL warnings for now.
import urllib3
urllib3.disable_warnings()
# Browser-impersonating headers shared by every request in this script.
# init_crawler() later adds a 'Cookie' entry with the JSESSIONID so all
# requests reuse the same server-side session.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-GB,en;q=0.9,nl-NL;q=0.8,nl;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'applications.greatercambridgeplanning.org',
    'Origin': 'https://applications.greatercambridgeplanning.org',
    'Referer': 'https://applications.greatercambridgeplanning.org/online-applications/search.do?action=monthlyList',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"'
}

# CSRF token scraped from the landing page by init_crawler(); the search
# form POSTs include it as the '_csrf' field.
CSRF = ''

# Column order for the CSV streamed to stdout by applications_main().
# 'key' is the crawler's own identifier; the remaining names are the
# field labels scraped from the detail tables (<th> text), so they must
# match the site's labels exactly.
CSV_COLUMNS = [
    'key','Reference',
    'Application Received', 'Application Validated', 'Address', 'Proposal', 'Status',
    'Appeal Status', 'Appeal Decision', 'Application Type', 'Expected Decision Level',
    'Case Officer', 'Parish', 'Ward', 'District Reference', 'Applicant Name', 'Agent Name',
    'Agent Company Name', 'Agent Address', 'Environmental Assessment Requested',
    'Applicant Address', 'Decision', 'Decision Issued Date', 'Actual Decision Level',
    'Application Validated Date', 'Actual Committee Date', 'Neighbour Consultation Expiry Date',
    'Standard Consultation Date', 'Standard Consultation Expiry Date',
    'Last Advertised In Press Date', 'Latest Advertisement Expiry Date',
    'Last Site Notice Posted Date', 'Latest Site Notice Expiry Date', 'Agreed Expiry Date',
    'Permission Expiry Date', 'Determination Deadline', 'Temporary Permission Expiry Date'
]
def wait_random_seconds(min, max):
    """Sleep for a random whole number of seconds in [min, max] inclusive.

    The parameter names shadow the builtins but are kept unchanged for
    interface compatibility with existing callers.

    Bug fix: the original computed min + randint(1, max - min), which could
    never sleep exactly `min` seconds and raised ValueError when min == max.
    randint(min, max) covers the full inclusive range.
    """
    time.sleep(random.randint(min, max))
def init_crawler():
    """Fetch the search landing page to obtain a CSRF token and session cookie.

    Side effects: sets the module-level CSRF token (sent with every search
    form POST) and stores the JSESSIONID cookie in HEADERS so subsequent
    requests reuse the same server-side session.
    """
    # Bug fix: without `global`, the assignment below created a local and
    # the module-level CSRF stayed '', so every form POST sent an empty token.
    global CSRF
    token_url = 'https://applications.greatercambridgeplanning.org/online-applications/'
    server = requests.get(token_url, headers=HEADERS, verify=False)
    # The token is embedded in a hidden <input name="_csrf"> on the page.
    CSRF = re.findall(r'name="_csrf" value="([-\w]+)"', server.text)[0]
    logging.info("CSRF token is {}".format(CSRF))
    jsession_id = server.cookies['JSESSIONID']
    logging.info("JSESSIONID is {}".format(jsession_id))
    HEADERS['Cookie'] = 'JSESSIONID={}'.format(jsession_id)
def crawl_search(month):
    """Return the raw HTML of the paged search results for one month.

    First POSTs the monthly-list search form (authority 505) to establish
    the search criteria in the server-side session, then POSTs a paging
    request for a single page of up to 1000 results ordered by date
    received. Requires init_crawler() to have populated CSRF and the
    session cookie in HEADERS.

    Bug fix: a leftover debug `print(server.text); exit()` terminated the
    process after the first POST and made the rest of the function
    unreachable; it has been removed along with dead commented-out code.
    """
    search_url = 'https://applications.greatercambridgeplanning.org/online-applications/monthlyListResults.do?action=firstPage'
    search_form_data = {
        'searchCriteria.localAuthority': '505',  # South Cambridgeshire
        'searchCriteria.parish': '',
        'searchCriteria.ward': '',
        'month': '{}'.format(month),
        'dateType': 'DC_Validated',
        'searchType': 'Application',
        '_csrf': CSRF,
    }
    # Establish the search; the session remembers the criteria, so the
    # paging request below only needs paging parameters.
    requests.post(search_url, headers=HEADERS, data=search_form_data, verify=False)
    list_form_data = {
        '_csrf': CSRF,
        'searchCriteria.resultsPerPage': '1000',
        'action': 'page',
        'searchCriteria.page': '1',
        'orderBy': 'DateReceived',
        'orderByDirection': 'Descending'
    }
    list_url = 'https://applications.greatercambridgeplanning.org/online-applications/pagedSearchResults.do'
    server = requests.post(list_url, headers=HEADERS, data=list_form_data, verify=False)
    return server.text
def crawl_application(key, tab):
    """Fetch one tab of an application's detail page.

    Args:
        key: the site's keyVal identifier for the application.
        tab: activeTab value, e.g. 'summary', 'details' or 'dates'.

    Returns:
        The page HTML, or None when the server responds with a non-200
        status.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-GB,en;q=0.9,nl-NL;q=0.8,nl;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Bug fix: was a hard-coded (stale) JSESSIONID; reuse the session
        # cookie that init_crawler() stored in the shared HEADERS.
        'Cookie': HEADERS.get('Cookie', ''),
        'Host': 'applications.greatercambridgeplanning.org',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"'
    }
    application_url = 'https://applications.greatercambridgeplanning.org/online-applications/applicationDetails.do?activeTab={}&keyVal={}'.format(tab, key)
    logging.info("GET {}".format(application_url))
    server = requests.get(application_url, headers=headers, verify=False)
    if server.status_code != 200:
        logging.info("HTTP Status {}".format(server.status_code))
        return None
    return server.text
def search_main(args):
    """Crawl the monthly search listings and print the raw result HTML.

    Implements the original block's TODO: the month list is generated
    dynamically (newest first) from the current month back to Jul 2020
    instead of being hard-coded.

    NOTE(review): the [:1] slice restricts the crawl to the newest month
    only -- looks like a debug leftover; widen it once parsing works.
    """
    all_months = _months_newest_first(date(2020, 7, 1))
    print('Search')
    for month in all_months[:1]:
        logging.info("Retrieving details for {}".format(month))
        raw_text = crawl_search(month)
        print(raw_text)

def _months_newest_first(earliest):
    """Return month labels like 'Jul 23', from the current month back to `earliest` (a date)."""
    months = []
    current = date.today().replace(day=1)
    while current >= earliest:
        months.append(current.strftime('%b %y'))
        # Step back one month: day 1 minus one day lands in the previous month.
        current = (current - timedelta(days=1)).replace(day=1)
    return months
def applications_main(args):
    """Crawl per-application detail tabs for every planning key in the input file.

    Reads planning keys (one per line) from a local file, fetches three
    detail tabs per application (caching the raw HTML on disk so reruns
    skip the network), scrapes each tab's th/td table into a dict and
    streams the rows as CSV to stdout. Throttles between applications
    that required at least one live request.
    """
    # First retrieve all planning keys.
    with open('/Users/jvg/Desktop/ref_codes', 'r') as f:
        planning_keys = [line.rstrip("\n") for line in f]
    logging.info("Found {} planning keys. Starting crawl.".format(len(planning_keys)))
    csv_writer = csv.DictWriter(sys.stdout, CSV_COLUMNS, delimiter=',')
    csv_writer.writeheader()
    # Loop-invariant: tab name (activeTab URL parameter) -> id of the
    # table element to scrape on that tab.
    tabs_config = {
        'summary': 'simpleDetailsTable',
        'details': 'applicationDetails',
        'dates': 'simpleDetailsTable',
    }
    # Start the actual crawling.
    for key in planning_keys:
        logging.info("Retrieving details for {}".format(key))
        # Bug fix: the original `cached` flag reflected only the *last*
        # tab, so throttling was skipped whenever the final tab happened
        # to be cached even though earlier tabs hit the server live.
        fetched_live = False
        data = {
            'key': key
        }
        for tab_name, table_id in tabs_config.items():
            cache_filename = "/Users/jvg/Desktop/cache/{}_{}.html".format(key, tab_name)
            if os.path.exists(cache_filename):
                with open(cache_filename, 'r') as f:
                    raw_html = f.read()
            else:
                fetched_live = True
                raw_html = crawl_application(key, tab_name)
                if raw_html is None:
                    # Non-200 response: abort the whole crawl (sys.exit is
                    # the explicit form of the builtin exit()).
                    sys.exit(1)
                if "Unable to perform this task. A remote exception occurred." in raw_html:
                    # Server-side error for this tab: skip it (and don't
                    # cache the error page), keep the remaining tabs.
                    continue
                with open(cache_filename, 'w') as f:
                    f.write(raw_html)
            soup = BeautifulSoup(raw_html, 'html.parser')
            table = soup.find_all('table', id=table_id)[0]
            # Each data row pairs a field name (<th>) with its value (<td>).
            for row in table.find_all('tr'):
                data[row.find_all('th')[0].text.strip()] = row.find_all('td')[0].text.strip()
        csv_writer.writerow(data)
        if fetched_live:
            logging.info("WAITING")
            wait_random_seconds(30, 60)
# Set up the command-line interface: one subcommand per crawl mode.
parser = argparse.ArgumentParser(
    prog='planning_crawl.py',
    description='Various commands to crawl the website of the Greater Cambridge Shared Planning service')
# Bug fix: without required=True, running with no subcommand crashed
# later with AttributeError on args.func; now argparse prints usage.
subparsers = parser.add_subparsers(required=True)
search_parser = subparsers.add_parser('search')
search_parser.set_defaults(func=search_main)
applications_parser = subparsers.add_parser('applications')
applications_parser.set_defaults(func=applications_main)

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    # Bug fix: parse arguments before touching the network, so `--help`
    # and argument errors no longer trigger an HTTP request.
    args = parser.parse_args()
    init_crawler()
    args.func(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment