@jvangael
Created August 7, 2023 13:25
Quick hack to crawl the planning website of the South Cambridgeshire District Council
import argparse
import csv
import logging
import os
import requests
import random
import re
import sys
import time
from bs4 import BeautifulSoup
# Disable SSL warnings for now.
import urllib3
urllib3.disable_warnings()
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-GB,en;q=0.9,nl-NL;q=0.8,nl;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'applications.greatercambridgeplanning.org',
    'Origin': 'https://applications.greatercambridgeplanning.org',
    'Referer': 'https://applications.greatercambridgeplanning.org/online-applications/search.do?action=monthlyList',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"'
}
CSRF = ''  # Populated by init_crawler() once a session has been established.
CSV_COLUMNS = [
    'key', 'Reference',
    'Application Received', 'Application Validated', 'Address', 'Proposal', 'Status',
    'Appeal Status', 'Appeal Decision', 'Application Type', 'Expected Decision Level',
    'Case Officer', 'Parish', 'Ward', 'District Reference', 'Applicant Name', 'Agent Name',
    'Agent Company Name', 'Agent Address', 'Environmental Assessment Requested',
    'Applicant Address', 'Decision', 'Decision Issued Date', 'Actual Decision Level',
    'Application Validated Date', 'Actual Committee Date', 'Neighbour Consultation Expiry Date',
    'Standard Consultation Date', 'Standard Consultation Expiry Date',
    'Last Advertised In Press Date', 'Latest Advertisement Expiry Date',
    'Last Site Notice Posted Date', 'Latest Site Notice Expiry Date', 'Agreed Expiry Date',
    'Permission Expiry Date', 'Determination Deadline', 'Temporary Permission Expiry Date'
]
def wait_random_seconds(min_seconds, max_seconds):
    # Sleep for a random number of seconds between min_seconds and max_seconds.
    seconds = min_seconds + random.randint(1, max_seconds - min_seconds)
    time.sleep(seconds)
def init_crawler():
    # First get a CSRF token and a JSESSIONID cookie, and store them for later requests.
    global CSRF
    token_url = 'https://applications.greatercambridgeplanning.org/online-applications/'
    server = requests.get(token_url, headers=HEADERS, verify=False)
    CSRF = re.findall(r'name="_csrf" value="([-\w]+)"', server.text)[0]
    logging.info("CSRF token is {}".format(CSRF))
    jsession_id = server.cookies['JSESSIONID']
    logging.info("JSESSIONID is {}".format(jsession_id))
    HEADERS['Cookie'] = 'JSESSIONID={}'.format(jsession_id)
def crawl_search(month):
    search_url = 'https://applications.greatercambridgeplanning.org/online-applications/monthlyListResults.do?action=firstPage'
    search_form_data = {
        'searchCriteria.localAuthority': '505',
        'searchCriteria.parish': '',
        'searchCriteria.ward': '',
        'month': '{}'.format(month),
        'dateType': 'DC_Validated',
        'searchType': 'Application',
        '_csrf': CSRF,
    }
    server = requests.post(search_url, headers=HEADERS, data=search_form_data, verify=False)
    # Debug left in from development: dump the first page of results and stop here.
    print(server.text)
    sys.exit()
    list_form_data = {
        #'searchCriteria.localAuthority': '505',
        #'searchCriteria.parish': '',
        #'searchCriteria.ward': '',
        #'month': '{}'.format(month),
        #'dateType': 'DC_Validated',
        #'searchType': 'Application',
        '_csrf': CSRF,
        'searchCriteria.resultsPerPage': '1000',
        'action': 'page',
        'searchCriteria.page': '1',
        'orderBy': 'DateReceived',
        'orderByDirection': 'Descending'
    }
    #list_url = 'https://applications.greatercambridgeplanning.org/online-applications/monthlyListResults.do?action=page'
    list_url = 'https://applications.greatercambridgeplanning.org/online-applications/pagedSearchResults.do'
    server = requests.post(list_url, headers=HEADERS, data=list_form_data, verify=False)
    return server.text
def crawl_application(key, tab):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-GB,en;q=0.9,nl-NL;q=0.8,nl;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Reuse the session cookie obtained in init_crawler() rather than a hard-coded, stale JSESSIONID.
        'Cookie': HEADERS.get('Cookie', ''),
        'Host': 'applications.greatercambridgeplanning.org',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"'
    }
    application_url = 'https://applications.greatercambridgeplanning.org/online-applications/applicationDetails.do?activeTab={}&keyVal={}'.format(tab, key)
    logging.info("GET {}".format(application_url))
    server = requests.get(application_url, headers=headers, verify=False)
    if server.status_code != 200:
        logging.info("HTTP Status {}".format(server.status_code))
        return None
    return server.text
def search_main(args):
    # TODO make this list a bit more dynamic
    all_months = [
        'Jul 23','Jun 23','May 23','Apr 23','Mar 23','Feb 23','Jan 23',
        'Dec 22','Nov 22','Oct 22','Sep 22','Aug 22','Jul 22','Jun 22','May 22','Apr 22','Mar 22','Feb 22','Jan 22',
        'Dec 21','Nov 21','Oct 21','Sep 21','Aug 21','Jul 21','Jun 21','May 21','Apr 21','Mar 21','Feb 21','Jan 21',
        'Dec 20','Nov 20','Oct 20','Sep 20','Aug 20','Jul 20'
    ]
    print('Search')
    # Only crawl the most recent month for now.
    for month in all_months[:1]:
        logging.info("Retrieving details for {}".format(month))
        raw_text = crawl_search(month)
        print(raw_text)
def applications_main(args):
    # First retrieve all planning keys.
    with open('/Users/jvg/Desktop/ref_codes', 'r') as f:
        lines = f.readlines()
    planning_keys = [line.rstrip("\n") for line in lines]
    logging.info("Found {} planning keys. Starting crawl.".format(len(planning_keys)))
    csv_writer = csv.DictWriter(sys.stdout, CSV_COLUMNS, delimiter=',')
    csv_writer.writeheader()
    # Start the actual crawling.
    for key in planning_keys:
        logging.info("Retrieving details for {}".format(key))
        cached = False
        tabs_config = {
            'summary': 'simpleDetailsTable',
            'details': 'applicationDetails',
            'dates': 'simpleDetailsTable',
        }
        data = {
            'key': key
        }
        for tabName, tableName in tabs_config.items():
            cache_filename = "/Users/jvg/Desktop/cache/{}_{}.html".format(key, tabName)
            # Check whether we already have this page cached.
            raw_html = ''
            if os.path.exists(cache_filename):
                with open(cache_filename, 'r') as f:
                    raw_html = f.read()
                cached = True
            else:
                cached = False
                raw_html = crawl_application(key, tabName)
                if raw_html is None:
                    sys.exit()
                if "Unable to perform this task. A remote exception occurred." in raw_html:
                    continue
                with open(cache_filename, 'w') as f:
                    f.write(raw_html)
            soup = BeautifulSoup(raw_html, 'html.parser')
            table = soup.find_all('table', id=tableName)[0]
            for row in table.find_all('tr'):
                data[row.find_all('th')[0].text.strip()] = row.find_all('td')[0].text.strip()
        csv_writer.writerow(data)
        if not cached:
            logging.info("WAITING")
            wait_random_seconds(30, 60)
# Set up the argument parser and dispatch to the main functions.
parser = argparse.ArgumentParser(
    prog='planning_crawl.py',
    description='Various commands to crawl the website of the Greater Cambridge Shared Planning service')
subparsers = parser.add_subparsers()
search_parser = subparsers.add_parser('search')
search_parser.set_defaults(func=search_main)
applications_parser = subparsers.add_parser('applications')
applications_parser.set_defaults(func=applications_main)
logging.basicConfig(level=logging.INFO)
if __name__ == '__main__':
    init_crawler()
    args = parser.parse_args()
    args.func(args)
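For reference, the script is driven from the command line through the two subcommands registered above. A minimal sketch of how it might be invoked, assuming Python 3 with the requests and beautifulsoup4 packages installed:

    python planning_crawl.py search
    python planning_crawl.py applications > applications.csv

The applications subcommand expects the hard-coded ref_codes file and cache directory to exist; it writes one CSV row per application to stdout while logging goes to stderr, so redirecting stdout is enough to capture the results.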