import urllib2
from bs4 import BeautifulSoup
import string
import urlparse
import re
from datetime import datetime
import csv
#import psycopg2  # only needed by pg_row() below


def build_url(l, n=''):
    # '%25' is the URL-encoded '%' wildcard, so search=A%25 matches every
    # name that starts with A. n is the page number as a string; '' fetches
    # the first page of results.
    base_url = 'http://www.courts.phila.gov/mtvr/name.asp?search=' + l + '%25&page=' + n
    return base_url
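
# Quick sanity check (doctest-style; the expected string is just the
# concatenation above, not verified against the live site):
# >>> build_url('A', '2')
# 'http://www.courts.phila.gov/mtvr/name.asp?search=A%25&page=2'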


def get_soup(url):
    # Naming the parser explicitly avoids bs4's "no parser specified"
    # warning and keeps results consistent across machines.
    soup = BeautifulSoup(urllib2.urlopen(url), 'html.parser')
    return soup


def date_format(datestring):
    # Assumes dates in the format 11/5/2003, which is what this data set uses.
    try:
        better_date = datetime.strptime(datestring, '%m/%d/%Y')
    except ValueError:
        # Blank or malformed dates are common here; store '' so the CSV
        # column stays consistent.
        better_date = ''
    except Exception as error:
        store_exception(error, 94, datestring)
        better_date = ''
    return better_date
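
# Illustrative calls (values invented for the example):
# date_format('11/5/2003')  -> datetime.datetime(2003, 11, 5, 0, 0)
# date_format('')           -> ''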


def store_exception(exception, line_number, some_string=''):
    # Log the exception to its own CSV so one bad record doesn't kill a
    # long scrape. some_string defaults to '' because not every caller has
    # a relevant string to report.
    data = {
        'timestamp'  : datetime.now(),
        'line_number': line_number,
        'message'    : exception,
        'string'     : some_string
    }
    data_order = ['timestamp', 'line_number', 'message', 'string']
    csv_row(data, data_order, 'bail_error_log')


## Now we store things.
## http://docs.python.org/2/library/csv.html?highlight=strings
## Use "a" instead of "w" to append: http://docs.python.org/2/tutorial/inputoutput.html#reading-and-writing-files

__base__ = '/home/amanda/Documents/Dropbox/Miscellany/Philadelphia/'

def csv_row(data, fieldorder, filename, base=__base__):
    # Append one pipe-delimited row to <base><filename>1127.csv.
    full_path = base + filename + '1127.csv'
    print "writing", full_path
    with open(full_path, 'a+') as csvfile:
        linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
        linewriter.writerow(data)
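
# Example (hypothetical row, mirroring how get_all_urls() calls this):
# csv_row({'letter': 'A', 'records': '10', 'pages': '1', 'start': '1'},
#         ['letter', 'records', 'pages', 'start'], 'record_counts')
# appends one row to __base__ + 'record_counts1127.csv'.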


def pg_row(dictionary):
    # Stub: uncomment the psycopg2 import above to use this. It connects
    # but does not yet insert the dictionary.
    conn_string = "host='localhost' dbname='Philadelphia'"
    # print the connection string we will use to connect
    print "Connecting to database\n ->%s" % (conn_string)
    # get a connection; if a connection cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
    # conn.cursor will return a cursor object, which you can use to perform queries
    cursor = conn.cursor()
    print "Connected!\n"


def get_pagecount(l):
    url = build_url(l)
    soup = get_soup(url)
    graph = soup.find('p', {'id': 'search'})
    # The paragraph reads along the lines of
    # "1234 Records Found. Displaying page 1 of 25:".
    strings = re.match(r"(.*) Records Found.\s*Displaying page(.*)\s*of(.*):",
                       graph.text.strip()).groups()
    rec_count = strings[0].strip()
    page_no = strings[1].strip()
    total_pages = strings[2].strip()
    return {'rec_count': rec_count, 'page_no': page_no, 'total_pages': total_pages}
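
# Checking the pattern against a made-up example of that paragraph:
# >>> re.match(r"(.*) Records Found.\s*Displaying page(.*)\s*of(.*):",
# ...          "1234 Records Found. Displaying page 1 of 25:").groups()
# ('1234', ' 1 ', ' 25')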


def get_all_urls(alphabet):
    urls = []
    try:
        for letter in alphabet:
            page_count = get_pagecount(letter)
            letter_data = {
                'letter'  : letter,
                'records' : page_count['rec_count'],
                'pages'   : page_count['total_pages'],
                'start'   : page_count['page_no']
            }
            data_order = ['letter', 'records', 'pages', 'start']
            csv_row(letter_data, data_order, 'record_counts')
            # Reuse the counts fetched above rather than requesting the
            # search page a second time.
            pages = int(page_count['total_pages'])
            for page in range(1, pages + 1):
                url = build_url(letter, str(page))
                urls.append(url)
    except Exception as error:
        store_exception(error, '51')
    return urls
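
# e.g. get_all_urls('A') with a hypothetical three-page result would return
# [build_url('A', '1'), build_url('A', '2'), build_url('A', '3')].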


def scrape_soup(soup):
    table = soup.find('table', {'id': 'Defendant'})
    table_data = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) > 0:
            # Names arrive as "LAST, FIRST".
            name_bits = cells[0].get_text().strip().split(',')
            defendant_last = name_bits[0].strip()
            defendant_first = name_bits[1].strip()
            docket = cells[1].get_text().strip()
            CPCMS_docket = cells[2].get_text().strip()
            failure_to_appear_date = date_format(cells[3].get_text().strip())
            bench_warrant_hearing_date = date_format(cells[4].get_text().strip())
            # Collapse non-breaking spaces (\xa0, plus the \xc2 lead byte
            # from double-encoded UTF-8) before re-parsing the judgment cell.
            judgment = re.sub(u'[\xa0\xc2\s]+', ' ', str(cells[5]), flags=re.UNICODE).strip()
            j_strings = list(BeautifulSoup(judgment, 'html.parser').td.strings)
            judgment_no = j_strings[1].strip()
            j_match = re.match(r"Date:(.*) Amount: \$(.*)", j_strings[2].strip())
            judgment_date = date_format(j_match.groups()[0].strip())
            judgment_amt = j_match.groups()[1].strip()
            if 'Judgment against' in j_strings[3]:
                judgment_against = j_strings[4].strip()
            else:
                judgment_against = "n/a"
            case_data = {
                'defendant_first'               : defendant_first,
                'defendant_last'                : defendant_last,
                'docket'                        : docket,
                'CPCMS_docket'                  : CPCMS_docket,
                'failure_to_appear_numeric'     : failure_to_appear_date,
                'bench_warrant_hearing_numeric' : bench_warrant_hearing_date,
                'judgment_no'                   : judgment_no,
                'judgment_date_numeric'         : judgment_date,
                'judgment_amt_numeric'          : judgment_amt,
                'judgment_against'              : judgment_against
            }
            table_data.append(case_data)
    return table_data
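
# Usage: scrape_soup(get_soup(build_url('A', '1'))) returns one dict per
# table row, ready for csv.DictWriter.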


def scrape_table(url, linewriter):
    # Same parsing as scrape_soup(), but each row is written straight to an
    # already-open DictWriter. The field order lives with the writer in the
    # calling loop for efficiency, per
    # http://stackoverflow.com/questions/13573122/scraping-tables-and-writing-rows-to-csv/
    soup = get_soup(url)
    for case_data in scrape_soup(soup):
        linewriter.writerow(case_data)


## Making this more efficient by keeping the output file open across pages.
def scrape_all_pages(alphabet):
    for letter in alphabet:
        pages = get_all_urls(letter)
        full_path = __base__ + 'optimized_bail_1127.csv'
        with open(full_path, 'a+') as csvfile:
            fieldorder = ['defendant_first', 'defendant_last', 'docket', 'CPCMS_docket',
                          'failure_to_appear_numeric', 'bench_warrant_hearing_numeric',
                          'judgment_no', 'judgment_date_numeric', 'judgment_amt_numeric',
                          'judgment_against']
            linewriter = csv.DictWriter(csvfile, fieldorder, delimiter='|',
                                        quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for page in pages:
                scrape_table(page, linewriter)


alphabet = string.ascii_uppercase
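
# A minimal, assumed entry point; the original script may have been driven
# from an interactive session instead.
if __name__ == '__main__':
    scrape_all_pages(alphabet)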