Created
February 26, 2022 14:54
-
-
Save mobbarley78110/c87518b2ac0d2e64c8c294cdec5e2f7e to your computer and use it in GitHub Desktop.
Scrape Fedex data and geolocation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests | |
from geopy.geocoders import Nominatim | |
from geopy.extra.rate_limiter import RateLimiter | |
# helper functions | |
def get_raw(a):
    """Return the raw geocoder payload of *a*, or None if *a* has no such attribute.

    Used to safely unpack geopy Location objects where the geocoder may
    have returned None for an unresolvable city.
    """
    try:
        return a.raw
    except AttributeError:  # a is None (geocoding failed) or not a Location
        return None
def get_lat(a):
    """Return the latitude of *a*, or None if *a* has no latitude attribute.

    Mirrors get_raw: tolerates None results from a failed geocode lookup.
    """
    try:
        return a.latitude
    except AttributeError:  # a is None (geocoding failed) or not a Location
        return None
def get_lon(a):
    """Return the longitude of *a*, or None if *a* has no longitude attribute.

    Mirrors get_raw: tolerates None results from a failed geocode lookup.
    """
    try:
        return a.longitude
    except AttributeError:  # a is None (geocoding failed) or not a Location
        return None
# Fedex web scraper: posts to the public tracking endpoint and returns a
# dictionary of origin/destination/delivery fields, or None on any failure.
def get_package_details_fedex(track_no):
    """Look up a FedEx tracking number and return its shipment details.

    Parameters
    ----------
    track_no : str or int
        Cleaned FedEx tracking number (digits only).

    Returns
    -------
    dict or None
        Keys: delivery_date, dest_city, dest_state, dest_zip, dest_country,
        orig_city, orig_state, orig_zip, orig_country. Returns None when the
        HTTP call fails, FedEx reports an error for the number, or any
        unexpected exception occurs (logged to stdout).
    """
    try:
        # Headers mimic a browser XHR from the FedEx tracking page; the
        # endpoint rejects requests without them.
        header = {
            'Origin': 'https://www.fedex.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/59.0.3071.115 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.fedex.com/apps/fedextrack/?tracknumbers=%s&locale=en_CA&cntry_code=ca_english' % (
                str(track_no)),
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,ta;q=0.4,bn;q=0.2'
        }
        # The API expects the request as a JSON string embedded in a form field.
        data = {
            'action': 'trackpackages',
            'data': '{"TrackPackagesRequest":{"appType":"WTRK","appDeviceType":"DESKTOP","uniqueKey":"",'
                    '"processingParameters":{},"trackingInfoList":[{"trackNumberInfo":{"trackingNumber":"%s",'
                    '"trackingQualifier":"","trackingCarrier":""}}]}}' % (
                str(track_no)),
            'format': 'json',
            'locale': 'en_CA',
            'version': '1'
        }
        url = "https://www.fedex.com/trackingCal/track"
        # timeout prevents the whole batch run from hanging on one request
        response = requests.post(url, data=data, headers=header, timeout=30)
        if response.status_code != 200:
            return None
        res_json = response.json()
        # Single-number request, so only the first packageList entry matters.
        package = res_json['TrackPackagesResponse']['packageList'][0]
        if package['errorList'][0]['message'] != "":
            # Non-empty error message means the tracking number is invalid.
            return None
        return {
            'delivery_date': package['displayActDeliveryDt'],
            'dest_city': package['destLocationCity'],
            'dest_state': package['destLocationStateCD'],
            'dest_zip': package['destLocationZip'],
            'dest_country': package['destLocationCntryCD'],
            'orig_city': package['originCity'],
            'orig_state': package['originStateCD'],
            'orig_zip': package['originZip'],
            'orig_country': package['originCntryCD']
        }
    except Exception as e:
        # Best-effort: log and return None so the calling loop keeps going.
        print(f'Error occurred on awb: {track_no}. \n Error Message : ' + str(e))
        return None
# --- Build a sample data frame of letters with messy tracking numbers ---
data = {'LETTER_ID': ['L0001', 'L0002', 'L0003', 'L0004'],
        'LOVER': ['Michael', 'Pam', 'Angela', 'Jim'],
        'GIFT_VALUE': [2000, 15, 5, 50],
        'TRACKING_NO': ['FDX GRND 289724287655', 'AWB 9128 4510 1597', 'FEDEX 288542243907', 'RECEIVED ON 511593037823']}
tracking = pd.DataFrame(data)

# Strip everything but digits to recover the raw tracking number from
# free-text data entry ("FDX GRND 289724287655" -> "289724287655").
tracking['TRACKING_NO_CLEAN'] = tracking['TRACKING_NO'].str.replace(r'[^0-9]', '', regex=True)

# Add result columns, then fill them row by row with the scraper output.
tracking = tracking.reindex(columns=tracking.columns.tolist() + ['STATUS', 'DELIVERY_DATE', 'ORIGIN', 'DESTINATION'])
for row in tracking.itertuples():
    index = row.Index
    track_id = row.TRACKING_NO_CLEAN
    result_dict = get_package_details_fedex(track_id)
    # BUGFIX: test the scraper's *result*, not the input tracking id —
    # the original checked `track_id is not None`, which is always true
    # here and crashed on result_dict[...] when the lookup failed.
    if result_dict is not None:
        tracking.loc[index, 'STATUS'] = 'data complete'
        tracking.loc[index, 'DELIVERY_DATE'] = result_dict['delivery_date']
        tracking.loc[index, 'ORIGIN'] = result_dict['orig_city'] + ', ' + result_dict['orig_state'] + ', ' + result_dict['orig_country']
        tracking.loc[index, 'DESTINATION'] = result_dict['dest_city'] + ', ' + result_dict['dest_state'] + ', ' + result_dict['dest_country']
    else:
        tracking.loc[index, 'STATUS'] = 'no data found'

# Normalise city strings and build a deduplicated list of places to geocode.
tracking['ORIGIN_CLEAN'] = tracking['ORIGIN'].str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)
tracking['ORIGIN_CLEAN'] = tracking['ORIGIN_CLEAN'].str.strip()
tracking['DESTINATION_CLEAN'] = tracking['DESTINATION'].str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)
tracking['DESTINATION_CLEAN'] = tracking['DESTINATION_CLEAN'].str.strip()
cities = [*tracking['ORIGIN_CLEAN'].unique(), *tracking['DESTINATION_CLEAN'].unique()]
cities = list(set(cities))

# One geocode lookup per unique city, stored in a small side table.
places = pd.DataFrame(columns=['city', 'geocode', 'raw_geocode', 'lat', 'lon'])
places['city'] = cities

# Nominatim's usage policy requires throttled requests; RateLimiter
# enforces a minimum delay between calls.
geolocator = Nominatim(user_agent='myapplication')
limited_geolocator = RateLimiter(geolocator.geocode, min_delay_seconds=5)

# BUGFIX: use the rate-limited wrapper — the original built
# limited_geolocator but then called geolocator.geocode directly.
places['geocode'] = places['city'].apply(limited_geolocator)
places['raw_geocode'] = places['geocode'].apply(get_raw)
places['lat'] = places['geocode'].apply(get_lat)
places['lon'] = places['geocode'].apply(get_lon)

# Join coordinates back onto the main table for visualization.
tracking = tracking.merge(places[['city', 'lon', 'lat']], left_on='ORIGIN_CLEAN', right_on='city', how='left')
tracking.drop(columns=['city',
                       'TRACKING_NO',
                       'TRACKING_NO_CLEAN',
                       'STATUS', 'DELIVERY_DATE',
                       'ORIGIN', 'DESTINATION'], inplace=True)
# save file as csv
#tracking.to_csv('letters.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment