Last active
January 25, 2017 14:23
-
-
Save meunomemauricio/743522121e1040759e0ec196c744a16a to your computer and use it in GitHub Desktop.
A small web scraper to track packages from the Correios - SRO (Sistema de Rastreamento de Objetos) Website.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python3 | |
"""Scrape the correios website for package tracking.""" | |
import argparse | |
import http.client | |
import logging | |
import re | |
import requests | |
import sys | |
from bs4 import BeautifulSoup | |
# Endpoint of the Correios SRO (object tracking) query form.
URL = 'http://websro.correios.com.br/sro_bin/txect01$.QueryList'
# Tracking code layout: two letters, nine digits, two letters
# (e.g. "SS987654321XX").  Explicit ASCII classes are used because \w would
# also accept digits and underscores in the letter positions, and \d would
# accept non-ASCII digits.
CODE_PATTERN = re.compile(r'[A-Za-z]{2}[0-9]{9}[A-Za-z]{2}')
class Package:
    """A tracked package together with its most recent status.

    Attributes:
        code: Tracking code identifying the package.
        last_status: Latest status scraped for the package.
    """

    def __init__(self, code, last_status):
        self.code = code
        self.last_status = last_status

    def __repr__(self):
        # Unambiguous form for debugging, e.g. when a list of packages is
        # printed or logged.
        return '{}({!r}, {!r})'.format(
            type(self).__name__, self.code, self.last_status)

    def __str__(self):
        # Human-readable "<code>: <status>" line used for the CLI output.
        return '{}: {}'.format(self.code, self.last_status)
def parse_arguments(argv=None):
    """Parse the command line options.

    Args:
        argv: Optional list of argument strings to parse. When None,
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``code`` (the raw tracking code
        string, possibly several codes joined by ";") and ``debug`` (bool).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'code',
        help='Tracking Code. Can specify more than one separated by ";"',
    )
    parser.add_argument(
        '-d', '--debug', action='store_true', default=False,
        help='Debug HTTP requests',
    )
    return parser.parse_args(argv)
def enable_debug():
    """Turn on verbose debugging output for the HTTP requests.

    Raises the debug level of ``http.client`` connections and sets both the
    root logger and the ``requests.packages.urllib3`` logger to DEBUG.
    """
    http.client.HTTPConnection.debuglevel = 1

    # Logging has to be initialized, otherwise the debug output is not shown.
    logging.basicConfig()

    root_logger = logging.getLogger()
    urllib3_logger = logging.getLogger("requests.packages.urllib3")
    for logger in (root_logger, urllib3_logger):
        logger.setLevel(logging.DEBUG)
    urllib3_logger.propagate = True
def code_format_validation(code):
    """Validate every tracking code in a ";"-separated string.

    Prints an error message and exits the process with status 1 as soon as
    an invalid code is found; returns None when all codes are valid.

    Args:
        code: One tracking code, or several joined by ";".
    """
    for single_code in code.split(';'):
        # fullmatch (not match) so that extra characters after an otherwise
        # valid code are rejected as well.
        if not CODE_PATTERN.fullmatch(single_code):
            print('Code must be in the format "SS987654321XX"')
            sys.exit(1)
def request_page(code, timeout=30):
    """Request the package information page and return its contents.

    Args:
        code: One tracking code, or several joined by ";".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body of the response as text.

    Raises:
        requests.RequestException: On connection failure, timeout or an
            HTTP error status.
    """
    # A single code and a list of codes are submitted with a different
    # query type and under a different form field.
    is_single = ';' not in code
    data = {
        'P_COD_UNI' if is_single else 'P_COD_LIS': code,
        'P_LINGUA': '001',
        'Z_ACTION': 'Search',
        'P_TIPO': '001' if is_single else '003',
    }
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server.
    response = requests.post(URL, data=data, timeout=timeout)
    # Fail here on a 4xx/5xx status instead of handing an error page to the
    # HTML parser.
    response.raise_for_status()
    return response.text
def parse_table(page):
    """Turn the HTML result page into a list of Packages.

    The page declares which kind of result table it holds ('001' for a
    single object, '003' for a list of objects). Only the last status of
    each package is considered.

    Raises:
        RuntimeError: if the page declares any other table type.
    """
    soup = BeautifulSoup(page, 'html.parser')
    table_type = _get_table_type(soup)

    # Guard clause: reject anything that is not a known table type.
    if table_type not in ('001', '003'):
        raise RuntimeError('Invalid Table Type: {}'.format(table_type))

    if table_type == '001':
        parser = _parse_single_table
    else:
        parser = _parse_list_table
    return parser(soup)
def _get_table_type(soup):
    """Return the ``value`` attribute of the element named ``P_TIPO``."""
    tipo_element = soup.find(attrs={'name': 'P_TIPO'})
    return tipo_element['value']
def _parse_single_table(soup):
    """Parse the result page of a single-object query.

    Args:
        soup: The parsed page, as produced by BeautifulSoup.

    Returns:
        A one-element list holding the Package with its last status.

    Raises:
        RuntimeError: If no status entry can be located in the table.
    """
    code = soup.find(attrs={'name': 'P_ITEMCODE'})['value']
    # The status is read from the first FONT tag in the table that carries
    # no "face" attribute.
    plain_fonts = (
        font for font in soup.table.find_all('font')
        if not font.has_attr('face')
    )
    status_font = next(plain_fonts, None)
    if status_font is None:
        # Fail with a clear message rather than with an unbound local
        # variable at the return statement.
        raise RuntimeError('No status found for code: {}'.format(code))
    return [Package(code, status_font.string)]
def _parse_list_table(soup):
    """Parse the result page of a multiple-object query.

    Each table row that contains a link yields one Package: the link text
    is the code and the text of the row's first FONT tag is the status.
    """
    def _is_row_with_link(tag):
        """Match TR tags that have an A tag somewhere below them."""
        return tag.name == 'tr' and tag.find('a')

    matching_rows = soup.table.find_all(_is_row_with_link)
    return [
        Package(row.find('a').string, row.find('font').string)
        for row in matching_rows
    ]
def main():
    """Run the tool: validate the codes, fetch the page, print the results."""
    options = parse_arguments()
    if options.debug:
        enable_debug()

    tracking_codes = options.code
    code_format_validation(tracking_codes)

    html = request_page(tracking_codes)
    for package in parse_table(html):
        print(package)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment