Created
September 15, 2020 23:47
-
-
Save idealwebsolutions/108541bed4a92d6d0f39ab4aeedc5b64 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from requests import get | |
from lxml.html import fromstring | |
from base64 import b64decode | |
from re import search | |
def _get_page_content(url, timeout=10):
    """ Fetches page content

    Keyword arguments:
    url -- Page url to scrape content from
    timeout -- Seconds to wait on the server before giving up (default 10)

    Returns the body of the response as text.
    Raises SystemExit when the response status code is anything but 200.
    Exceptions raised by the HTTP call itself are not caught here.
    """
    # TODO: check url is valid
    # Always pass a timeout: without one the call can block forever on a
    # server that accepts the connection but never answers
    response = get(url, timeout=timeout)
    # Exit on bad status code
    if response.status_code != 200:
        raise SystemExit('Invalid status code: {}'.format(response.status_code))
    # Return page content
    return response.text
def find_adtrace_script(content=''):
    """ Search document tree for specific script

    Keyword arguments:
    content -- HTML document containing both external and inlined scripts

    Returns the decoded script as a string, or None when the document is
    empty or no inline script carries a payload that decodes cleanly.
    """
    # An empty (or whitespace-only) document cannot be parsed and cannot
    # contain a script anyway, so answer "not found" instead of crashing
    if not content or not content.strip():
        return None
    # Parse document from HTML string
    tree = fromstring(content)
    # Find script containing adtrace content (encoded in base64)
    for script in tree.xpath('//script'):
        # Skip over external scripts (they carry no inline text)
        if not isinstance(script.text, str):
            continue
        # Match regex (this can break if variable is renamed). The capture
        # stops at the first closing quote so that a later `")` on the same
        # line is not swallowed into the payload
        match = search(r'var content = atob\("([^"]+)"\)', script.text)
        if not match:
            continue
        # Verify match is actually base64 content holding utf-8 text.
        # Both failure modes (bad base64, bad utf-8) are ValueError subclasses
        try:
            return b64decode(match.group(1)).decode('utf-8')
        except ValueError:
            # Not a usable payload; keep looking in the remaining scripts
            continue
    # Return NoneType if no matches were found
    return None
def extract_pubadn(script=''):
    """ Extracts a specific key property from script

    Keyword arguments:
    script -- Inline javascript; None or an empty string yields None

    Returns the digits following "pubws=" as a string, or None when the
    property is not present.
    """
    # A missing script (e.g. None from a failed lookup) has nothing to
    # search; passing None to the regex engine would raise TypeError
    if not script:
        return None
    match = search(r'pubws=(\d+)', script)
    # Return NoneType if a match wasn't found
    return match.group(1) if match else None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment