Skip to content

Instantly share code, notes, and snippets.

@idealwebsolutions
Created September 15, 2020 23:47
Show Gist options
  • Save idealwebsolutions/108541bed4a92d6d0f39ab4aeedc5b64 to your computer and use it in GitHub Desktop.
Save idealwebsolutions/108541bed4a92d6d0f39ab4aeedc5b64 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from requests import get
from lxml.html import fromstring
from base64 import b64decode
from re import search
def _get_page_content(url, timeout=10):
    """ Fetches page content

    Keyword arguments:
    url -- Page url to scrape content from
    timeout -- Seconds to wait on the server before giving up (default 10)

    Returns the response body as text. Raises SystemExit when the response
    status code is anything other than 200.
    """
    # TODO: check url is valid
    # Without an explicit timeout the request can block forever on a server
    # that accepts the connection but never answers
    response = get(url, timeout=timeout)
    # Exit on bad status code
    if response.status_code != 200:
        raise SystemExit('Invalid status code: {}'.format(response.status_code))
    # Return page content
    return response.text
def find_adtrace_script(content=''):
    """ Search document tree for specific script

    Keyword arguments:
    content -- HTML document containing both external and inlined scripts

    Returns the base64-decoded script as a utf-8 string, or None when no
    inline script carries a decodable `var content = atob("...")` payload.
    """
    # An empty document cannot be parsed (the parser raises on it) and has
    # no scripts to inspect anyway
    if not content or not content.strip():
        return None
    # Parse document from HTML string
    tree = fromstring(content)
    # Find script containing adtrace content (encoded in base64)
    for script in tree.xpath('//script'):
        # Skip over external scripts (no inline text to search)
        if not isinstance(script.text, str):
            continue
        # Match regex (this can break if variable is renamed). Base64 never
        # contains a double quote, so the capture stops at the first closing
        # quote instead of greedily running to the last one on the line.
        match = search(r'var content = atob\(\"([^\"]+)\"\)', script.text)
        if not match:
            continue
        # Verify match is actually base64 content holding utf-8 text
        try:
            return b64decode(match.group(1)).decode('utf-8')
        except ValueError:
            # binascii.Error (malformed base64) and UnicodeDecodeError
            # (payload is not utf-8) are both ValueError subclasses; this
            # script is not the one we want, so keep looking
            continue
    # Return NoneType if no matches were found
    return None
def extract_pubadn(script=''):
    """ Extracts a specific key property from script

    Keyword arguments:
    script -- Inline javascript (None or empty is treated as "no script")

    Returns the digits that follow the first `pubws=` as a string, or None
    when the script is missing or does not contain that property.
    """
    # A missing script cannot contain the property; passing None on to the
    # regex search would raise TypeError instead of reporting "not found"
    if not script:
        return None
    match = search(r'pubws=(\d+)', script)
    # Return NoneType if a match wasn't found
    return match.group(1) if match else None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment