idealwebsolutions · September 15, 2020 23:47
diff --git a/adtrace.py b/adtrace.py
 #!/usr/bin/env python

 from requests import get
 from lxml.html import fromstring
 from base64 import b64decode
 from re import search

 def _get_page_content(url, timeout=10):
    """ Fetches page content

    Keyword arguments:
    url -- Page url to scrape content from
    timeout -- Seconds to wait for the server before giving up (default 10);
               pass None to wait indefinitely, which is what requests does
               when no timeout is given

    Raises SystemExit when the response status code is not 200.

    """
    # TODO: check url is valid
    # Without a timeout the call can block forever on an unresponsive host
    response = get(url, timeout=timeout)
    # Exit on bad status code
    if response.status_code != 200:
        raise SystemExit('Invalid status code: {}'.format(response.status_code))
    # Return page content
    return response.text

 def find_adtrace_script(content=''):
    """ Search document tree for specific script

    Keyword arguments:
    content -- HTML document containing both external and inlined scripts

    Returns the decoded script as a string, or None when the document is
    empty or no inline script carries a decodable payload.

    """
    # lxml raises ParserError on an empty document, so bail out early
    if not content or not content.strip():
        return None
    # Parse document from HTML string
    tree = fromstring(content)
    # Find script containing adtrace content (encoded in base64)
    for script in tree.xpath('//script'):
        # Skip over external scripts (they carry no inline text)
        if not isinstance(script.text, str):
            continue
        # Match regex (this can break if variable is renamed). The payload
        # stops at the first closing quote so that a later `")` on the same
        # line is not swallowed into the capture.
        match = search(r'var content = atob\("([^"]+)"\)', script.text)
        if not match:
            continue
        # Verify match is actually base64 content that decodes to utf-8.
        # binascii.Error and UnicodeDecodeError both derive from ValueError.
        try:
            return str(b64decode(match.group(1)), 'utf-8')
        except ValueError:
            # Not a usable payload, keep looking in the remaining scripts
            continue
    # Return NoneType if no matches were found
    return None

 def extract_pubadn(script=''):
    """ Extracts a specific key property from script

    Keyword arguments:
    script -- Inline javascript; None or an empty string yields None

    Returns the digits following 'pubws=' as a string, or None if absent.

    """
    # find_adtrace_script returns None when nothing matched; searching None
    # would raise TypeError, so treat a missing script as "no match"
    if not script:
        return None
    match = search(r'pubws=(\d+)', script)
    # Return NoneType if a match wasn't found
    return match.group(1) if match else None
	#!/usr/bin/env python

	from requests import get
	from lxml.html import fromstring
	from base64 import b64decode
	from re import search

	def _get_page_content(url, timeout=10):
	    """ Fetches page content

	    Keyword arguments:
	    url -- Page url to scrape content from
	    timeout -- Seconds to wait for the server before giving up (default 10);
	               pass None to wait indefinitely, which is what requests does
	               when no timeout is given

	    Raises SystemExit when the response status code is not 200.

	    """
	    # TODO: check url is valid
	    # Without a timeout the call can block forever on an unresponsive host
	    response = get(url, timeout=timeout)
	    # Exit on bad status code
	    if response.status_code != 200:
	        raise SystemExit('Invalid status code: {}'.format(response.status_code))
	    # Return page content
	    return response.text

	def find_adtrace_script(content=''):
	    """ Search document tree for specific script

	    Keyword arguments:
	    content -- HTML document containing both external and inlined scripts

	    Returns the decoded script as a string, or None when the document is
	    empty or no inline script carries a decodable payload.

	    """
	    # lxml raises ParserError on an empty document, so bail out early
	    if not content or not content.strip():
	        return None
	    # Parse document from HTML string
	    tree = fromstring(content)
	    # Find script containing adtrace content (encoded in base64)
	    for script in tree.xpath('//script'):
	        # Skip over external scripts (they carry no inline text)
	        if not isinstance(script.text, str):
	            continue
	        # Match regex (this can break if variable is renamed). The payload
	        # stops at the first closing quote so that a later `")` on the same
	        # line is not swallowed into the capture.
	        match = search(r'var content = atob\("([^"]+)"\)', script.text)
	        if not match:
	            continue
	        # Verify match is actually base64 content that decodes to utf-8.
	        # binascii.Error and UnicodeDecodeError both derive from ValueError.
	        try:
	            return str(b64decode(match.group(1)), 'utf-8')
	        except ValueError:
	            # Not a usable payload, keep looking in the remaining scripts
	            continue
	    # Return NoneType if no matches were found
	    return None

	def extract_pubadn(script=''):
	    """ Extracts a specific key property from script

	    Keyword arguments:
	    script -- Inline javascript; None or an empty string yields None

	    Returns the digits following 'pubws=' as a string, or None if absent.

	    """
	    # find_adtrace_script returns None when nothing matched; searching None
	    # would raise TypeError, so treat a missing script as "no match"
	    if not script:
	        return None
	    match = search(r'pubws=(\d+)', script)
	    # Return NoneType if a match wasn't found
	    return match.group(1) if match else None