Skip to content

Instantly share code, notes, and snippets.

@fhpriamo
Created January 15, 2021 04:25
Show Gist options
  • Save fhpriamo/23503c7ef527e9e5e435fb81b9b8cef2 to your computer and use it in GitHub Desktop.
Save fhpriamo/23503c7ef527e9e5e435fb81b9b8cef2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import sys
import re
import requests
# Compiled, case-insensitive pattern used to decide whether an input line is
# an acceptable URI. The fragments below are concatenated in order; the host
# part is a single alternation spanning the domain, localhost and dotted-quad
# fragments.
uri_regex = re.compile(
    ''.join((
        # Scheme: http, https, ftp or ftps.
        r'^(?:http|ftp)s?://',
        # Host, alternative 1: dot-separated labels followed by a final label.
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',
        # Host, alternative 2: the literal name "localhost".
        r'localhost|',
        # Host, alternative 3: four dot-separated groups of 1-3 digits
        # (the numeric range of each group is not checked).
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
        # Optional port number.
        r'(?::\d+)?',
        # Optional trailing slash, or a path/query without whitespace.
        r'(?:/?|[/?]\S+)$',
    )),
    flags=re.IGNORECASE,
)
def eprint(*args, **kwargs):
    """Print a message to STDERR.

    Takes the same positional and keyword arguments as the built-in
    ``print``. Output goes to ``sys.stderr`` unless the caller supplies an
    explicit ``file=`` keyword.
    """
    # setdefault (instead of passing file= alongside **kwargs) avoids a
    # TypeError for a duplicated keyword when the caller passes ``file``.
    kwargs.setdefault('file', sys.stderr)
    print(*args, **kwargs)
# The empty string, which file.readline() returns once the end of the file
# has been reached.
EOF = ''
# Full usage text: purpose, invocation, options and the error codes that
# are written to STDERR.
HELP_MESSAGE = """
Reads a newline separated list of URIs from a file and outputs to STDOUT
only the URIs that are valid, unique in the input file and that do not
return 404 on HTTP GET requests. Logs all other URIs and invalid inputs
to STDERR
Usage:
test-uris [--no-http] <INPUT_FILE> >> <OUTPUT_FILE>
Options:
-h, --help Show this help message.
--no-http Do not perform HTTP requests.
Error codes:
INVALID_ARG: <MSG> = An argument is either absent or invalid
INVALID_URI: <URI> = The line does not contain a valid URI
DUPLICATED_URI: <URI> = The line contains a duplicated URI
NOT_FOUND: <URI> = A GET request for the URI returned 404
REQ_FAILED (<HTTP_STATUS>): <URI> =
A GET request on the given URI returned a status code not in
the range [200,400[ other than 404
"""
# Short hint that points the user at --help.
SHORT_HELP_MESSAGE = """
To see the full list of options and arguments, please run:
test-uris --help
"""
# Command-line switches and their current state. Naming a switch on the
# command line flips its value.
switches = {'--no-http': False}

# Seconds to wait on a server before the request counts as failed; without
# a timeout, requests may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10


def main(argv):
    """Filter the URIs listed in the input file named on the command line.

    Each non-blank line of the file is stripped and checked in turn:
    syntactically invalid URIs, URIs already emitted, and (unless
    ``--no-http`` is given) URIs answering 404 are reported on STDERR and
    dropped. Every other URI is printed to STDOUT. A URI whose request
    fails in any way other than a 404 is reported as REQ_FAILED but is
    still printed.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        The process exit status: 0 on success (including ``--help``),
        1 when an argument is missing or the input file cannot be opened.
    """
    args = list(argv)

    if not args:
        eprint('!! INVALID_ARG: no argument provided.')
        eprint(SHORT_HELP_MESSAGE)
        return 1

    if args[0] in ('-h', '--help'):
        print(HELP_MESSAGE)
        return 0

    if args[0] in switches:
        switch_key = args.pop(0)
        switches[switch_key] = not switches[switch_key]

    if not args:
        eprint('!! INVALID_ARG: path to input file was not provided.')
        eprint(SHORT_HELP_MESSAGE)
        return 1

    filename = args[0]

    try:
        input_file = open(filename)
    except OSError as err:
        # A missing or unreadable file is an invalid argument, not a crash.
        eprint(f'!! INVALID_ARG: cannot open input file: {err}')
        eprint(SHORT_HELP_MESSAGE)
        return 1

    # URIs already printed; a set keeps the duplicate check O(1) per line.
    visited = set()

    with input_file:
        for line in input_file:
            uri = line.strip()

            if uri == '':
                continue

            if uri_regex.match(uri) is None:
                eprint(f'!! INVALID_URI: {uri}')
                continue

            if uri in visited:
                eprint(f'!! DUPLICATED_URI: {uri}')
                continue

            if not switches['--no-http']:
                try:
                    response = requests.get(uri, timeout=REQUEST_TIMEOUT)
                except requests.RequestException as err:
                    # Covers timeouts, connection errors and schemes that
                    # requests cannot fetch (e.g. ftp://). Handled like a
                    # non-404 failure status: logged, URI still emitted.
                    eprint(f'!! REQ_FAILED ({type(err).__name__}): {uri}')
                else:
                    status_code = response.status_code

                    if status_code == 404:
                        eprint(f'!! NOT_FOUND: {uri}')
                        continue

                    if status_code < 200 or status_code >= 400:
                        eprint(f'!! REQ_FAILED ({status_code}): {uri}')

            visited.add(uri)
            print(uri)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment