Created
January 15, 2021 04:25
-
-
Save fhpriamo/23503c7ef527e9e5e435fb81b9b8cef2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import re | |
import requests | |
# Pattern used to decide whether an input line is an acceptable URI.
# It is anchored at both ends and compiled case-insensitively, so the
# upper-case character classes below also match lower-case input.
uri_regex = re.compile(
    # Scheme: http, https, ftp or ftps.
    r'^(?:http|ftp)s?://'
    # Host, option 1: one or more dot-terminated labels followed by a
    # final label (2-6 letters, or 2+ letters/digits/hyphens).
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
    # Host, option 2: the literal name "localhost".
    r'localhost|'
    # Host, option 3: four dot-separated groups of 1-3 digits (the digit
    # groups are not range-checked against 0-255).
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    # Optional port number.
    r'(?::\d+)?'
    # Optional trailing slash, or a path/query made of non-space characters.
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def eprint(*args, **kwargs):
    """Print a message to STDERR.

    Accepts the same positional and keyword arguments as the built-in
    ``print`` (e.g. ``sep``, ``end``); they are forwarded unchanged, with
    the output stream fixed to ``sys.stderr``.
    """
    # sys.stderr is looked up at call time, so later redirections of the
    # stream are honoured.
    print(*args, file=sys.stderr, **kwargs)
# Value returned by file.readline() once the end of the file is reached.
EOF = ''

# Full usage text printed to STDOUT for -h / --help.
HELP_MESSAGE = """
Reads a newline separated list of URIs from a file and outputs to STDOUT
only the URIs that are valid, unique in the input file and that do not
return 404 on HTTP GET requests. Logs all other URIs and invalid inputs
to STDERR

Usage:
    test-uris [--no-http] <INPUT_FILE> >> <OUTPUT_FILE>

Options:
    -h, --help      Show this help message.
    --no-http       Do not perform HTTP requests.

Error codes:
    INVALID_ARG: <MSG> = An argument is either absent or invalid
    INVALID_URI: <URI> = The line does not contain a valid URI
    DUPLICATED_URI: <URI> = The line contains a duplicated URI
    NOT_FOUND: <URI> = A GET request for the URI returned 404
    REQ_FAILED (<HTTP_STATUS>): <URI> =
        A GET request on the given URI returned a status code not in
        the range [200,400[ other than 404
"""

# Short hint printed to STDERR after an argument error.
SHORT_HELP_MESSAGE = """
To see the full list of options and arguments, please run:
    test-uris --help
"""

# Command-line switches and their default state; a switch given on the
# command line has its value toggled.
switches = {'--no-http': False}
# Seconds to wait for a server before giving up. Without a timeout a single
# unresponsive host would block the whole run indefinitely.
REQUEST_TIMEOUT = 10


def _fetch_status(uri):
    """Return the HTTP status code of a GET request on ``uri``.

    If the request cannot be completed (``requests`` raises a
    ``RequestException``: connection error, timeout, unsupported scheme,
    ...), a ``REQ_FAILED`` line carrying the exception class name in place
    of the status code is written to STDERR and ``None`` is returned.
    """
    try:
        return requests.get(uri, timeout=REQUEST_TIMEOUT).status_code
    except requests.RequestException as exc:
        eprint(f'!! REQ_FAILED ({type(exc).__name__}): {uri}')
        return None


def main(argv):
    """Filter the URIs listed in an input file and return an exit code.

    ``argv`` is the argument list without the program name. An optional
    leading switch from ``switches`` is consumed and toggled; the next
    argument is the path of the input file.

    Each non-blank line of the file is stripped and then:
      * logged as INVALID_URI and skipped if it does not match ``uri_regex``;
      * logged as DUPLICATED_URI and skipped if it was already emitted;
      * unless ``--no-http`` is set, requested with HTTP GET:
          - 404 is logged as NOT_FOUND and the URI is skipped;
          - any other status outside [200, 400) is logged as REQ_FAILED,
            but the URI is still emitted;
          - a request that cannot be completed is logged as REQ_FAILED and
            the URI is likewise still emitted;
      * otherwise printed to STDOUT.

    Returns 0 on success (including after printing the help text) and 1
    when the arguments are missing or the input file cannot be opened.
    """
    args = list(argv)

    if not args:
        eprint('!! INVALID_ARG: no argument provided.')
        eprint(SHORT_HELP_MESSAGE)
        return 1

    if args[0] in ('-h', '--help'):
        print(HELP_MESSAGE)
        return 0

    if args[0] in switches:
        switch_key = args.pop(0)
        switches[switch_key] = not switches[switch_key]

    if not args:
        eprint('!! INVALID_ARG: path to input file was not provided.')
        eprint(SHORT_HELP_MESSAGE)
        return 1

    filename = args[0]

    try:
        input_file = open(filename)
    except OSError as exc:
        # An unreadable path is an invalid argument, not a crash.
        eprint(f'!! INVALID_ARG: cannot open input file: {exc}')
        eprint(SHORT_HELP_MESSAGE)
        return 1

    # Only URIs that were emitted are remembered, so a URI that returned
    # 404 is checked again if it appears a second time.
    seen = set()

    with input_file:
        for line in input_file:
            uri = line.strip()

            if not uri:
                continue

            if uri_regex.match(uri) is None:
                eprint(f'!! INVALID_URI: {uri}')
                continue

            if uri in seen:
                eprint(f'!! DUPLICATED_URI: {uri}')
                continue

            if not switches['--no-http']:
                status_code = _fetch_status(uri)

                if status_code == 404:
                    eprint(f'!! NOT_FOUND: {uri}')
                    continue

                # None means the failure was already logged by _fetch_status.
                if status_code is not None and not 200 <= status_code < 400:
                    eprint(f'!! REQ_FAILED ({status_code}): {uri}')

            seen.add(uri)
            print(uri)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment