Skip to content

Instantly share code, notes, and snippets.

@gwgundersen
Created July 8, 2014 22:48
Show Gist options
  • Save gwgundersen/e6cd307ba2320c418a3f to your computer and use it in GitHub Desktop.
Save gwgundersen/e6cd307ba2320c418a3f to your computer and use it in GitHub Desktop.
from collections import namedtuple
import re
import pdb
# http://tools.ietf.org/html/rfc3986#section-3.3
"""
>> c = request.urlparse("http://gregorygundersen.com")
>> c
ParseResult(scheme='http', netloc='gregorygundersen.com', path='', params='', query='', fragment='')
"""
def urlparse(url):
    """Split *url* into a ``(scheme, netloc, path, query)`` tuple of strings.

    The scheme is the text before the first ``'://'`` (empty when the URL
    has none).  The netloc runs up to the first ``'/'`` or ``'?'``.  The
    path is returned without its leading ``'/'`` and the query without its
    leading ``'?'``.  Missing components are returned as ``''``.

    Only the scheme and netloc are lower-cased; the path and query are
    case-sensitive (RFC 3986, section 6.2.2.1) and are returned unchanged.

    Fragments are not split off: a trailing ``'#...'`` stays attached to
    whichever component it follows.
    """
    # Scheme: everything before the first '://', if that separator exists.
    scheme, separator, remainder = url.partition('://')
    if not separator:
        scheme, remainder = '', url

    # Peel the query off first so that a '/' inside the query string is not
    # mistaken for a path separator, and so 'host?x=1' (no path) still works.
    # partition() splits on the first '?' only, keeping any later '?' intact.
    hierarchy, _, query = remainder.partition('?')

    # Split on the first '/' only, so that multi-segment paths survive whole.
    netloc, _, path = hierarchy.partition('/')

    return (scheme.lower(), netloc.lower(), path, query)
def urlparse2(url):
    """Tokenise *url* and pack the first six tokens into a ``ParseResult``.

    The URL is lower-cased, split by ``url_parse_strings`` and the tokens at
    positions 0-5 are assigned, in order, to the fields ``scheme``,
    ``netloc``, ``path``, ``params``, ``query`` and ``fragment``.  Any
    further tokens are ignored.

    Raises:
        IndexError: if the URL produces fewer than six tokens.
    """
    ParseResult = namedtuple('ParseResult', 'scheme netloc path params query fragment')
    tokens = url_parse_strings(url.lower())
    # Explicit indexing (rather than slicing) keeps the IndexError on
    # inputs that yield too few tokens.
    return ParseResult(*[tokens[position] for position in range(6)])
def url_parse_strings(url):
    """Split *url* into tokens using plain string replacement.

    Each of ``':'``, ``'//'``, ``'/'``, ``'www.'`` and ``'.'`` is rewritten
    to ``'|'`` and the result is split on ``'|'``.  Adjacent delimiters
    therefore yield empty-string tokens, and a literal ``'|'`` already in
    *url* also acts as a separator.

    Returns:
        The list of tokens, in order of appearance.
    """
    # Order matters: '//' must be replaced before the single '/', and
    # 'www.' before the bare '.'.
    delimiters = (':', '//', '/', 'www.', '.')
    marked = url
    for delimiter in delimiters:
        marked = marked.replace(delimiter, '|')
    return marked.split('|')
def url_parse_re(url):
    """Split *url* into tokens with a single regular expression.

    The delimiters are ``':'``, ``'.'``, ``'//'``, ``'/'`` and ``'?'``.
    Adjacent delimiters (such as the ``'://'`` after a scheme) produce
    empty-string tokens.

    Returns:
        The list of tokens, in order of appearance.
    """
    # Raw string: '\.' and '\?' are regex escapes, not string escapes, and
    # are invalid escape sequences in a normal string literal.
    # '//' is listed before '/' so a double slash is consumed as one delimiter.
    return re.split(r':|\.|//|/|\?', url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment