@openroomxyz
Last active April 22, 2020 20:27
Python: How to get a list of all URLs on a domain by crawling from page to page on that domain?
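The script below starts from a single page, downloads it with urllib, parses it with BeautifulSoup, collects every <a> href it has not seen yet, keeps only the links that stay on the given domain, and repeats the process recursively until a pass turns up no new links. It needs the beautifulsoup4 and lxml packages installed (for example, pip install beautifulsoup4 lxml).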
import time
import urllib.request

import bs4 as bs


def filter_to(input_list, limit_to):
    # Keep only links that contain limit_to, i.e. links that stay on the domain.
    res = []
    for i in input_list:
        if i is not None and limit_to in i:
            res.append(i)
    return res


# Every page that has been crawled ends up as a key in this dict.
visited = {}


def Crawl(urls_list=None, limit_to='https://www.x.si/'):
    # Visit each page in urls_list, collect the links it contains, and recurse
    # on the newly found links until a pass discovers nothing new.
    if urls_list is None:
        urls_list = [limit_to]
    res = []
    one_is_to_visit_for_sure = False
    for page in urls_list:
        if page not in visited:
            print("Visiting page: " + page)
            time.sleep(1)  # be polite: one request per second
            try:
                sauce = urllib.request.urlopen(page).read()
                soup = bs.BeautifulSoup(sauce, 'lxml')
            except Exception:
                soup = None
            if soup is not None:
                for url in soup.find_all('a'):
                    new_url = url.get('href')
                    if new_url not in visited:
                        res.append(new_url)
                        one_is_to_visit_for_sure = True
            visited[page] = True
    res = filter_to(res, limit_to)
    if one_is_to_visit_for_sure:
        return Crawl(res, limit_to)
    return res


print("----")
resx = Crawl(limit_to='https://somedomain.xyz/')
print("DONE")