Last active
January 18, 2018 04:30
-
-
Save psy901/8da054bf9743dbc377cffa36e28b9bed to your computer and use it in GitHub Desktop.
Given a starting URL, it crawls email addresses within the landing page and the second-level pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import re | |
def extract_email_from_url(url):
    '''
    Fetch the page at the given URL and extract all email addresses
    found in its visible text. The pattern also catches lightly
    obfuscated addresses written as " at " / " dot " instead of "@" / ".".
    :param url: URL of the page to scan
    :return: Set of email addresses in string
    '''
    # Download the page and strip the markup down to plain text.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.text
    # Raw strings avoid invalid-escape warnings ("\/" is not a valid
    # string escape); IGNORECASE so mixed-case addresses are matched too.
    regex = re.compile(
        r"([a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`"
        r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
        r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
        re.IGNORECASE,
    )
    # findall returns one tuple per match (one entry per capture group);
    # the full address is the first group.
    return {match[0] for match in regex.findall(text)}
def get_links(url):
    '''
    Fetch the page at the given URL and return the targets of all
    anchor tags embedded in it.
    :param url: URL of the page to scan
    :return: Set of href strings (may include relative paths)
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    set_links = set()
    # find_all is the current bs4 spelling of the deprecated findAll.
    # Anchors without an href attribute yield None from .get(); skip
    # them instead of polluting the result set with None.
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is not None:
            set_links.add(href)
    return set_links
def get_email(url):
    '''
    From the URL, it tries to extract emails from the landing page
    and its 2nd level pages, then prints every address it retrieved.
    Failures on individual links are reported and skipped so that one
    bad link cannot abort the whole crawl.
    :param url: provided URL from the user
    '''
    from urllib.parse import urljoin  # stdlib; resolves relative links

    emails = set()
    # 1st level: the landing page itself.
    emails.update(extract_email_from_url(url))
    # 2nd level -- iterate over the set of links found on the landing page.
    for link_url in get_links(url):
        # urljoin correctly handles absolute URLs, root-relative paths
        # ("/a/b") and page-relative paths ("a/b"), unlike naive string
        # concatenation of base + link.
        target = urljoin(url, link_url)
        try:
            print("working on: " + target)
            emails.update(extract_email_from_url(target))
        except Exception as exc:
            # Best-effort crawl: report and skip unreachable or
            # malformed links instead of silently swallowing errors.
            print("skipping " + target + ": " + str(exc))
    # print all email addresses found
    for email in emails:
        print(email)
if __name__ == "__main__":
    # Demo entry point: crawl the UW-Madison landing page and its links.
    # Guarded so importing this module does not trigger a network crawl.
    get_email("https://www.wisc.edu/")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For now it uses the URL "https://www.wisc.edu/" as the starting point.