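# Gist sketch: crawl the Pfizer platform docs site and count the
# <div class="alert"> call-outs on each page. Two approaches appear below:
# a recursive crawl via scrape(), and an (unreached) variant that first
# saves pages locally via getPages()/scrapyPages().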
import os
import glob
import requests
from bs4 import BeautifulSoup

linksWithAlerts = []

URL = "https://platformspfizerdocs-test.dev.pfizerstatic.io"

def getPages(soup):
    # Download every internal link on the parsed index page to
    # ./pages/<link title>.html, skipping files already saved.
    # Note: `session` is the module-level requests.Session opened below.
    links = []
    for link in soup.findAll('a', {}):
        if link.get('target') is None:
            title = link.get_text(strip=True)
            full_link = URL + link["href"]
            htmlFile = os.path.join('./pages', title + ".html")
            if not os.path.exists(htmlFile):
                try:
                    print(full_link)
                    with open(htmlFile, "wb") as f:
                        f.write(session.get(full_link).content)
                except (OSError, requests.RequestException):
                    print("Oops!")
                    continue
            links.append(link.get('href'))
    return links

def scrapyPages():
    # Count the alert boxes in every downloaded page and record
    # the pages that contain at least one.
    htmlFiles = glob.glob("./pages/*.html")
    print("Total pages: " + str(len(htmlFiles)))
    for htmlFile in htmlFiles:
        with open(htmlFile) as fp:
            soup = BeautifulSoup(fp, "html.parser")
            items = searchItems(soup)
            if len(items) > 0:
                item = {
                    'page': os.path.basename(htmlFile),
                    'alerts': len(items)
                }
                linksWithAlerts.append(item)

def searchItems(soup, tag="div", filters={"class": "alert"}):
    # Return every element matching the tag/attribute filters;
    # by default, <div class="alert"> call-out boxes.
    results = soup.find_all(tag, filters)
    return results
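
# For example, the default filters pick up alert call-outs on a parsed page:
#   soup = BeautifulSoup('<div class="alert">Heads up!</div>', "html.parser")
#   searchItems(soup)  # -> [<div class="alert">Heads up!</div>]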

urls = []      # every URL visited so far, to avoid re-crawling
stratus = []   # one item dict per /stratus page
edison = []    # one item dict per /edison page
newton = []    # one item dict per /newton page

def scrape(site, item=None):
    # Fetch the page and parse it.
    r = requests.get(site)
    s = BeautifulSoup(r.text, "html.parser")
    # When called for a single tracked page, just count its alert boxes.
    if item is not None:
        item['alerts'] = len(searchItems(s))
        return item
    # Otherwise walk every internal link on the page.
    for i in s.find_all("a"):
        if 'href' in i.attrs:
            href = i.attrs['href']
            if href.startswith("/") and not href.startswith("/cdn-cgi"):
                site = URL + href
                if site not in urls:
                    urls.append(site)
                    item = {
                        'url': site,
                        'alerts': []
                    }
                    if href.startswith("/stratus"):
                        stratus.append(item)
                    if href.startswith("/edison"):
                        edison.append(item)
                    if href.startswith("/newton"):
                        newton.append(item)
                    print(site)
                    # calling itself on the linked page
                    scrape(site, item)

newUrl = "https://platformspfizerdocs-test.dev.pfizerstatic.io/edison/docs"
scrape(newUrl)
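# scrape() mutates each item dict in place, so edison now holds one
# {'url': ..., 'alerts': <count>} entry per /edison page visited.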
print(edison)
exit(0)

# Unreachable alternative approach (the exit(0) above ends the script):
# download every page linked from /edison/docs into ./pages, then scan
# the local copies for alerts.
# response = requests.get(URL + "/edison/docs")
with requests.Session() as session:
    soup = BeautifulSoup(session.get(URL + "/edison/docs").content, "html.parser")
    getPages(soup)
    scrapyPages()
    print(linksWithAlerts)
    exit(0)