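# Counts the alert boxes (<div class="alert">) on each page of the Pfizer
# platform docs site. Two approaches live side by side below: a recursive
# live crawl (scrape) and a cache-to-disk pass (getPages / scrapyPages)
# that saves each page under ./pages before counting.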
import os
import glob
import json
from os import path

import requests
from bs4 import BeautifulSoup

links = []
linksWithAlerts = []

URL = "https://platformspfizerdocs-test.dev.pfizerstatic.io"
def getPages(soup):
    # Cache every same-window link on the page as a local HTML file under
    # ./pages. Relies on the module-level `session` opened at the bottom of
    # the script, so it must be called inside that `with` block.
    links = []
    for link in soup.findAll('a', {}):
        if link.get('target') is None:
            title = link.get_text(strip=True)
            full_link = URL + link["href"]
            htmlFile = os.path.join('./pages', title + ".html")
            if not path.exists(htmlFile):
                try:
                    print(full_link)
                    with open(htmlFile, "wb") as f:
                        f.write(session.get(full_link).content)
                    # Download at most one new page per run.
                    break
                except ValueError:
                    print("Oops!")
                    continue
            links.append(link.get('href'))
    return links
def scrapyPages():
    # Walk the cached pages and record how many alert boxes each one contains.
    htmlFiles = glob.glob("./pages/*.html")
    print("Total pages: " + str(len(htmlFiles)))
    for htmlFile in htmlFiles:
        with open(htmlFile) as fp:
            soup = BeautifulSoup(fp, "html.parser")
            items = searchItems(soup)
            if len(items) > 0:
                pos = htmlFile.rfind('/')
                item = {
                    'page': htmlFile[pos:],
                    'alerts': len(items)
                }
                linksWithAlerts.append(item)
def searchItems(soup, tag="div", filters={"class": "alert"}):
    # Collect every matching element; by default, <div class="alert"> boxes.
    results = soup.find_all(tag, filters)
    return results
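# For example, searchItems(soup) gathers every <div class="alert"> in a parsed
# page, while searchItems(soup, "span", {"class": "badge"}) would gather
# <span class="badge"> elements instead; the "badge" class is purely
# illustrative, not something the target site is known to use.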
urls = []
stratus = []
edison = []
newton = []
def scrape(site, item=None):
    # Fetch and parse the page.
    r = requests.get(site)
    s = BeautifulSoup(r.text, "html.parser")
    # On a recursive call, only count the alert boxes on the linked page.
    if item is not None:
        item['alerts'] = len(searchItems(s))
        return item
    for i in s.find_all("a"):
        if 'href' in i.attrs:
            href = i.attrs['href']
            if href.startswith("/") and not href.startswith("/cdn-cgi"):
                site = URL + href
                if site not in urls:
                    urls.append(site)  # mark as visited so repeated links are skipped
                    item = {
                        'url': site,
                        'alerts': []
                    }
                    # Bucket the link by product area.
                    if href.startswith("/stratus"):
                        stratus.append(item)
                    if href.startswith("/edison"):
                        edison.append(item)
                    if href.startswith("/newton"):
                        newton.append(item)
                    print(site)
                    # Recurse once to fill in this link's alert count.
                    scrape(site, item)
newUrl = "https://platformspfizerdocs-test.dev.pfizerstatic.io/edison/docs"
scrape(newUrl)
print(edison)
exit(0)

# The exit(0) above makes everything below unreachable: an alternative,
# cache-based pass that shares one HTTP session.
# response = requests.get(URL + "/edison/docs")
with requests.Session() as session:
    soup = BeautifulSoup(session.get(URL + "/edison/docs").content, "html.parser")
    getPages(soup)
    scrapyPages()
    print(linksWithAlerts)
    exit(0)
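# A possible follow-up, since json is imported above but never used: persist
# the results for later inspection. The filename "alerts.json" is an
# assumption, not part of the original gist.
#
# with open("alerts.json", "w") as out:
#     json.dump(linksWithAlerts, out, indent=2)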