# book_crawler
# Built-in libraries
from time import time
import threading
import logging
import math

# Project libraries
from config import functions as f
from config import selenium_functions as sf

# External libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup


class Crawler:
    """Crawler that retrieves book names from a book store website."""

    def __init__(self, base_url: str = None):
        if base_url:
            self.base_url = base_url
        else:
            # Use the default base URL
            self.base_url = 'https://store.ubook.com/'

        logging.basicConfig(level=logging.DEBUG)
        self.logger = logging.getLogger(__name__)
        self.data = pd.DataFrame(columns=['book_name'])
        # Protects self.data, which is written to from multiple threads
        self._lock = threading.Lock()

    def _get_soup(self, url: str):
        """
        Generates a BeautifulSoup object from a given URL

        Args:
            url: A URL to fetch and parse

        Returns:
            soup: A BeautifulSoup object for the url parameter
        """
        self.logger.info(f"Getting soup for {url}")
        response = requests.get(url)

        return BeautifulSoup(response.content, 'html5lib')

    def _get_gender_content(self, url: str):
        """
        Gets genre ('gender') page links from the store's navigation menu

        Yields:
            href: The URL of each genre page found in the menu
        """
        contents = self._get_soup(url) \
            .find('li', attrs={'class': 'level0 nav-1 first level-top parent'})

        for content in contents.find_all('a', href=True):
            yield content['href']
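
    # A minimal illustration of what the selector above matches (hypothetical
    # markup; the real menu structure on store.ubook.com may differ):
    #
    #   <li class="level0 nav-1 first level-top parent">
    #     <a href="https://store.ubook.com/audiobooks">Audiobooks</a>
    #   </li>
    #
    # For that snippet, _get_gender_content yields
    # 'https://store.ubook.com/audiobooks'.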

    def _get_book_content(self, url: str):
        """
        Collects book names from a given genre URL into self.data

        Args:
            url: A genre page URL to search for book names
        """
        contents = self._get_soup(url) \
            .find('p', attrs={'class': 'toolbar-amount'})

        # The toolbar shows the total item count; the store lists 12 books per page
        page_quantity = math.ceil(f.get_int(contents) / 12)

        # Iterate over all pages, including the last, partially filled one
        for page_number in range(1, page_quantity + 1):
            formatted_url = f"{url}?p={page_number}"
            self.logger.info(f"Getting books for url {formatted_url}")

            books = self._get_soup(formatted_url) \
                .find('div', attrs={'class': 'products wrapper grid products-grid'})

            try:
                for book in books.find_all('a', attrs={'class': 'product-item-link'}):
                    book_name = book.text.strip()

                    # DataFrame.append returns a new frame rather than mutating
                    # in place, so reassign; the lock keeps concurrent threads
                    # from losing rows
                    with self._lock:
                        self.data = self.data.append(
                            {'book_name': book_name}, ignore_index=True
                        )
            except Exception as e:
                self.logger.error(f"Error while getting book info. Details: {e}")
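
    # Worked example of the pagination math above (assuming the
    # 12-items-per-page layout): a toolbar reading "145 Items" gives
    # math.ceil(145 / 12) == 13, so range(1, 13 + 1) visits pages 1 through 13
    # and the final, partially filled page is included.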

    def run(self, export: bool = True) -> None:
        """
        Runs the crawler using multiple threads

        Returns:
            None

        See Also:
            https://realpython.com/intro-to-python-threading/
        """
        threads = list()

        for url in self._get_gender_content(url=self.base_url):
            # Create the thread; args must be a tuple, hence the trailing comma
            thread = threading.Thread(target=self._get_book_content, args=(url,))

            # Append it to the thread list
            threads.append(thread)

            # Start the thread
            thread.start()

        # Wait for every thread to finish
        for thread in threads:
            thread.join()

        if export:
            self.data.to_csv(f"export-{int(time())}.csv", index=False)


def main():
    Crawler().run()


if __name__ == '__main__':
    main()
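
# Usage sketch (a minimal example; any store URL with the same page markup
# could, in principle, be passed as base_url):
#
#   crawler = Crawler(base_url='https://store.ubook.com/')
#   crawler.run(export=False)   # crawl without writing a CSV
#   print(crawler.data.head())  # inspect the collected book names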