book_crawler
# Built-in libraries
from time import time
import threading
import logging
import math

# Project libraries
from config import functions as f
from config import selenium_functions as sf

# External libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
class Crawler:
    """Crawler that retrieves book names from a book store website"""

    def __init__(self, base_url: str = None):
        if base_url:
            self.base_url = base_url
        else:
            # Falls back to the default base url
            self.base_url = 'https://store.ubook.com/'

        # setLevel() returns None, so configure logging and keep the logger instance
        logging.basicConfig(level=logging.DEBUG)
        self.logger = logging.getLogger(__name__)

        self.data = pd.DataFrame(columns=['book_name'])
    def _get_soup(self, url: str):
        """
        Generates a soup object from a given url

        Args:
            url: The url to fetch and parse

        Returns:
            soup: A BeautifulSoup object for the given url
        """
        self.logger.info(f"Getting soup for {url}")
        response = requests.get(url)

        return BeautifulSoup(response.content, 'html5lib')
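    # A more defensive variant (a sketch, not part of the original flow) would
    # add a timeout and surface HTTP errors before parsing:
    #
    #     response = requests.get(url, timeout=10)
    #     response.raise_for_status()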
    def _get_gender_content(self, url: str):
        """
        Gets the category links from the store's top navigation menu

        Args:
            url: The url to search for category links

        Yields:
            href: The url of each category page
        """
        contents = self._get_soup(url)\
            .find('li', attrs={'class': 'level0 nav-1 first level-top parent'})

        for content in contents.find_all('a', href=True):
            yield content['href']
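    # Note: find() returns None when the menu markup changes, which would make
    # the find_all() call above raise AttributeError; a guard such as the
    # following (a sketch) fails more gracefully:
    #
    #     if contents is None:
    #         return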
    def _get_book_content(self, url: str):
        """
        Gets book content from a given url

        Args:
            url: The url to search for book names

        Returns:
            None: each book name found is appended to self.data
        """
        contents = self._get_soup(url) \
            .find('p', attrs={'class': 'toolbar-amount'})

        # The toolbar shows the total item count; each page lists 12 books
        page_quantity = math.ceil(f.get_int(contents) / 12)
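        # e.g. a toolbar reading "Items 1-12 of 100" (hypothetical count)
        # gives math.ceil(100 / 12) == 9 pages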
        # Iterates over all pages (range is exclusive at the top, hence +1)
        for page_number in range(1, page_quantity + 1):
            formatted_url = f"{url}?p={page_number}"
            self.logger.info(f"Getting books for url {formatted_url}")

            books = self._get_soup(formatted_url) \
                .find('div', attrs={'class': 'products wrapper grid products-grid'})

            try:
                for book in books.find_all('a', attrs={'class': 'product-item-link'}):
                    # Appends the cleaned book name to the dataframe
                    book_name = str(book.text).strip()
                    self.data = self.data.append({'book_name': book_name}, ignore_index=True)
            except Exception as e:
                self.logger.error(f"Error while getting book info. Details: {e}")
    def run(self, export: bool = True) -> None:
        """
        Runs the crawler using multiple threads

        Returns:
            None

        See Also:
            https://realpython.com/intro-to-python-threading/
        """
        threads = list()

        for url in self._get_gender_content(url=self.base_url):
            # Creates the thread (args must be a tuple, hence the trailing comma)
            thread = threading.Thread(target=self._get_book_content, args=(url,))
            # Appends to thread list
            threads.append(thread)
            # Starts the thread
            thread.start()

        # Waits for every thread to finish
        for thread in threads:
            thread.join()

        if export:
            self.data.to_csv(f"export-{int(time())}.csv")
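    # Note: the worker threads all append to the shared self.data frame, which
    # is not guaranteed to be thread-safe. A safer pattern (a sketch) would
    # guard the append with a lock, e.g. a hypothetical self._lock created as
    # threading.Lock() in __init__:
    #
    #     with self._lock:
    #         self.data = self.data.append({'book_name': book_name}, ignore_index=True)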
def main():
    Crawler().run()


if __name__ == '__main__':
    main()
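# Usage sketch (assumes the config package is importable and the target site
# is reachable):
#
#     crawler = Crawler(base_url='https://store.ubook.com/')
#     crawler.run(export=True)  # writes export-<unix timestamp>.csv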