# book_crawler
# Built-in libraries
from time import time
import threading
import logging
import math

# Project libraries
from config import functions as f
from config import selenium_functions as sf

# External libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup


class Crawler:
    """Crawler that retrieves book names from a book store website."""

    def __init__(self, base_url: str = None):
        if base_url:
            self.base_url = base_url
        else:
            # Use the default base URL
            self.base_url = 'https://store.ubook.com/'

        logging.basicConfig(level=logging.DEBUG)
        self.logger = logging.getLogger(__name__)
        self.data = pd.DataFrame(columns=['book_name'])
        # Protects self.data, which is written to from multiple threads
        self._lock = threading.Lock()

    def _get_soup(self, url: str):
        """
        Generates a BeautifulSoup object from a given URL

        Args:
            url: A URL to fetch and parse

        Returns:
            soup: A BeautifulSoup object for the url parameter
        """
        self.logger.info(f"Getting soup for {url}")
        response = requests.get(url)

        return BeautifulSoup(response.content, 'html5lib')

    def _get_gender_content(self, url: str):
        """
        Gets genre ('gender') page links from the store's navigation menu

        Yields:
            href: The URL of each genre page found in the menu
        """
        contents = self._get_soup(url) \
            .find('li', attrs={'class': 'level0 nav-1 first level-top parent'})

        for content in contents.find_all('a', href=True):
            yield content['href']
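
    # A minimal illustration of what the selector above matches (hypothetical
    # markup; the real menu structure on store.ubook.com may differ):
    #
    #   <li class="level0 nav-1 first level-top parent">
    #     <a href="https://store.ubook.com/audiobooks">Audiobooks</a>
    #   </li>
    #
    # For that snippet, _get_gender_content yields
    # 'https://store.ubook.com/audiobooks'.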

    def _get_book_content(self, url: str):
        """
        Collects book names from a given genre URL into self.data

        Args:
            url: A genre page URL to search for book names
        """
        contents = self._get_soup(url) \
            .find('p', attrs={'class': 'toolbar-amount'})

        # The toolbar shows the total item count; the store lists 12 books per page
        page_quantity = math.ceil(f.get_int(contents) / 12)

        # Iterate over all pages, including the last, partially filled one
        for page_number in range(1, page_quantity + 1):
            formatted_url = f"{url}?p={page_number}"
            self.logger.info(f"Getting books for url {formatted_url}")

            books = self._get_soup(formatted_url) \
                .find('div', attrs={'class': 'products wrapper grid products-grid'})

            try:
                for book in books.find_all('a', attrs={'class': 'product-item-link'}):
                    book_name = book.text.strip()

                    # DataFrame.append returns a new frame rather than mutating
                    # in place, so reassign; the lock keeps concurrent threads
                    # from losing rows
                    with self._lock:
                        self.data = self.data.append(
                            {'book_name': book_name}, ignore_index=True
                        )
            except Exception as e:
                self.logger.error(f"Error while getting book info. Details: {e}")
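
    # Worked example of the pagination math above (assuming the
    # 12-items-per-page layout): a toolbar reading "145 Items" gives
    # math.ceil(145 / 12) == 13, so range(1, 13 + 1) visits pages 1 through 13
    # and the final, partially filled page is included.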

    def run(self, export: bool = True) -> None:
        """
        Runs the crawler using multiple threads

        Returns:
            None

        See Also:
            https://realpython.com/intro-to-python-threading/
        """
        threads = list()

        for url in self._get_gender_content(url=self.base_url):
            # Create the thread; args must be a tuple, hence the trailing comma
            thread = threading.Thread(target=self._get_book_content, args=(url,))

            # Append it to the thread list
            threads.append(thread)

            # Start the thread
            thread.start()

        # Wait for every thread to finish
        for thread in threads:
            thread.join()

        if export:
            self.data.to_csv(f"export-{int(time())}.csv", index=False)


def main():
    Crawler().run()


if __name__ == '__main__':
    main()
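
# Usage sketch (a minimal example; any store URL with the same page markup
# could, in principle, be passed as base_url):
#
#   crawler = Crawler(base_url='https://store.ubook.com/')
#   crawler.run(export=False)   # crawl without writing a CSV
#   print(crawler.data.head())  # inspect the collected book names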