Last active
June 22, 2019 17:40
-
-
Save cosmoscalibur/107d5e7b20eca3154a7369bd9b116604 to your computer and use it in GitHub Desktop.
web scraping example with requests and bs4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
# author: Edward Villegas-Pulgarin (@cosmoscalibur)
# Scrape a web site and convert the result to Markdown format.
# Extracts the product catalog.
# June 28, 2017.
# Last test: June 22, 2019.
import sys
from urllib.parse import urljoin

import bs4
import requests
# Catalog listing page whose products are exported as Markdown on stdout.
site = "https://www.vexrobotics.com/vexedr/products/view-all"
# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 30
# Placeholder printed when a page lacks an expected element.
MISSING = "N/D"


def fetch_soup(session, url):
    """Download *url* and return the parsed page.

    Args:
        session: requests session used to perform the GET (reuses connections).
        url: absolute URL of the page to download.

    Returns:
        The page parsed by BeautifulSoup with the lxml parser.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    return bs4.BeautifulSoup(response.text, "lxml")


def tag_text(tag):
    """Return the text of *tag*, or MISSING when *tag* is None."""
    if tag is None:
        return MISSING
    # .string is None when the tag has several children; fall back to the
    # joined text so the literal "None" is never printed.
    text = tag.string
    return text if text is not None else tag.get_text(" ", strip=True)


with requests.Session() as session:
    site_bs = fetch_soup(session, site)
    products_table = site_bs.find_all('li', {'class': 'item'})
    productos = len(products_table)

    # count is the position in the full listing, including skipped entries.
    for count, product in enumerate(products_table, start=1):
        # Only products that can currently be purchased are exported.
        if product.find('button', {'title': 'Add to Cart'}) is None:
            continue

        product_heading = product.find('h2', {'class': 'product-name'})
        product_tag = None
        if product_heading is not None:
            product_tag = product_heading.find('a', href=True)
        if product_tag is None:
            print("Producto {} de {}: sin enlace, omitido".format(count, productos),
                  file=sys.stderr)
            continue

        # urljoin leaves absolute URLs untouched and resolves relative ones.
        product_url = urljoin(site, product_tag['href'])
        product_name = tag_text(product_tag)

        try:
            product_bs = fetch_soup(session, product_url)
        except requests.RequestException as error:
            # One unreachable product page should not abort the whole catalog.
            print("No se pudo descargar {}: {}".format(product_url, error),
                  file=sys.stderr)
            continue

        gallery_links = product_bs.find_all(
            'a', {'rel': 'gallery', 'class': 'thumb-link', 'href': True})
        product_img_url = [urljoin(product_url, link['href']) for link in gallery_links]
        product_price = tag_text(product_bs.find('span', {'class': 'price'}))
        product_sku = tag_text(product_bs.find('h3', {'class': 'sku'}))
        description_tag = product_bs.find('div', {'class': 'std'})
        product_short = MISSING if description_tag is None else description_tag.get_text(" ")

        print("Producto {} de {}\n".format(count, productos))
        print("# Nombre: {} \n__URL__: {} \n__SKU__: {} \n__Precio__: {} \n__Descripción__: {} \n".format(product_name, product_url, product_sku, product_price, product_short))
        for img in product_img_url:
            print("![]({})\n".format(img))
        # Blank separator between consecutive products.
        print("\n\n")

print("Procesados {} productos".format(productos))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment