Last active
August 9, 2018 17:31
-
-
Save opyapeus/821079f96849a8fd2da8873b5dacd22b to your computer and use it in GitHub Desktop.
Amazon large image scraping from ASIN
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python3
import os
import re
from urllib import request
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
SAVE_DIR = 'imgs'
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36'
AMAZON_BASE_URL = 'https://www.amazon.co.jp/dp'
asins = ['B01J6RPOJY', 'B072B5BTLK']  # Ex: Fire HD 10 32GB, Amazon Echo Dot

# Size/variant modifier embedded in thumbnail URLs (xxx._YYYYY_.jpg).
# Compiled once because it is applied to every image of every product.
SIZE_MODIFIER_RE = re.compile(r'\._.*_')


def product_url(asin):
    """Return the product page URL for *asin*.

    URLs always use forward slashes, so the pieces are joined as plain
    strings rather than with ``os.path.join`` (which would insert a
    backslash on Windows).
    """
    return '{}/{}'.format(AMAZON_BASE_URL.rstrip('/'), asin)


def large_image_url(url):
    """Return *url* with the size modifier removed (xxx._YYYYY_.jpg -> xxx.jpg).

    URLs that carry no modifier are returned unchanged.
    """
    return SIZE_MODIFIER_RE.sub('', url)


def image_extension(url):
    """Return the file extension (including the dot) of the path part of *url*.

    Only the URL path is inspected, so a query string or fragment never
    ends up in the saved file name.
    """
    return os.path.splitext(urlparse(url).path)[1]


def save_product_images(driver, asin):
    """Download every image found on the product page of *asin*.

    Files are written to ``SAVE_DIR/<asin>/<index><ext>``, where ``index`` is
    the position of the ``<img>`` tag among the selected tags. The target
    directory must already exist.

    Args:
        driver: a started Selenium WebDriver used to load and drive the page.
        asin: the product identifier appended to ``AMAZON_BASE_URL``.
    """
    driver.get(product_url(asin))

    # NOTE: the large image tags are only added to the page after each
    # thumbnail has been activated (per the original author's note).
    for thumb in driver.find_elements_by_css_selector('li.imageThumbnail'):
        thumb.click()

    soup = BeautifulSoup(driver.page_source, 'html5lib')

    # NOTE: the selector is a temporary heuristic for the image tags.
    for idx, img_tag in enumerate(soup.select('.image img')):
        src = img_tag.get('src')
        if not src:
            # An <img> without a src has nothing to download; skip it
            # instead of aborting the whole run with a KeyError.
            continue
        file_name = str(idx) + image_extension(src)
        save_path = os.path.join(SAVE_DIR, asin, file_name)
        request.urlretrieve(large_image_url(src), save_path)


def main():
    """Scrape the large product images for every ASIN in ``asins``."""
    # create save dirs
    for asin in asins:
        os.makedirs(os.path.join(SAVE_DIR, asin), exist_ok=True)

    # set user agent
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities['phantomjs.page.settings.userAgent'] = USER_AGENT

    # start driver
    # NOTE(review): PhantomJS support is only available in older Selenium
    # releases - confirm the installed version still provides it.
    driver = webdriver.PhantomJS(desired_capabilities=capabilities)
    try:
        for asin in asins:
            save_product_images(driver, asin)
    finally:
        # end driver, even when a page load or download fails
        driver.quit()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment