Last active
September 19, 2019 18:34
-
-
Save scottpham/9386852354a7ed50eee0b986267f2fc9 to your computer and use it in GitHub Desktop.
headless_scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
from selenium import webdriver
# NOTE(review): Select is imported but not used in the visible code — kept
# because the original gist imported it; confirm before removing.
from selenium.webdriver.support.ui import Select
from time import sleep

# remember to pip install lxml (parser used by BeautifulSoup below)
# boot up the headless chrome
def start_driver():
    """Create and return a headless Chrome WebDriver instance.

    Returns:
        webdriver.Chrome: a running headless browser with a fixed
        1200x600 window (layout-dependent pages render consistently).
    """
    # create an options object
    options = webdriver.ChromeOptions()
    # only set a custom binary when testing against a non-default Chrome build
    # options.binary_location = '/usr/bin/google-chrome-unstable'
    options.add_argument('headless')
    # fix the viewport size so scrolling/layout behaves predictably
    options.add_argument('window-size=1200x600')
    # initialize and hand back the driver
    return webdriver.Chrome(options=options)
def get_soup(driver, url):
    """Load ``url`` in ``driver``, scroll to the bottom, and return the
    page source parsed with BeautifulSoup.

    Args:
        driver: a selenium WebDriver (e.g. from ``start_driver()``).
        url: the page to fetch.

    Returns:
        BeautifulSoup: the rendered page parsed with the lxml parser.
    """
    # time out the page load after 30 seconds
    driver.set_page_load_timeout(30)
    # give JavaScript up to 5 seconds to populate the DOM on lookups
    driver.implicitly_wait(5)
    driver.get(url)
    # scroll to the bottom to trigger any lazy-loaded content
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # the implicit wait may not cover post-scroll rendering, so pause briefly
    sleep(2)
    # grab the rendered html and soupify it
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    return soup
# start driver and fetch one page as a demonstration
driver = start_driver()
URL = "URL THAT YOU WANT"  # placeholder: replace with the target page
soup = get_soup(driver, URL)
# Guide to selenium waits: https://selenium-python.readthedocs.io/waits.html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment